# Neural Network 
#### Implementation of neural network used for Movie Ratings Data
Used on the MovieLens 1M dataset

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Loading the data
columns_user = ['User_ID','Gender','Age','Occupation','Zip_code']
columns_movies = ['Movie_ID', 'Title', 'Genre']
columns_ratings = ['User_ID', 'Movie_ID', 'Rating', 'Timestamp']
user_data = pd.read_csv('users.dat', sep='::', header=None, names=columns_user)
movie_data = pd.read_csv('movies.dat', sep='::', header=None, names=columns_movies)
ratings_data = pd.read_csv('ratings.dat', sep='::', header=None, names=columns_ratings)
print(user_data.head())
print(movie_data.head())
print(ratings_data.head())

# Data mapping for gender



   User_ID Gender  Age  Occupation Zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455
   Movie_ID                               Title                         Genre
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
   User_ID  Movie_ID  Rating  Timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


In [57]:
class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1])
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        return (output_activations-y)

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))


In [3]:
# Separting the genres
def separate_data(genre):
    s = str(genre).split('|')
    return s
    
movie_data['Genre'] = movie_data['Genre'].apply(separate_data)
print(movie_data.head())

   Movie_ID                               Title  \
0         1                    Toy Story (1995)   
1         2                      Jumanji (1995)   
2         3             Grumpier Old Men (1995)   
3         4            Waiting to Exhale (1995)   
4         5  Father of the Bride Part II (1995)   

                              Genre  
0   [Animation, Children's, Comedy]  
1  [Adventure, Children's, Fantasy]  
2                 [Comedy, Romance]  
3                   [Comedy, Drama]  
4                          [Comedy]  


In [4]:
# Merging the movies and the ratings dataset
movie_data = movie_data.merge(ratings_data, on=['Movie_ID'])
print(movie_data.head())

   Movie_ID             Title                            Genre  User_ID  \
0         1  Toy Story (1995)  [Animation, Children's, Comedy]        1   
1         1  Toy Story (1995)  [Animation, Children's, Comedy]        6   
2         1  Toy Story (1995)  [Animation, Children's, Comedy]        8   
3         1  Toy Story (1995)  [Animation, Children's, Comedy]        9   
4         1  Toy Story (1995)  [Animation, Children's, Comedy]       10   

   Rating  Timestamp  
0       5  978824268  
1       4  978237008  
2       4  978233496  
3       5  978225952  
4       5  978226474  


In [5]:
# Merging the resulting data set with user_id
movie_data = movie_data.merge(user_data, on=['User_ID'])
print(movie_data.head())

   Movie_ID                                      Title  \
0         1                           Toy Story (1995)   
1        48                          Pocahontas (1995)   
2       150                           Apollo 13 (1995)   
3       260  Star Wars: Episode IV - A New Hope (1977)   
4       527                    Schindler's List (1993)   

                                       Genre  User_ID  Rating  Timestamp  \
0            [Animation, Children's, Comedy]        1       5  978824268   
1  [Animation, Children's, Musical, Romance]        1       5  978824351   
2                                    [Drama]        1       5  978301777   
3       [Action, Adventure, Fantasy, Sci-Fi]        1       4  978300760   
4                               [Drama, War]        1       5  978824195   

  Gender  Age  Occupation Zip_code  
0      F    1          10    48067  
1      F    1          10    48067  
2      F    1          10    48067  
3      F    1          10    48067  
4      F 

In [6]:
# Mapping the gender data to 0s and 1s
gender = {
    'M':0,
    'F':1
}

movie_data['Gender'] = movie_data['Gender'].map(gender)

In [7]:
print(movie_data.head())

   Movie_ID                                      Title  \
0         1                           Toy Story (1995)   
1        48                          Pocahontas (1995)   
2       150                           Apollo 13 (1995)   
3       260  Star Wars: Episode IV - A New Hope (1977)   
4       527                    Schindler's List (1993)   

                                       Genre  User_ID  Rating  Timestamp  \
0            [Animation, Children's, Comedy]        1       5  978824268   
1  [Animation, Children's, Musical, Romance]        1       5  978824351   
2                                    [Drama]        1       5  978301777   
3       [Action, Adventure, Fantasy, Sci-Fi]        1       4  978300760   
4                               [Drama, War]        1       5  978824195   

   Gender  Age  Occupation Zip_code  
0       1    1          10    48067  
1       1    1          10    48067  
2       1    1          10    48067  
3       1    1          10    48067  
4   

In [8]:
# Dropping the title, timestamp and zipcode data
movie_data.drop(['Title','Zip_code','Timestamp'], inplace=True, axis=1)

In [9]:
print(movie_data.tail(10))

         Movie_ID                       Genre  User_ID  Rating  Gender  Age  \
1000199      3408                     [Drama]     5727       5       0   25   
1000200      3409           [Drama, Thriller]     5727       4       0   25   
1000201      3481                    [Comedy]     5727       4       0   25   
1000202      3483     [Animation, Children's]     5727       3       0   25   
1000203      3484                  [Thriller]     5727       1       0   25   
1000204      3513           [Drama, Thriller]     5727       4       0   25   
1000205      3535  [Comedy, Horror, Thriller]     5727       2       0   25   
1000206      3536           [Comedy, Romance]     5727       5       0   25   
1000207      3555          [Action, Thriller]     5727       3       0   25   
1000208      3578             [Action, Drama]     5727       5       0   25   

         Occupation  
1000199           4  
1000200           4  
1000201           4  
1000202           4  
1000203           4 

In [10]:
# Segregation of genre data
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western"
    ]

for g in genres:
    movie_data[g] = 0
    
print(movie_data.head())

   Movie_ID                                      Genre  User_ID  Rating  \
0         1            [Animation, Children's, Comedy]        1       5   
1        48  [Animation, Children's, Musical, Romance]        1       5   
2       150                                    [Drama]        1       5   
3       260       [Action, Adventure, Fantasy, Sci-Fi]        1       4   
4       527                               [Drama, War]        1       5   

   Gender  Age  Occupation  Action  Adventure  Animation   ...     Fantasy  \
0       1    1          10       0          0          0   ...           0   
1       1    1          10       0          0          0   ...           0   
2       1    1          10       0          0          0   ...           0   
3       1    1          10       0          0          0   ...           0   
4       1    1          10       0          0          0   ...           0   

   Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  Thriller  War  \
0     

In [11]:
for i,row in movie_data.iterrows():
    genre = row['Genre']
    for g in genre:
        movie_data.set_value(i,g,1)

print(movie_data.head())

   Movie_ID                                      Genre  User_ID  Rating  \
0         1            [Animation, Children's, Comedy]        1       5   
1        48  [Animation, Children's, Musical, Romance]        1       5   
2       150                                    [Drama]        1       5   
3       260       [Action, Adventure, Fantasy, Sci-Fi]        1       4   
4       527                               [Drama, War]        1       5   

   Gender  Age  Occupation  Action  Adventure  Animation   ...     Fantasy  \
0       1    1          10       0          0          1   ...           0   
1       1    1          10       0          0          1   ...           0   
2       1    1          10       0          0          0   ...           0   
3       1    1          10       1          1          0   ...           1   
4       1    1          10       0          0          0   ...           0   

   Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  Thriller  War  \
0     

In [12]:
movie_data.drop(['Genre'], inplace=True, axis=1)
print(movie_data.head())

   Movie_ID  User_ID  Rating  Gender  Age  Occupation  Action  Adventure  \
0         1        1       5       1    1          10       0          0   
1        48        1       5       1    1          10       0          0   
2       150        1       5       1    1          10       0          0   
3       260        1       4       1    1          10       1          1   
4       527        1       5       1    1          10       0          0   

   Animation  Children's   ...     Fantasy  Film-Noir  Horror  Musical  \
0          1           1   ...           0          0       0        0   
1          1           1   ...           0          0       0        1   
2          0           0   ...           0          0       0        0   
3          0           0   ...           1          0       0        0   
4          0           0   ...           0          0       0        0   

   Mystery  Romance  Sci-Fi  Thriller  War  Western  
0        0        0       0         0    0  

In [13]:
print(movie_data.mean())

Movie_ID       1865.539898
User_ID        3024.512348
Rating            3.581564
Gender            0.246389
Age              29.738314
Occupation        8.036138
Action            0.257403
Adventure         0.133925
Animation         0.043284
Children's        0.072171
Comedy            0.356505
Crime             0.079524
Documentary       0.007908
Drama             0.354455
Fantasy           0.036293
Film-Noir         0.018257
Horror            0.076370
Musical           0.041524
Mystery           0.040170
Romance           0.147492
Sci-Fi            0.157261
Thriller          0.189640
War               0.068513
Western           0.020679
dtype: float64


In [43]:
# Create a pivot table to better visualize the data
table = pd.pivot_table(movie_data, index=['User_ID', 'Movie_ID'])
user_1 = table.loc[1]
user_2 = table.loc[2]
user_1 = user_1.reset_index()
user_2 = user_2.reset_index()


print(user_1['Movie_ID'] == user_2['Movie_ID'])

# Calculate the euclidean distance between 

ValueError: Can only compare identically-labeled Series objects

In [39]:
print(user_1.head())
print(user_2.head())

   Movie_ID  Action  Adventure  Age  Animation  Children's  Comedy  Crime  \
0         1       0          0    1          1           1       1      0   
1        48       0          0    1          1           1       0      0   
2       150       0          0    1          0           0       0      0   
3       260       1          1    1          0           0       0      0   
4       527       0          0    1          0           0       0      0   

   Documentary  Drama   ...     Horror  Musical  Mystery  Occupation  Rating  \
0            0      0   ...          0        0        0          10       5   
1            0      0   ...          0        1        0          10       5   
2            0      1   ...          0        0        0          10       5   
3            0      0   ...          0        0        0          10       4   
4            0      1   ...          0        0        0          10       5   

   Romance  Sci-Fi  Thriller  War  Western  
0        0 