## Data Preparation
Let's load this data into Python.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [2]:
print(movies.shape)

(100, 3)


## Content-Based Recommendation Model

Computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

In [3]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
print(movies.shape)


(100, 3)


In [4]:
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"['Animation', ""Children's"", 'Comedy']"
1,2,Jumanji (1995),"['Adventure', ""Children's"", 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(100, 44)

In [6]:
tfidf_matrix

<100x44 sparse matrix of type '<class 'numpy.float64'>'
	with 231 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#cosine_sim[:4, :4]
cosine_sim

array([[ 1.        ,  0.15337409,  0.12551391, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15337409,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.12551391,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.25861841],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.25861841,
         0.        ,  1.        ]])

In [8]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

#TODO: Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
        
    return titles[score_series.index[1:]]

Let's try and get the top recommendations for a few movies and see how good the recommendations are.

In [9]:
genre_recommendations('Toy Story (1995)').head(20)

12                                         Balto (1995)
86                             Dunston Checks In (1996)
33                                          Babe (1995)
47                                    Pocahontas (1995)
51                              Mighty Aphrodite (1995)
95                        In the Bleak Midwinter (1995)
62    Don't Be a Menace to South Central While Drink...
4                    Father of the Bride Part II (1995)
64                                      Bio-Dome (1996)
68                                        Friday (1995)
18                Ace Ventura: When Nature Calls (1995)
87                                   Black Sheep (1996)
37                                  It Takes Two (1995)
7                                   Tom and Huck (1995)
1                                        Jumanji (1995)
55                       Kids of the Round Table (1995)
71                         Kicking and Screaming (1995)
74                                     Big Bully

## Collaborative Filtering Recommendation Model


Use the file **ratings.csv** first as it contains User ID, Movie IDs and Ratings. These three elements are all needed for determining the similarity of the users based on their ratings for a particular movie.


In [10]:
# Fill NaN values in user_id and movie_id column with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

Take a random sample of 20,000 ratings (2%) (due to limitation of personal laptop)

In [11]:
# Randomly sample 1% of the ratings dataset
small_data = ratings.sample(frac=0.02)
# Check the sample info
print(small_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2781 to 1405
Data columns (total 3 columns):
user_id     150 non-null int64
movie_id    150 non-null int64
rating      150 non-null int64
dtypes: int64(3)
memory usage: 4.7 KB
None


In [12]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)

In [13]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix[:4, :4])

(120, 3)
[[26 63  5]
 [53 57  3]
 [ 3 64  4]
 [66 43  4]]


In [20]:
print(ratings.shape)
print(ratings.head(5))

(7512, 3)
   user_id  movie_id  rating
0        1         1       3
1        1         3       5
2        1         4       4
3        1         5       3
4        1         6       3


In [35]:
1 - pairwise_distances(train_data_matrix.T, metric='correlation')

array([[ 1.        , -0.0027764 ,  0.17886317],
       [-0.0027764 ,  1.        , -0.21075292],
       [ 0.17886317, -0.21075292,  1.        ]])

Now I use the **pairwise_distances** function from sklearn [Pearson Correlation Coefficient](https://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity). This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array.

In [15]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[ 1.          0.72329023  0.95281888  0.368579  ]
 [ 0.72329023  1.          0.89877197  0.90851698]
 [ 0.95281888  0.89877197  1.          0.6333582 ]
 [ 0.368579    0.90851698  0.6333582   1.        ]]


In [16]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.         -0.0027764   0.17886317]
 [-0.0027764   1.         -0.21075292]
 [ 0.17886317 -0.21075292  1.        ]]


In [24]:
user_correlation[0]

array([ 1.        ,  0.72329023,  0.95281888,  0.368579  ,  0.96847351,
        0.9341167 ,  0.9586902 ,  0.86894762, -0.00716502, -0.05882966,
        0.55891052,  0.96855523,  0.50213017,  0.99894009,  0.98026905,
        0.94036234,  0.98250409,  0.71583578,  0.39991918,  0.85743236,
       -0.1719014 ,  0.09629558,  0.32564186,  0.98242496,  0.79054865,
        0.96768449,  0.97906829, -0.0445691 ,  0.87582989,  0.41036869,
        0.75568298,  0.77795389, -0.05016808,  0.17225581,  0.0055357 ,
        0.71490552,  0.97755404,  0.25951408,  0.67789965,  0.99979012,
        0.76862471,  0.42081217,  0.99330945,  0.1611823 ,  0.98180887,
        0.84715615,  0.99823489,  0.50574087,  0.93831943,  0.96000732,
        0.94127972,  0.62877999,  0.97773686,  0.88718325,  0.67250901,
        0.92326786,  0.71904396,  0.99927757,  0.46090518, -0.16526317,
        0.89615357,  0.79693137,  0.368579  ,  0.96959495,  0.32401391,
        0.94400044,  0.98934362,  0.97137818,  0.99173918, -0.06

In [32]:
item_correlation

array([[ 1.        , -0.0027764 ,  0.17886317],
       [-0.0027764 ,  1.        , -0.21075292],
       [ 0.17886317, -0.21075292,  1.        ]])

With the similarity matrix in hand, I can now predict the ratings that were not included with the data. Using these predictions, I can then compare them with the test data to attempt to validate the quality of our recommender model.

For the user-user CF case, I will look at the similarity between 2 users (A and B, for example) as weights that are multiplied by the ratings of a similar user B (corrected for the average rating of that user).

In [37]:
#TODO: Function to predict ratings
def user_mean(user_id):
    user_rating = []
    for row in ratings:
        if row[1] == user_id:
            user_rating.append(row[2])
    return np.mean(user_rating)

def predict(ratings, similarity, type='user'):
    pred = np.zeros(ratings.shape)
    if type == 'user':
        for i, rating in enumerate(ratings):
            user_id = rating[0]
            movie_id = rating[1]
            
            u_mean = user_mean(user_id)
            
            pred_rating = u_mean + ( (np.sum(np.dot(similarity[i][i:], np.subtract(rating[rating[i] != user_id][movie_id], user_mean(rating[i]))))) 
                             / (np.sum(similarity[i][i:])) )
            
            pred[i] = [user_id, movie_id, pred_rating]
#     elif type == 'item':
#         for i, ratingin enumerate(ratings):
            
    return pred

In [38]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    print(pred)
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [39]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
# item_prediction = predict(train_data_matrix, item_correlation, type='item')

# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
# print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


IndexError: index 54 is out of bounds for axis 0 with size 0