## Data Preparation
Let's load this data into Python.

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [7]:
print(movies.shape)

(100, 3)


## Content-Based Recommendation Model

Computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

In [17]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
print(movies.shape)
movies

(100, 3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"
6,7,Sabrina (1995),"[Comedy, Romance]"
7,8,Tom and Huck (1995),"[Adventure, Children's]"
8,9,Sudden Death (1995),[Action]
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]"


In [18]:
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"['Animation', ""Children's"", 'Comedy']"
1,2,Jumanji (1995),"['Adventure', ""Children's"", 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(100, 44)

In [20]:
tfidf_matrix

<100x44 sparse matrix of type '<class 'numpy.float64'>'
	with 231 stored elements in Compressed Sparse Row format>

In [35]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#cosine_sim[:4, :4]
cosine_sim[0]

array([1.        , 0.15337409, 0.12551391, 0.13826443, 0.27581218,
       0.        , 0.12551391, 0.21631224, 0.        , 0.        ,
       0.        , 0.09046197, 0.81915922, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.27581218, 0.        ,
       0.0902357 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.58207107, 0.        ,
       0.        , 0.        , 0.27581218, 0.12551391, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.13826443,
       0.        , 0.        , 0.50018952, 0.        , 0.        ,
       0.        , 0.27581218, 0.        , 0.        , 0.        ,
       0.15337409, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.27581218, 0.12551391, 0.27581218,
       0.        , 0.        , 0.12551391, 0.27581218, 0.05254773,
       0.        , 0.13826443, 0.        , 0.        , 0.13826

In [81]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

#TODO: Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations_tmp(title):
    sim = pd.Series(cosine_sim[indices[title]]).sort_values(ascending=False)
    
    return titles[sim[1:].index]

In [82]:
genre_recommendations_tmp('Toy Story (1995)')

12                                         Balto (1995)
86                             Dunston Checks In (1996)
33                                          Babe (1995)
47                                    Pocahontas (1995)
51                              Mighty Aphrodite (1995)
95                        In the Bleak Midwinter (1995)
62    Don't Be a Menace to South Central While Drink...
4                    Father of the Bride Part II (1995)
64                                      Bio-Dome (1996)
68                                        Friday (1995)
18                Ace Ventura: When Nature Calls (1995)
87                                   Black Sheep (1996)
37                                  It Takes Two (1995)
7                                   Tom and Huck (1995)
1                                        Jumanji (1995)
55                       Kids of the Round Table (1995)
71                         Kicking and Screaming (1995)
74                                     Big Bully

Let's try and get the top recommendations for a few movies and see how good the recommendations are.

In [9]:
genre_recommendations('Toy Story (1995)').head(20)

12                                         Balto (1995)
86                             Dunston Checks In (1996)
33                                          Babe (1995)
47                                    Pocahontas (1995)
4                    Father of the Bride Part II (1995)
18                Ace Ventura: When Nature Calls (1995)
37                                  It Takes Two (1995)
51                              Mighty Aphrodite (1995)
62    Don't Be a Menace to South Central While Drink...
64                                      Bio-Dome (1996)
68                                        Friday (1995)
87                                   Black Sheep (1996)
95                        In the Bleak Midwinter (1995)
7                                   Tom and Huck (1995)
1                                        Jumanji (1995)
55                       Kids of the Round Table (1995)
3                              Waiting to Exhale (1995)
44                                    To Die For

## Collaborative Filtering Recommendation Model


Use the file **ratings.csv** first as it contains User ID, Movie IDs and Ratings. These three elements are all needed for determining the similarity of the users based on their ratings for a particular movie.


In [43]:
ratings.isnull().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

In [83]:
# Fill NaN values in user_id and movie_id column with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

In [22]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3
5,1,7,3
6,1,9,3
7,1,10,2
8,1,11,4
9,1,12,4


Take a random sample of 20,000 ratings (2%) (due to limitation of personal laptop)

In [4]:
# Randomly sample 1% of the ratings dataset
small_data = ratings.sample(frac=0.02)
# Check the sample info
print(small_data.info())
small_data.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2553 to 1650
Data columns (total 3 columns):
user_id     150 non-null int64
movie_id    150 non-null int64
rating      150 non-null int64
dtypes: int64(3)
memory usage: 4.7 KB
None


(150, 3)

In [5]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)

In [11]:
# Create two user-item matrices, one for training and another for testing
# train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
# test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

# as_matrix Deprecated
train_data_matrix = train_data.values
test_data_matrix = test_data.values

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix[:4, :4])

(120, 3)
[[33  2  4]
 [ 9 32  2]
 [53 72  1]
 [53 70  4]]


Now I use the **pairwise_distances** function from sklearn [Pearson Correlation Coefficient](https://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity). This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array.

In [19]:
from sklearn.metrics.pairwise import pairwise_distances
print(train_data.shape)
# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
print(user_correlation.shape)
print(user_correlation[1])
user_correlation[np.isnan(user_correlation)] = 0
# print(user_correlation[:4, :4])

(120, 3)
(120, 120)
[0.8925696  1.         0.98652387 0.9607334  0.9466136  0.71461779
 0.80386908 0.77558693 0.83381749 0.7535438  0.89556502 0.79301145
 0.44157249 0.75585962 0.84076305 0.45431157 0.99848993 0.87107937
 0.67102488 0.67266891 0.70685139 0.92023451 0.36279207 0.93258807
 0.4662081  0.69362538 0.31668542 0.85488633 0.92517962 0.88625701
 0.95184201 0.83212246 0.97077214 0.80337077 0.433358   0.78771673
 0.65436661 0.94220993 0.81989645 0.60126277 0.67102488 0.93600853
 0.58560135 0.90422041 0.41039837 0.844551   0.83873403 0.32464706
 0.74928928 0.99952158 0.78016751 0.99409629 0.46289853 0.68021095
 0.60126277 0.81545637 0.91463302 0.66796246 0.96351138 0.77064322
 0.91800455 0.93298815 0.83495511 0.99529623 0.86088137 0.7109328
 0.88555409 0.43378832 0.71315606 0.49114227 0.99964629 0.93600853
 0.41491411 0.66365408 0.94220993 0.6322276  0.5557509  0.25916784
 0.82018781 0.91356294 0.98769125 0.99212213 0.41374901 0.98472669
 0.767545   0.39475689 0.51784532 0.8279320

In [91]:
len(user_correlation[1])

120

In [21]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(train_data_matrix.shape)
print(item_correlation.shape)
print(item_correlation[:4, :4])

(120, 3)
(3, 3)
[[ 1.          0.00988286 -0.08155195]
 [ 0.00988286  1.          0.10771079]
 [-0.08155195  0.10771079  1.        ]]


In [72]:
# print(test_data_matrix)
# print(type(test_data_matrix))
# print(test_data_matrix.nonzero())
# print(test_data_matrix[test_data_matrix.nonzero()].flatten())

# print(train_data_matrix)
# print(train_data_matrix[:,2][0:])
print(user_correlation[16])

# print(user_correlation[0][user_correlation[0] != 1])

[0.91599249 0.99848993 0.97604582 0.94403964 0.92747473 0.75196654
 0.83533087 0.73973853 0.80223204 0.71629175 0.91865492 0.75834717
 0.3916166  0.79068602 0.80975145 0.40468707 1.         0.84278308
 0.71074232 0.63100433 0.74464293 0.89734513 0.31105194 0.95100818
 0.41690446 0.73214962 0.26409968 0.82509427 0.90293307 0.91036426
 0.96724713 0.80039939 0.95612165 0.76944498 0.38319497 0.82036925
 0.61183786 0.92238263 0.8501093  0.55645893 0.71074232 0.95393097
 0.54018678 0.92631586 0.35968307 0.81385934 0.8075531  0.27219736
 0.71177749 0.99631316 0.81335513 0.99855567 0.41350452 0.71945204
 0.55645893 0.78242758 0.93546127 0.62607144 0.97676072 0.80448775
 0.89483274 0.91180777 0.86392599 0.98847125 0.88753262 0.67122565
 0.90973615 0.38363602 0.67356942 0.53825336 0.99667576 0.95393097
 0.36430437 0.70374547 0.92238263 0.5887102  0.50924153 0.20571848
 0.78752127 0.93452532 0.97760705 0.983742   0.36311189 0.99280427
 0.73117289 0.34368729 0.47006785 0.79587267 0.64264924 0.6836

With the similarity matrix in hand, I can now predict the ratings that were not included with the data. Using these predictions, I can then compare them with the test data to attempt to validate the quality of our recommender model.

For the user-user CF case, I will look at the similarity between 2 users (A and B, for example) as weights that are multiplied by the ratings of a similar user B (corrected for the average rating of that user).

In [20]:
def get_mean(ratings, user_id):
    return np.mean([row[2] for row in ratings if row[1] == user_id])

#TODO: Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        # similarity = user_correlation
        # ratings = train_data_matrix
        
        for index, row in enumerate(ratings):
            # Get some parameter
            user_id = row[0]
            movie_id = row[1] # item
            
            # Calculate mean (each row in ratings is an array of [user_id, movie_id, rating])
            curr_user_mean = get_mean(ratings, user_id)
            
            # Numerator (Sum of sim(u, v) * (Rating(v, i) - Mean(Rating(v))) )
            numerator = np.sum(np.dot(similarity[index][similarity[index] != 1], 
                                      np.substract([ratings[:, 2]
                                          
                                      ])))
            
            # Denominator
            denominator = np.sum(similarity[index][similarity[index] != 1]) # Remove itself
            
            # Calculate new_rating
            new_rating = curr_user_mean + (numerator / denominator)
            
            # Prediction for user (index) (user_id, movie_id, new_rating)
            pred[index] = [user_id, movie_id, new_rating] 
        
    elif type == 'item':
        ...
    return pred # Return a matrix like test_data_matrix

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    print(pred)
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [22]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

[ 78.31313638  49.0192993   16.66756433  73.86810119  69.88663803
  24.24526078  65.72009475  27.43945197   5.84045328  66.10521735
  33.77398224   4.12080041  36.50694765  56.33191642   4.16113592
  70.54550129  51.3892098   13.06528891  47.26743534  47.10863418
  -0.37606952  52.36926963  14.25204758  -7.62131721  40.02502532
  43.63962539  -5.66465071  69.13442201  42.71976656   8.14581143
  74.73830219  44.37177681  12.889921    37.22886257  60.87214129
   9.89899614  59.31285421  55.7700163    9.91712949  21.95451647
  43.80694408  -7.76146055  33.52400763  32.19922366 -14.72323129
  76.29899188  45.34442797  14.35658015  51.72412692  42.75641535
  -0.48054227  51.30431903  30.64818009  -6.95249912  65.51507241
  25.88638908   6.59853852  46.22979186  10.78081257 -15.01060444
  62.34586004  31.26134152   0.39279845  53.32368335  15.52721592
  -6.85089927  17.33526181  32.79428305 -20.12954486  43.53990616
  65.9297929   14.53030094  50.51489919  53.82008777   4.66501303
  68.95661