## Data Preparation
Let's load this data into Python.

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [211]:
print(movies.shape)

(100, 3)


## Content-Based Recommendation Model

Computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

In [212]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
print(movies.shape)


(100, 3)


In [213]:
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"['Animation', ""Children's"", 'Comedy']"
1,2,Jumanji (1995),"['Adventure', ""Children's"", 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(100, 44)

In [215]:
tfidf_matrix

<100x44 sparse matrix of type '<class 'numpy.float64'>'
	with 231 stored elements in Compressed Sparse Row format>

In [216]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#cosine_sim[:4, :4]
cosine_sim

array([[1.        , 0.15337409, 0.12551391, ..., 0.        , 0.        ,
        0.        ],
       [0.15337409, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12551391, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.25861841],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.25861841, 0.        ,
        1.        ]])

In [0]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

#TODO: Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    title_idx = indices[title]
    title_similarities = cosine_sim[title_idx]
    title_rankings = pd.DataFrame([titles, title_similarities], index=['Title', 'Similarity']).transpose()
    return title_rankings.sort_values(by='Similarity', ascending=False)[1:]

Let's try and get the top recommendations for a few movies and see how good the recommendations are.

In [218]:
genre_recommendations('Toy Story (1995)').head(20)

Unnamed: 0,Title,Similarity
12,Balto (1995),0.819159
86,Dunston Checks In (1996),0.702985
33,Babe (1995),0.582071
47,Pocahontas (1995),0.50019
68,Friday (1995),0.275812
87,Black Sheep (1996),0.275812
37,It Takes Two (1995),0.275812
51,Mighty Aphrodite (1995),0.275812
64,Bio-Dome (1996),0.275812
18,Ace Ventura: When Nature Calls (1995),0.275812


## Collaborative Filtering Recommendation Model


Use the file **ratings.csv** first as it contains User ID, Movie IDs and Ratings. These three elements are all needed for determining the similarity of the users based on their ratings for a particular movie.


In [0]:
# Fill NaN values in user_id and movie_id column with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

Take a random sample of 20,000 ratings (2%) (due to limitation of personal laptop)

In [235]:
# Randomly sample 1% of the ratings dataset
small_data = ratings.sample(frac=0.02)
# Check the sample info
print(small_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2210 to 4126
Data columns (total 3 columns):
user_id     150 non-null int64
movie_id    150 non-null int64
rating      150 non-null int64
dtypes: int64(3)
memory usage: 4.7 KB
None


In [0]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)

In [248]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix[:4, :4])

(120, 3)
[[53 66  2]
 [67 91  3]
 [32 47  5]
 [52 32  2]]


  """Entry point for launching an IPython kernel.
  


Now I use the **pairwise_distances** function from sklearn [Pearson Correlation Coefficient](https://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity). This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array.

In [241]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[ 1.          0.79928029 -0.14215381  0.99594166]
 [ 0.79928029  1.          0.48123469  0.85012348]
 [-0.14215381  0.48123469  1.         -0.05248978]
 [ 0.99594166  0.85012348 -0.05248978  1.        ]]


In [268]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.         -0.03793381 -0.05572371]
 [-0.03793381  1.          0.04664773]
 [-0.05572371  0.04664773  1.        ]]


With the similarity matrix in hand, I can now predict the ratings that were not included with the data. Using these predictions, I can then compare them with the test data to attempt to validate the quality of our recommender model.

For the user-user CF case, I will look at the similarity between 2 users (A and B, for example) as weights that are multiplied by the ratings of a similar user B (corrected for the average rating of that user).

In [0]:
#TODO: Function to predict ratings
def predict(ratings, similarity, type='user'):
    pred = np.zeros(ratings.shape)
    if type == 'user':
        for row, rating in enumerate(ratings):
            pred[row] = [rating[0], rating[1], np.sum(np.dot(similarity[row][row:], ratings[:,2][row:])) / np.sum(similarity[row][row:])]
    elif type == 'item':
        print(similarity)
        for col, rating in enumerate(ratings.T):
            if col == 2:
              pred[:, col] = np.sum(np.dot(np.array([similarity[:, col]]).T, np.array([ratings[:, col]])), axis=0)
            else:
              pred[:, col] = ratings[:, col]
    return pred

In [0]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    print(pred)
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [324]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

[[ 1.         -0.03793381 -0.05572371]
 [-0.03793381  1.          0.04664773]
 [-0.05572371  0.04664773  1.        ]]
[18.          7.          3.42502478 83.         75.          3.4546123
  9.         56.          3.47073174 32.         15.          3.44142417
 67.         54.          3.43526702 49.         68.          3.42034417
 17.         80.          3.43189202 90.          1.          3.41684736
  7.         48.          3.43823792 78.         84.          3.40800938
 56.          2.          3.38300411 67.         71.          3.39807777
 87.          5.          3.37642037 59.          6.          3.40539732
 89.         37.          3.41349244  9.         30.          3.41143051
 33.         63.          3.428853   84.          9.          3.39673387
 79.         59.          3.42274225 36.         67.          3.44987917
 36.         62.          3.47000169 99.         12.          3.42976169
 38.         13.          3.4441307   9.         39.          3.4734629
 61.    