## importing

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('data/ml-latest-small/ml-latest-small/movies.csv')
ratings = pd.read_csv('data/ml-latest-small/ml-latest-small/ratings.csv')

## recommendations using correlations

In [3]:
df = pd.merge(movies, ratings, on = 'movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [18]:
# df = df[['movieId', 'userId', 'rating']]

# n_movies = df.movieId.nunique()
# n_users = df.userId.nunique()

# train_data_matrix = np.zeros((n_users, n_movies))
# test_data_matrix = np.zeros((n_users, n_movies))

# train_data, test_data = train_test_split(df, train_size = 0.25)

# user_ids = np.sort(df.userId.unique())
# movie_ids = np.sort(df.movieId.unique())

# for row in train_data.itertuples():
#     movie_id = np.where(movie_ids == row[1])
#     user_id = np.where(user_ids == row[2])
#     train_data_matrix[user_id, movie_id] = row[3]
    
# for row in test_data.itertuples():
#     movie_id = np.where(movie_ids == row[1])
#     user_id = np.where(user_ids == row[2])
#     test_data_matrix[user_id, movie_id] = row[3] 

In [59]:
correlations = pd.pivot_table(data = df, index = 'userId', columns = 'movieId', values = 'rating').corr()
correlations.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.330978,0.487109,1.0,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,,,,,,,,,,
2,0.330978,1.0,0.419564,,0.562791,0.16351,0.430261,0.415227,0.27735,0.016626,...,,,,,,,,,,
3,0.487109,0.419564,1.0,,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,,,,,,,,,,
4,1.0,,,1.0,0.654654,,0.203653,,,0.870388,...,,,,,,,,,,
5,0.310971,0.562791,0.602266,0.654654,1.0,0.291302,0.609119,0.555556,0.319173,0.218263,...,,,,,,,,,,


In [115]:
def recomended_movies(movieId):
    recom_movies = correlations[movieId].sort_values(ascending = False).dropna()
    
    # mean rating of movies with at least 25 ratings
    no_ratings = ratings.movieId.value_counts()
    no_ratings = no_ratings[no_ratings >= 25]
    movies_lot_ratings = no_ratings.index
    movies_lot_ratings = ratings.set_index('movieId').loc[movies_lot_ratings].reset_index()
    movies_lot_ratings = movies_lot_ratings[['movieId', 'rating']].groupby('movieId').mean().sort_values('rating', ascending = False)

    # check if there is at least 5 correlated movies (movies such that at least 1 person saw these movies and the movie given as argument to this function)
    if len(recom_movies) >= 5:
        ids = np.array(recom_movies.index[1:6])
        titles = movies.set_index('movieId').loc[ids].title.values
    else:
        ids = np.array(recom_movies.index[1:])
        titles = movies.set_index('movieId').loc[ids].title.values
        
        # if there is no 5 correlated movies other recommendations are best rated movies
        ids = np.concatenate((ids, movies_lot_ratings.index[ : 5 - len(ids)]))
        titles = np.concatenate((titles, movies.set_index('movieId').loc[ids].title.values))
    
    return {'movies_ids': ids, 'movies_titles': titles}

In [116]:
recomended_movies(49)

{'movies_ids': array([ 318,  922,  898,  475, 1204], dtype=int64),
 'movies_titles': array(['Shawshank Redemption, The (1994)',
        'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)',
        'Philadelphia Story, The (1940)',
        'In the Name of the Father (1993)', 'Lawrence of Arabia (1962)'],
       dtype=object)}

## collaborative filtering

In [10]:
df = pd.merge(movies, ratings, on = 'movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [13]:
# creating train_data_matrix and test_data_matrix, such that train_data_matrix[i, j] is a rating of user with id = user_ids[i] for movie with id = movie_ids[j]. 
# If rate = 0 then given user didn't see that movie.

n_users = df.userId.nunique()
n_items = df.movieId.nunique()

train_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = np.zeros((n_users, n_items))

train_data, test_data = train_test_split(df, test_size = 0.25)

user_ids = np.sort(df.userId.unique())
movie_ids = np.sort(df.movieId.unique())

train_data, test_data = train_test_split(df, test_size = 0.25)

for row in train_data.itertuples():
    user_id = np.where(user_ids == row[4])[0][0]
    movie_id = np.where(movie_ids == row[1])[0][0]
    rating = row[5]
    
    train_data_matrix[user_id, movie_id] = rating
    
for row in test_data.itertuples():
    user_id = np.where(user_ids == row[4])[0][0]
    movie_id = np.where(movie_ids == row[1])[0][0]
    rating = row[5]
    
    test_data_matrix[user_id, movie_id] = rating

In [14]:
print(train_data_matrix.shape)
print(test_data_matrix.shape)

(610, 9724)
(610, 9724)


In [15]:
# create similarity matrixes, user_similarities[i, j] is a similartiy between user with id user_ids[i] and user_ids[j], analogicaly for movies

user_similarities = cosine_similarity(train_data_matrix, train_data_matrix)
movie_similarities = cosine_similarity(train_data_matrix.T, train_data_matrix.T)

# make sure similarities have 1 at diagonal (the same movies or users should have similarity = 1 but vectors with only zeros [0, ..., 0] and [0, ..., 0] have similarity = 0)
for i in range(len(user_similarities)):
    user_similarities[i,i] = 1
    
for i in range(len(movie_similarities)):
    movie_similarities[i,i] = 1

In [65]:
print(user_similarities.shape)
print(movie_similarities.shape)

(610, 610)
(9724, 9724)


In [67]:
train_data_matrix.mean(axis = 1).shape

(610,)

In [150]:
# function for predicting ratings of users for movies

def predict(similarity, Type, ratings):
    if Type == 'user':
        mean_ratings = ratings.mean(axis = 1)
        mean_ratings = mean_ratings.reshape(len(mean_ratings), 1)
        
        diff_ratings = ratings - mean_ratings
        
        pred = mean_ratings + np.dot(similarity, diff_ratings) / np.array([np.abs(similarity).sum(axis = 1)]).T
        
    elif Type == 'item':
        pred = np.dot(similarity, ratings.T) / np.array([np.abs(similarity).sum(axis = 1)]).T
        
    return pred

In [119]:
# calculating root mean square error of predictions

def rmse(predictions, real_values):
    predictions = predictions.flatten()
    real_values = real_values.flatten()

    predictions = predictions[np.where(real_values != 0)[0]]
    real_values = real_values[np.where(real_values != 0)[0]]

    rmse = np.sqrt(((predictions - real_values) ** 2).sum() / len(predictions))
    
    return rmse

In [92]:
predictions = predict(user_similarities, 'user', train_data_matrix)

rmse(predictions, test_data_matrix)

3.1807971312985766

In [171]:
predictions = predict(movie_similarities, 'item', train_data_matrix)

rmse(predictions, test_data_matrix)

3.588030486986965

In [92]:
# if Type = 'user', then this function recommend movies by finding similar user and his favourite movies
# if Type = 'movie' then this function recommend movies by finding most similar movies to the given movie

def recommend_movies(Id, Type):
    if Type == 'user':
        similarities = user_similarities
        
        similarity_scores = similarities[np.where(user_ids == Id), :]
        similarity_scores = pd.DataFrame({'score': similarity_scores.flatten(), 'userId': user_ids})

        similar_user = similarity_scores.sort_values(by = 'score', ascending = False).userId.iloc[1]
        recom_movies = df.set_index('userId').loc[similar_user].sort_values(by = 'rating', ascending = False)[['title', 'movieId']].iloc[ : 5]#.title.iloc[ : 5].values
        recom_movies = {'movie_id': recom_movies.movieId.values, 'titles': recom_movies.title.values}

        return recom_movies
    elif Type == 'movie':
        similarities = movie_similarities
        
        similarity_scores = similarities[np.where(movie_ids == Id), :]
        similarity_scores = pd.DataFrame({'score': similarity_scores.flatten(), 'movieId': movie_ids})

        recom_movie_ids = similarity_scores.sort_values(by = 'score', ascending = False).movieId.iloc[1 : 6].values
        recom_movie_titles = df.set_index('movieId').loc[recom_movie_ids].title.unique()

        recommendations = {'movie_ids': recom_movie_ids, 'movie_titles': recom_movie_titles}

        return recommendations

In [93]:
Id = 1
recommend_movies(Id, 'movie')

{'movie_ids': array([ 480,  648, 2115,  296,  780], dtype=int64),
 'movie_titles': array(['Jurassic Park (1993)', 'Mission: Impossible (1996)',
        'Indiana Jones and the Temple of Doom (1984)',
        'Pulp Fiction (1994)', 'Independence Day (a.k.a. ID4) (1996)'],
       dtype=object)}

In [94]:
Id = 2
recommend_movies(Id, 'user')

{'movie_id': array([79132, 89745, 79091, 88125, 92259], dtype=int64),
 'titles': array(['Inception (2010)', 'Avengers, The (2012)', 'Despicable Me (2010)',
        'Harry Potter and the Deathly Hallows: Part 2 (2011)',
        'Intouchables (2011)'], dtype=object)}

## SVD

In [19]:
df = pd.merge(ratings, movies, on = 'movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [20]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()

train_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = np.zeros((n_users, n_items))

user_ids = np.sort(df.userId.unique())
movie_ids = np.sort(df.movieId.unique())

train_data, test_data = train_test_split(df, test_size = 0.25)

for row in train_data.itertuples():
    user_id = np.where(user_ids == row[1])[0][0]
    movie_id = np.where(movie_ids == row[2])[0][0]
    rating = row[3]
    
    train_data_matrix[user_id, movie_id] = rating
    
for row in test_data.itertuples():
    user_id = np.where(user_ids == row[1])[0][0]
    movie_id = np.where(movie_ids == row[2])[0][0]
    rating = row[3]
    
    test_data_matrix[user_id, movie_id] = rating

In [21]:
train_data_matrix.shape

(610, 9724)

In [22]:
def rmse(predictions, real_values):
    predictions = predictions.flatten()
    real_values = real_values.flatten()
    
    predictions = predictions[np.where(real_values != 0)]
    real_values = real_values[np.where(real_values != 0)]
    
    rmse = np.sqrt(((predictions - real_values) ** 2).sum() / len(predictions))
    
    print(len(predictions))
    
    return rmse

In [23]:
u, s, vt = svds(train_data_matrix, k = 5)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))

25209
User-based CF RMSE: 3.0147315093540685
