In [2]:
import random

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from scipy.spatial.distance import pdist, squareform
from scipy.stats import pearsonr
from sklearn.metrics import (jaccard_score, mean_absolute_error,
                             mean_squared_error, r2_score)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [42]:
def load_data(path):
    dataset = pd.read_csv(path)
    return dataset


def merge_data(items, ratings, commun_feature):
    return pd.merge(items, ratings, on=commun_feature)


def create_ratings_matrix(merged_data, index, columns, values):
    return merged_data.pivot_table(index=index, columns=columns, values=values)


def fill_ratings_matrix(ratings_matrix):
    avg_ratings = ratings_matrix.mean(axis=1)
    ratings_matrix_centered = ratings_matrix.sub(avg_ratings, axis=0)
    ratings_matrix_centered_normed = ratings_matrix_centered.fillna(0)
    return ratings_matrix_centered_normed


def cosine_similarity_dataframe(ratings_matrix_filled):
    similarities = cosine_similarity(ratings_matrix_filled)
    return pd.DataFrame(
        similarities,
        index=ratings_matrix_filled.index,
        columns=ratings_matrix_filled.index,
    )


def get_nearest_neighbors(user_id, cosine_similarity_df, ratings_matrix):
    user_similarity_series = cosine_similarity_df.loc[user_id]
    ordered_similarities = user_similarity_series.sort_values(ascending=False)
    nearest_neighbors = ordered_similarities[1:11].index
    return nearest_neighbors


def get_neighbor_ratings(nearest_neighbors, ratings_matrix):
    neighbor_ratings = ratings_matrix.reindex(nearest_neighbors)
    return neighbor_ratings


def calculate_mean_neighbor_rating(neighbor_ratings, item_id):
    return neighbor_ratings[item_id].mean()


def predict_user_rating(user_id, item_id, cosine_similarity_df, ratings_matrix):
    nearest_neighbors = get_nearest_neighbors(
        user_id, cosine_similarity_df, ratings_matrix
    )
    neighbor_ratings = get_neighbor_ratings(nearest_neighbors, ratings_matrix)
    return calculate_mean_neighbor_rating(neighbor_ratings, item_id)


# using knn
def train_user_knn_model(ratings_matrix, user_id, item_id):
    ratings_matrix_filled_copy = fill_ratings_matrix(
        ratings_matrix
    ).copy()  # Make a copy of the DataFrame
    ratings_matrix_filled_copy.drop(item_id, axis=1, inplace=True)
    target_user_x = ratings_matrix_filled_copy.loc[[user_id]]
    other_users_y = ratings_matrix[item_id]
    other_users_x = ratings_matrix_filled_copy[other_users_y.notnull()]
    other_users_y.dropna(inplace=True)
    user_knn = KNeighborsRegressor(
        metric="cosine", n_neighbors=min(30, len(other_users_x))
    )
    user_knn.fit(other_users_x, other_users_y)
    user_user_pred = user_knn.predict(target_user_x)
    return user_user_pred[0]


# using SVD
def svd_matrix_decomposition(ratings_matrix_filled):
    U, sigma, Vt = svds(ratings_matrix_filled.values)
    return U, sigma, Vt


def recreate_ratings_matrix(ratings_matrix, U, sigma, Vt):
    sigma = np.diag(sigma)
    U_sigma = np.dot(U, sigma)
    U_sigma_Vt = np.dot(U_sigma, Vt)
    avg_ratings = ratings_matrix.mean(axis=1)
    uncentered_ratings = U_sigma_Vt + avg_ratings.values.reshape(-1, 1)
    pred_ratings_df = pd.DataFrame(
        uncentered_ratings, index=ratings_matrix.index, columns=ratings_matrix.columns
    )
    return pred_ratings_df


# content-based
def create_genre_matrix(movies):
    movies["genres_list"] = movies["genres"].apply(lambda x: x.split("|"))
    unique_genres = set()
    for genres in movies["genres_list"]:
        unique_genres.update(genres)
    matrix = pd.DataFrame(
        0, index=movies["movieId"], columns=list(unique_genres)
    )  # Convert set to list
    for index, row in movies.iterrows():
        movie_id = row["movieId"]
        for genre in row["genres_list"]:
            matrix.loc[movie_id, genre] = 1

    return matrix


def jaccard_dataframe(genres_matrix):
    jaccard_distances = pdist(genres_matrix.values, metric="jaccard")
    jaccard_similarity_array = 1 - squareform(jaccard_distances)
    jaccard_similarity_df = pd.DataFrame(
        jaccard_similarity_array, index=genres_matrix.index, columns=genres_matrix.index
    )
    return jaccard_similarity_df


def get_top_recommendations(jaccard_similarity_df, movie_id, k):
    similarities = jaccard_similarity_df.loc[movie_id]
    top_similarities = similarities.sort_values(ascending=False).head(k + 1)
    result = [get_title_by_id(i) for i in top_similarities[1:].index]
    return result


def retrieve_user_ratings(user_id, ratings_matrix):
    # Retrieve the user's ratings as a Series
    user_ratings = ratings_matrix.loc[user_id]

    # Convert the Series to a dictionary
    ratings_dict = user_ratings.to_dict()

    return ratings_dict


def get_rating(user_id, item, ratings_matrix):
    return ratings_matrix.loc[user_id, item]


def get_title_by_id(movie_id):
    return movies.loc[movies["movieId"] == movie_id, "title"].values[0]


def get_last_user_id(ratings_matrix):
    return ratings_matrix.index[-1]


def get_num_items_rated(ratings_matrix, user_id):
    user_ratings = ratings_matrix.loc[user_id]
    num_items_rated = user_ratings.notnull().sum()
    return num_items_rated


def get_top_k_ratings(user_id, mat_dataframe, k):
    user_ratings = mat_dataframe.loc[user_id]
    top_k_ratings = user_ratings.sort_values(ascending=False).head(k)
    top_k_movie_ids = top_k_ratings.index.tolist()
    return top_k_movie_ids


def get_item_details(dataset, item_id):
    keys = ["movie_id", "title", "genres"]
    arr = dataset.iloc[item_id].values
    return dict(zip(keys, [int(arr[0]), arr[1], arr[2]]))


def get_average_rating(ratings_matrix, item_id):
    return ratings_matrix[item_id].mean()

In [43]:
movies = load_data("data/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
movies

{'movie_id': 2, 'title': 'Jumanji (1995)', 'genres': 'Adventure|Children|Fantasy'}


In [5]:
ratings = load_data("data/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
highest_value = ratings['rating'].max()
lowest_value = ratings['rating'].min()
print(highest_value, lowest_value)

5.0 0.5


In [7]:
merged_data = merge_data(ratings, movies,'movieId')
merged_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [45]:
ratings_matrix = create_ratings_matrix(merged_data,"userId",'movieId','rating')
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [55]:
average_rating = ratings_matrix[2].mean()
average_rating

np.float64(3.4318181818181817)

In [9]:
ratings_matrix_filled = fill_ratings_matrix(ratings_matrix)
ratings_matrix_filled

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.000000,-0.366379,0.0,0.0,-0.366379,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,0.000000,0.000000,0.0,0.0,0.000000,-1.157399,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.213904,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-0.634176,-1.134176,-1.134176,0.0,0.0,0.000000,0.000000,0.0,0.0,0.865824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.270270,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.729730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
cosine_similarity_df = cosine_similarity_dataframe(ratings_matrix_filled)
cosine_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.006200,0.047013,0.019510,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.000000,0.000000,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.000000,0.003012,...,-0.050551,-0.031581,-0.001688,0.000000,0.000000,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.000000,1.000000,-0.011260,-0.031539,0.004800,0.000000,-0.032471,0.000000,0.000000,...,-0.004904,-0.016117,0.017749,0.000000,-0.001431,-0.037289,-0.007789,-0.013001,0.000000,0.019550
4,0.048419,-0.017164,-0.011260,1.000000,-0.029620,0.013956,0.058091,0.002065,-0.005874,0.051590,...,-0.037687,0.063122,0.027640,-0.013782,0.040037,0.020590,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.029620,1.000000,0.009111,0.010117,-0.012284,0.000000,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.012016,0.006226,-0.037289,0.020590,0.026319,-0.009137,0.028326,0.022277,0.031633,-0.039946,...,0.053683,0.016384,0.098011,0.061078,0.019678,1.000000,0.017927,0.056676,0.038422,0.075464
607,0.055261,-0.020504,-0.007789,0.014628,0.031896,0.045501,0.030981,0.048822,-0.012161,-0.017656,...,0.049059,0.038197,0.049317,0.002355,-0.029381,0.017927,1.000000,0.044514,0.019049,0.021860
608,0.075224,-0.006001,-0.013001,-0.037569,-0.001751,0.021727,0.028414,0.071759,0.032783,-0.052000,...,0.069198,0.051388,0.012801,0.006319,-0.007978,0.056676,0.044514,1.000000,0.050714,0.054454
609,-0.025713,-0.060091,0.000000,-0.017884,0.093829,0.053017,0.008754,0.077180,0.000000,-0.040090,...,0.043465,0.062400,0.015334,0.094038,-0.054722,0.038422,0.019049,0.050714,1.000000,-0.012471


In [11]:
column_ids = ratings_matrix.columns.tolist()
row_ids = ratings_matrix.index.tolist()

In [12]:
user_id_test = random.choice(row_ids)
item_id_test = random.choice(column_ids)
print(user_id_test, item_id_test)

205 2882


In [13]:
knn_result = train_user_knn_model(ratings_matrix, 1, 2)
knn_result

np.float64(3.1)

In [14]:
#predict_user_rating_with_knn(knn[0], knn[1])

In [15]:
U, sigma, Vt = svd_matrix_decomposition(ratings_matrix_filled)

In [16]:
res = recreate_ratings_matrix(ratings_matrix, U, sigma, Vt)

In [17]:
res

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.462314,4.349185,4.389132,4.355828,4.304889,4.379954,4.339853,4.353081,4.371022,4.304191,...,4.366366,4.366333,4.366399,4.366399,4.366366,4.366399,4.366366,4.366366,4.366366,4.366969
2,3.947867,3.948730,3.943345,3.948975,3.954781,3.955014,3.950488,3.949590,3.946458,3.960781,...,3.948267,3.948247,3.948288,3.948288,3.948267,3.948288,3.948267,3.948267,3.948267,3.948270
3,2.406305,2.439618,2.420748,2.440964,2.468875,2.461135,2.451152,2.439374,2.424418,2.481790,...,2.435862,2.435774,2.435949,2.435949,2.435862,2.435949,2.435862,2.435862,2.435862,2.435745
4,3.616146,3.556886,3.576326,3.551370,3.500886,3.497739,3.521390,3.555289,3.562753,3.490580,...,3.555621,3.555780,3.555462,3.555462,3.555621,3.555462,3.555621,3.555621,3.555621,3.555394
5,3.653970,3.633074,3.646911,3.631662,3.620151,3.628583,3.628683,3.635113,3.640487,3.596743,...,3.636387,3.636443,3.636330,3.636330,3.636387,3.636330,3.636387,3.636387,3.636387,3.636556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.897556,3.566707,3.688992,3.599346,3.434501,3.736443,3.544275,3.646346,3.650694,3.282278,...,3.657630,3.658191,3.657068,3.657068,3.657630,3.657068,3.657630,3.657630,3.657630,3.659773
607,3.818336,3.773703,3.799590,3.778775,3.742178,3.779247,3.769289,3.778643,3.796693,3.733471,...,3.786128,3.786204,3.786051,3.786051,3.786128,3.786051,3.786128,3.786128,3.786128,3.786412
608,3.446540,2.838885,2.682081,3.117258,2.706645,3.795240,2.909024,3.114491,2.974129,3.601215,...,3.134140,3.134052,3.134227,3.134227,3.134140,3.134227,3.134140,3.134140,3.134140,3.134150
609,3.286156,3.265451,3.271643,3.267343,3.260062,3.277101,3.266075,3.268635,3.272298,3.258482,...,3.270272,3.270277,3.270267,3.270267,3.270272,3.270267,3.270272,3.270272,3.270272,3.270444


In [18]:
retrieve_user_ratings(1, res)

{1: 4.462313662123096,
 2: 4.349184760846742,
 3: 4.3891316842179595,
 4: 4.355828353129003,
 5: 4.304888795824397,
 6: 4.379954102943742,
 7: 4.339852523940194,
 8: 4.353080996340178,
 9: 4.371022439938943,
 10: 4.304191356317446,
 11: 4.37188295180519,
 12: 4.363858349060715,
 13: 4.361602865950362,
 14: 4.373337522852629,
 15: 4.353424086512787,
 16: 4.4315291866177144,
 17: 4.393346940682478,
 18: 4.368322340288959,
 19: 4.281451114465518,
 20: 4.373323691439459,
 21: 4.362719577121527,
 22: 4.342648750233089,
 23: 4.3528204509282205,
 24: 4.34546864036523,
 25: 4.3846395191285765,
 26: 4.37544213772146,
 27: 4.340233235244234,
 28: 4.3697950242101165,
 29: 4.394351846085684,
 30: 4.368750951557443,
 31: 4.345364746496489,
 32: 4.511604435496003,
 34: 4.420093355390348,
 36: 4.379646505653235,
 38: 4.3618712013444885,
 39: 4.345884691109838,
 40: 4.366177811435544,
 41: 4.3798578755765405,
 42: 4.335548371629648,
 43: 4.360243297169251,
 44: 4.291331880982589,
 45: 4.35853190014440

In [31]:
top_recommendations = dict(sorted(retrieve_user_ratings(1, res).items(), key=lambda item: item[1], reverse=True))
top_recommendations


{296: 4.656915869403036,
 318: 4.636321517285147,
 527: 4.596005630061804,
 858: 4.591951381302168,
 356: 4.591608540340061,
 2959: 4.577183264638542,
 50: 4.569429385573581,
 593: 4.5677691130928535,
 260: 4.556228880250602,
 608: 4.550752412301105,
 2858: 4.541681068376915,
 1196: 4.539831256973038,
 2571: 4.535721126387303,
 1221: 4.5300582062439885,
 1197: 4.526858927113387,
 1136: 4.526260940687005,
 1198: 4.520567070607834,
 1270: 4.518606049578228,
 4226: 4.515735274475243,
 32: 4.511604435496003,
 1208: 4.509297106573461,
 904: 4.505491316118424,
 7361: 4.505075184021263,
 1089: 4.502023185599433,
 110: 4.500216972765694,
 912: 4.499169314807747,
 1213: 4.498833338421561,
 541: 4.49702688870765,
 2997: 4.49593193032172,
 4973: 4.4946487341004,
 2028: 4.494533335256532,
 778: 4.49403189394354,
 4993: 4.490207327410199,
 1193: 4.4898079334405985,
 924: 4.4873685590722685,
 5618: 4.487170941810781,
 1210: 4.487091730998728,
 47: 4.486225413322909,
 7153: 4.485992916033053,
 2329: 

In [20]:
get_rating(user_id_test, item_id_test, ratings_matrix)

np.float64(nan)

In [21]:
get_rating(user_id_test, item_id_test, res)

np.float64(3.869911077198797)

In [22]:
train_user_knn_model(ratings_matrix, user_id_test, item_id_test)


np.float64(2.0)

In [23]:
genres_matrix = create_genre_matrix(movies)
genres_matrix

Unnamed: 0_level_0,IMAX,Romance,War,(no genres listed),Fantasy,Horror,Children,Western,Thriller,Sci-Fi,Adventure,Film-Noir,Comedy,Musical,Action,Documentary,Crime,Animation,Mystery,Drama
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
193583,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
193585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
193587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [24]:
jaccard_similarity_df = jaccard_dataframe(genres_matrix)
print(jaccard_similarity_df.head())

movieId    1       2         3         4         5       6         7       \
movieId                                                                     
1        1.000000     0.6  0.166667  0.142857  0.200000     0.0  0.166667   
2        0.600000     1.0  0.000000  0.000000  0.000000     0.0  0.000000   
3        0.166667     0.0  1.000000  0.666667  0.500000     0.0  1.000000   
4        0.142857     0.0  0.666667  1.000000  0.333333     0.0  0.666667   
5        0.200000     0.0  0.500000  0.333333  1.000000     0.0  0.500000   

movieId    8       9         10      ...    193565    193567    193571  \
movieId                              ...                                 
1        0.400000     0.0  0.142857  ...  0.285714  0.166667  0.166667   
2        0.666667     0.0  0.200000  ...  0.000000  0.000000  0.000000   
3        0.000000     0.0  0.000000  ...  0.200000  0.000000  0.333333   
4        0.000000     0.0  0.000000  ...  0.166667  0.250000  0.666667   
5        0.00000

In [25]:
jaccard_similarity_df.values

array([[1.        , 0.6       , 0.16666667, ..., 0.        , 0.16666667,
        0.2       ],
       [0.6       , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16666667, 0.        , 1.        , ..., 0.        , 0.        ,
        0.5       ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.16666667, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.2       , 0.        , 0.5       , ..., 0.        , 0.        ,
        1.        ]])

In [26]:
top_recommendations = get_top_recommendations(jaccard_similarity_df, 5, 50)
print(top_recommendations)

['Entourage (2015)', 'Jeff Ross Roasts the Border (2017)', 'No Way Jose (2015)', 'Father of the Bride Part II (1995)', 'Ghost Graduation (2012)', 'Four Rooms (1995)', 'Andrew Dice Clay: Dice Rules (1991)', 'Love and Death (1975)', 'Ace Ventura: When Nature Calls (1995)', 'Eddie (1996)', 'Kids in the Hall: Brain Candy (1996)', 'Cop and ½ (1993)', 'Best in Show (2000)', 'Rebound (2005)', 'Ishtar (1987)', 'Tammy (2014)', 'Distinguished Gentleman, The (1992)', 'Like Father, Like Son (1987)', 'Loser (1991)', 'Tapeheads (1988)', 'My Man Godfrey (1957)', 'Parental Guidance (2012)', 'Bikini Beach (1964)', 'Hopscotch (1980)', 'Fred Armisen: Standup for Drummers (2018)', 'Tampopo (1985)', 'Designing Woman (1957)', 'Tom Segura: Disgraceful (2018)', 'When We First Met (2018)', 'Craig Ferguson: Tickle Fight (2017)', 'Maz Jobrani: Immigrant (2017)', 'Sex Tape (2014)', 'Mr. Wrong (1996)', 'Happy Gilmore (1996)', 'Confessions of a Teenage Drama Queen (2004)', 'Lady Bird (2017)', 'Madhouse (1990)', 'Fr

In [27]:
len(top_recommendations)

50

In [28]:
def collaborative_filtering(id_user):
    movies = load_data("data/movies.csv")
    ratings = load_data("data/ratings.csv")
    merged_data = merge_data(ratings, movies,'movieId')
    ratings_matrix = create_ratings_matrix(merged_data,"userId",'movieId','rating')

    if get_num_items_rated(ratings_matrix, id_user) > 20:
        ratings_matrix_filled = fill_ratings_matrix(ratings_matrix)
        cosine_similarity_df = cosine_similarity_dataframe(ratings_matrix_filled)
        column_ids = ratings_matrix.columns.tolist()

        #prediction using Matrix Factorization(SVD)
        U, sigma, Vt = svd_matrix_decomposition(ratings_matrix_filled)
        res = recreate_ratings_matrix(ratings_matrix, U, sigma, Vt)
        movies_ids_res = get_top_k_ratings(id_user,res, k = 20)
        L_svd_items = [get_title_by_id(i) for i in movies_ids_res]
        return L_svd_items
    else:
        return 'None'

In [29]:
result = collaborative_filtering(234)
result

['Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Shawshank Redemption, The (1994)',
 'Silence of the Lambs, The (1991)',
 'Matrix, The (1999)',
 'Back to the Future (1985)',
 'Princess Bride, The (1987)',
 "Schindler's List (1993)",
 'Lord of the Rings: The Return of the King, The (2003)',
 'Godfather, The (1972)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Fugitive, The (1993)',
 'Forrest Gump (1994)',
 'Indiana Jones and the Last Crusade (1989)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Monty Python and the Holy Grail (1975)',
 'Shrek (2001)',
 'Spider-Man 2 (2004)']