In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

# Import Data

In [2]:
movie_df = pd.read_csv('dataset/ml-lastest-small/movies.csv')
movie_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
rating_df = pd.read_csv('dataset/ml-lastest-small/ratings.csv')
rating_df.drop(columns='timestamp', inplace=True)
rating_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


# Data Prep

## Re-organize movieId

In [4]:
movie_id_label_encoder = LabelEncoder()
movie_df['movieId'] = movie_id_label_encoder.fit_transform(movie_df['movieId'])
movie_df

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Jumanji (1995),Adventure|Children|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,9739,Flint (2017),Drama
9740,9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
rating_df['movieId'] = movie_id_label_encoder.transform(rating_df['movieId'])
rating_df

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0
...,...,...,...
100831,610,9434,4.0
100832,610,9461,5.0
100833,610,9462,5.0
100834,610,9463,5.0


## Extract item features

In [6]:
item_df = movie_df['genres'].str.get_dummies('|')
item_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Create user item dataframe

In [19]:
user_item_df = rating_df.pivot(index='userId', columns='movieId', values='rating')

# Append non rating movies
non_rating_movie_ids = item_df.index[~item_df.index.isin(user_item_df.columns)]
user_item_df[non_rating_movie_ids] = np.nan

# Re-order movie id
user_item_df = user_item_df.loc[:, user_item_df.columns.sort_values()]

user_item_df

movieId,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


# Content-based RecSys

## Create Item Matrix

In [20]:
item_matrix = item_df.values
item_matrix

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Create User Item Matrix

In [21]:
user_item_matrix = user_item_df.values
user_item_matrix

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

## Create User Matrix

In [69]:
non_nan_user_item_matrix = np.nan_to_num(user_item_matrix, nan=0.0)
# user_norm = np.expand_dims(np.linalg.norm(non_nan_user_item_matrix, axis=1), axis=1)

user_matrix = non_nan_user_item_matrix.dot(item_matrix)
user_matrix = user_matrix / np.expand_dims(user_matrix.sum(axis=1), axis=1)
user_matrix

array([[0.        , 0.12838284, 0.12310231, ..., 0.07524752, 0.03267327,
        0.00990099],
       [0.        , 0.1489726 , 0.04280822, ..., 0.12671233, 0.01541096,
        0.0119863 ],
       [0.        , 0.19230769, 0.11538462, ..., 0.11153846, 0.00961538,
        0.        ],
       ...,
       [0.        , 0.12643048, 0.07990132, ..., 0.12553964, 0.00931954,
        0.00397451],
       [0.        , 0.11447811, 0.10774411, ..., 0.15488215, 0.04713805,
        0.01346801],
       [0.        , 0.13587591, 0.07222628, ..., 0.1330292 , 0.0129562 ,
        0.0090146 ]])

## Recommend

In [74]:
recommended_user_item_matrix = user_matrix.dot(item_matrix.T)
recommended_user_item_matrix

array([[0.41485149, 0.25280528, 0.15412541, ..., 0.10165017, 0.17326733,
        0.11716172],
       [0.13869863, 0.04280822, 0.11130137, ..., 0.2260274 , 0.1489726 ,
        0.09589041],
       [0.21923077, 0.17692308, 0.04423077, ..., 0.04615385, 0.2       ,
        0.03461538],
       ...,
       [0.31186185, 0.1552114 , 0.17508394, ..., 0.13191256, 0.1499349 ,
        0.13314603],
       [0.22558923, 0.13804714, 0.13131313, ..., 0.21548822, 0.12457912,
        0.07744108],
       [0.25748175, 0.12675182, 0.14434307, ..., 0.13547445, 0.15467153,
        0.11193431]])

In [96]:

def recommend_movies(user_id, k=10):
    item_scores = recommended_user_item_matrix[user_id]
    sorted_indexes = np.argsort(item_scores)[::-1]
    sorted_item_scores = item_scores[sorted_indexes]
    return movie_df.iloc[sorted_indexes[:k]]
recommend_movies(user_id=78, k=10)

Unnamed: 0,movieId,title,genres
7441,7441,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...
7170,7170,Aelita: The Queen of Mars (Aelita) (1924),Action|Adventure|Drama|Fantasy|Romance|Sci-Fi|...
3608,3608,"Stunt Man, The (1980)",Action|Adventure|Comedy|Drama|Romance|Thriller
6358,6358,Blood Diamond (2006),Action|Adventure|Crime|Drama|Thriller|War
400,400,"Getaway, The (1994)",Action|Adventure|Crime|Drama|Romance|Thriller
8597,8597,Dragonheart 2: A New Beginning (2000),Action|Adventure|Comedy|Drama|Fantasy|Thriller
5161,5161,"Day After Tomorrow, The (2004)",Action|Adventure|Drama|Sci-Fi|Thriller
5665,5665,"Sound of Thunder, A (2005)",Action|Adventure|Drama|Sci-Fi|Thriller
8590,8590,Jurassic World (2015),Action|Adventure|Drama|Sci-Fi|Thriller
7767,7767,The Hunger Games (2012),Action|Adventure|Drama|Sci-Fi|Thriller
