In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

<h1> Part I: Data Processing

<h4> Orginal data

In [178]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [179]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [180]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [181]:
# Orginal feature among all movie
genres_set = set()
for item in movies_df['genres']:
    for genres in item.split('|'):
        genres_set.add(genres)
genres_set

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

<h4> Since if the movie don't have a genres, our model will not be working. So we have to remove those movie which don't have a genre.

In [182]:
no_genre_index = movies_df[movies_df['genres']=='(no genres listed)']["movieId"].index
no_genre_movieId = np.array(movies_df[movies_df['genres']=='(no genres listed)']["movieId"])
len(no_genre_index)

34

In [183]:
ratings_df.drop(ratings_df[ratings_df['movieId'].isin(no_genre_movieId)].index, inplace=True)

In [184]:
movies_df.drop(index=no_genre_index, inplace=True)

<h4> Removed 34 movies that don't have a genre. Let's see the genres_set now 

In [185]:
genres_set = set()
for item in movies_df['genres']:
    for genres in item.split('|'):
        genres_set.add(genres)
genres_set

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [186]:
movies_df['movieId'].nunique()

9708

In [187]:
ratings_df['movieId'].nunique()

9690

<h4> As we can see the number of movieId is not consistence between two dataframe. This happen because some movies don't have any rating, so we have to add those missing movies back into the ratings_df

In [188]:
missing_movies_index = (movies_df['movieId'][movies_df['movieId'].isin(movies_df[movies_df['movieId']
                      .isin(ratings_df['movieId'])]['movieId'])==False])

In [189]:
for i in missing_movies_index:
    new_row = pd.Series({'userId':1, 'movieId':i, 'rating':-1})
    ratings_df = pd.concat([ratings_df, new_row.to_frame().T], ignore_index=True)
ratings_df
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

In [190]:
ratings_df['movieId'].nunique()

9708

<h4> The number of movies in ratings_df is now same as movies_df.

In [191]:
rating_movie_table = pd.pivot_table(data=ratings_df, values='rating', columns='userId', index='movieId')
rating_movie_table[rating_movie_table<0] = np.nan
rating_movie_table

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


<h4> Afterward, we have to create a feature matrix for our model 

In [192]:
dict = {}
for id in movies_df['movieId']:
    dict[id] = {}
    for genre in genres_set:
        dict[id][genre] = 0
for index, row in movies_df.iterrows():
    for genre in row['genres'].split('|'):
        dict[row['movieId']][genre] = 1

In [193]:
featrue_df = pd.DataFrame.from_dict(dict)
featrue_df 

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
Fantasy,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
Musical,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Romance,0,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Action,0,0,0,0,0,1,0,0,1,1,...,1,0,0,0,0,1,0,0,1,0
Adventure,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
War,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Western,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mystery,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sci-Fi,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Thriller,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


<h4> Since the number of feature of movies is different in different movies, it would be better if we normlize the feature in each movie  

In [220]:
featrue_df = featrue_df/featrue_df.sum()
featrue_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
Fantasy,0.2,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.333333,0.0,0.0,0.0
Musical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Romance,0.0,0.0,0.5,0.333333,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Action,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,1.0,0.333333,...,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.5,0.0
Adventure,0.2,0.333333,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
War,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Western,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mystery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sci-Fi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Thriller,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h1> Part II: Modeling (Linear)

In [195]:
# train test split
train_index, test_index = train_test_split(featrue_df.columns, test_size=.2, train_size=.8, random_state=101)

# training set 
feature_matrix_train = featrue_df[train_index].to_numpy().T
feature_matrix_train = np.hstack((np.ones((len(feature_matrix_train), 1), dtype=int), feature_matrix_train))
rating_matrix_train = rating_movie_table.loc[train_index].to_numpy()

# testing set
feature_matrix_test = featrue_df[test_index].to_numpy().T
feature_matrix_test  = np.hstack((np.ones((len(feature_matrix_test), 1), dtype=int), feature_matrix_test))
rating_matrix_test = rating_movie_table.loc[test_index].to_numpy()

In [196]:
# Train the weighting vector for user 
theta_list = np.array([])
f = feature_matrix_train

for j in range(rating_matrix_train.shape[1]):
    y = rating_matrix_train.T[j].reshape((-1,1))
    theta = np.zeros((f.shape[1],1))
    lamb = 0

    # Gradient descent
    # Step size
    alpha = .001

    for i in range(500):
        theta = theta - alpha*((np.nan_to_num(f@theta-y).T@f).T+lamb*theta)

    theta_list = np.append(theta_list, theta)

theta_list = theta_list.reshape(rating_matrix_test.shape[1],-1)

In [215]:
# ith row is the parameter of user i
theta_list

array([[ 4.07007788, -0.11541042,  0.45833903, ...,  0.05251258,
         0.63571451,  0.        ],
       [ 3.4724376 ,  0.        ,  0.        , ...,  0.57667338,
         0.        , -0.02044903],
       [ 2.38330619,  0.5970223 , -0.16116558, ..., -0.41977739,
        -0.14807892,  0.        ],
       ...,
       [ 3.11019451, -0.19014485, -0.05777242, ..., -0.79952817,
         1.0036285 ,  0.87731913],
       [ 2.94919227,  0.01299759,  0.        , ...,  0.24450514,
         0.01299759, -0.00976934],
       [ 3.64049794, -0.48701151,  0.48990455, ...,  0.06383135,
         0.86406556,  0.00722028]])

In [83]:
prediction = feature_matrix_test@theta_list.T
prediction.shape

(1942, 610)

<h4> Since the range of prediction is $\mathbb{R}$, and the real rating is between 0.5 to 5, we treat the value that greater than 5 be 5 and less than 0.5 be 0.5

In [84]:
prediction[prediction>5] = 5
prediction[prediction<0.5] = 0.5

In [85]:
n = (rating_movie_table.loc[test_index].shape[0]*rating_movie_table.loc[test_index].shape[1]
    -rating_movie_table.loc[test_index].isna().sum().sum())

<h4> Here are the result

In [86]:
# MSE 
np.sum(np.nan_to_num(prediction-rating_matrix_test)**2)/n

1.1536143190603205

In [87]:
# RMSE 
np.sqrt(np.sum(np.nan_to_num(prediction-rating_matrix_test)**2)/n)

1.074064392418034

In [88]:
# MAE
np.sum(np.nan_to_num(np.abs(prediction-rating_matrix_test)))/n

0.7934694473644097

<h2> Demo Part 

In [260]:
def recommend_top_k(user_id, k):
    
    # Take the parameter of user with user_id and create the feature vector 
    w = theta_list[user_id-1,:]
    feature_matrix = featrue_df.to_numpy().T
    feature_matrix = np.hstack((np.ones((len(feature_matrix), 1), dtype=int), feature_matrix))
    
    # Prediction 
    prediction = feature_matrix@w
    prediction[prediction>5] = 5
    prediction[prediction<0.5] = 0.5
    
    # Sort the prediction in dscending order 
    top_k_movie_id = pd.Series(prediction, index=featrue_df.columns).sort_values(ascending=False).head(k)
    
    print(f'The top {k} movie(s) recommend to user with user id = {user_id}\n')
    for i in top_k_movie_id.index:
        print(movies_df[movies_df['movieId'] == i]['title'])
        print('\n')

In [261]:
recommend_top_k(52, 10)

The top 10 movie(s) recommend to user with user id = 52

2839    In Crowd, The (2000)
Name: title, dtype: object


8880    Return to Sender (2015)
Name: title, dtype: object


161    Safe (1995)
Name: title, dtype: object


473    Sliver (1993)
Name: title, dtype: object


3489    Glass House, The (2001)
Name: title, dtype: object


1289    Stranger in the House (1997)
Name: title, dtype: object


2032    Arlington Road (1999)
Name: title, dtype: object


2143    I Saw What You Did (1965)
Name: title, dtype: object


9479    American Fable (2017)
Name: title, dtype: object


3950    Swimfan (2002)
Name: title, dtype: object


