In [1]:
from functools import partial
import re

import numpy as np
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [2]:
def parse_date_in_title(title: str):
    date_pattern = "\(\d\d\d\d\)$"
    match = re.search(date_pattern, title)
    if match is None:
        return match
    
    date = match.group()[1:5]
    return date

In [3]:
def get_value_for_key(df, key, col_key, col_value):
    found_key = df.loc[df[col_key] == key, col_value].values
    if found_key.shape[0] != 1:
        return None
    
    return found_key[0]

In [4]:
df = pd.read_csv("../data/movies_small.csv")

In [5]:
# parse date
df['date'] = df['title'].apply(parse_date_in_title)

In [6]:
def match_world_in_list(list_to_check, world, separator="|"):
    list_to_check = list_to_check.split(separator)
    if world in list_to_check:
        return 1
    return 0

list_genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
for genre in list_genres:
    func_find_genre = partial(match_world_in_list, world=genre)
    df["is_" + genre] = df['genres'].apply(func_find_genre)

In [7]:
#sum(df.movieId.duplicated())
get_value_for_key(df, key=1, col_key='movieId', col_value='title')

'Toy Story (1995)'

In [8]:
df

Unnamed: 0,movieId,title,genres,date,is_Action,is_Adventure,is_Animation,is_Children's,is_Comedy,is_Crime,...,is_Fantasy,is_Film-Noir,is_Horror,is_Musical,is_Mystery,is_Romance,is_Sci-Fi,is_Thriller,is_War,is_Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,1995,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,2017,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,2017,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),Drama,2017,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,2018,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df_ratings = pd.read_csv("../data/ratings_small.csv")
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [10]:
df_ratings_by_movie = df_ratings[['movieId', 'userId']].groupby("movieId").agg({
    'userId': 'count'
})
df_ratings_by_movie.columns = ['nb_ratings']

In [11]:
df_ratings_by_movie

Unnamed: 0_level_0,nb_ratings
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [12]:
df_ratings_by_movie.describe()

Unnamed: 0,nb_ratings
count,9724.0
mean,10.369807
std,22.401005
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,329.0


In [13]:
df_ratings_by_movie.sum()

nb_ratings    100836
dtype: int64

In [14]:
df_ratings_by_user = df_ratings[['movieId', 'userId']].groupby("userId").agg({
    'movieId': 'count'
})
df_ratings_by_user.columns = ['nb_ratings']
df_ratings_by_user.describe()

Unnamed: 0,nb_ratings
count,610.0
mean,165.304918
std,269.480584
min,20.0
25%,35.0
50%,70.5
75%,168.0
max,2698.0


In [15]:
reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
# Run 5-fold cross-validation and then print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8778  0.8703  0.8764  0.8721  0.8749  0.8743  0.0027  
MAE (testset)     0.6732  0.6700  0.6737  0.6704  0.6698  0.6714  0.0017  
Fit time          3.47    3.60    6.24    6.33    6.18    5.17    1.33    
Test time         0.14    0.15    0.23    0.15    0.20    0.17    0.04    


{'test_rmse': array([0.87784051, 0.87033215, 0.87638992, 0.8721142 , 0.87490178]),
 'test_mae': array([0.67317278, 0.66997856, 0.67367874, 0.67035332, 0.6698174 ]),
 'fit_time': (3.470283269882202,
  3.603797435760498,
  6.23813009262085,
  6.334290027618408,
  6.183409690856934),
 'test_time': (0.13700032234191895,
  0.1498734951019287,
  0.23496794700622559,
  0.1505451202392578,
  0.19971966743469238)}

In [16]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff240319ca0>

In [17]:
df_ratings[df_ratings['userId'] == 610]

Unnamed: 0,userId,movieId,rating,timestamp
99534,610,1,5.0,1479542900
99535,610,6,5.0,1493850345
99536,610,16,4.5,1479542171
99537,610,32,4.5,1479543331
99538,610,47,5.0,1479545853
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [18]:
svd.predict(uid=5, iid=515, r_ui=3)

Prediction(uid=5, iid=515, r_ui=3, est=3.4342907186263876, details={'was_impossible': False})

In [19]:
get_value_for_key(df, key=515, col_key='movieId', col_value='title')

'Remains of the Day, The (1993)'