In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from surprise import Reader, Dataset, KNNBasic, BaselineOnly, SVD, SVDpp
from surprise.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

# Movies DF:

In [2]:
movies_df = pd.read_csv('movies.dat', sep = '::', names = ['movie_id', 'title', 'genre'], encoding = "ISO-8859-1")

movies_df.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [4]:
# extract released year from movie title
movies_df['year'] = movies_df.title.str.extract(r"((?<=\()\d{4})")

movies_df.sample(5)

Unnamed: 0,movie_id,title,genre,year
3704,3773,House Party (1990),Comedy,1990
1990,2059,"Parent Trap, The (1998)",Children's|Drama,1998
1229,1249,Nikita (La Femme Nikita) (1990),Thriller,1990
763,773,Touki Bouki (Journey of the Hyena) (1973),Drama,1973
348,352,Crooklyn (1994),Comedy,1994


In [5]:
# earliest and latest movies
movies_df.year.min(), movies_df.year.max()

('1919', '2000')

# Ratings DF:

In [6]:
ratings_df = pd.read_csv('ratings.dat', sep = '::', names = ['user_id', 'movie_id', 'rating', 'timestamp'])

ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# convert timestamp to date
ratings_df['date'] = pd.to_datetime(ratings_df.timestamp, unit = 's')
ratings_df.drop(columns = ['timestamp'], inplace = True)

ratings_df.sample(3)

Unnamed: 0,user_id,movie_id,rating,date
366089,2131,3264,4,2000-11-19 14:29:39
199795,1227,2071,4,2000-12-15 18:34:31
945085,5704,2840,3,2000-05-17 07:23:00


In [8]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype         
---  ------    --------------    -----         
 0   user_id   1000209 non-null  int64         
 1   movie_id  1000209 non-null  int64         
 2   rating    1000209 non-null  int64         
 3   date      1000209 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 30.5 MB


In [9]:
# earliest and latest rating
ratings_df.date.min(), ratings_df.date.max()

(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))

# Master DF:

In [10]:
# master dataframe with all movies and their corresponding ratings
master_df = movies_df.merge(ratings_df, on = 'movie_id')

master_df.tail()

Unnamed: 0,movie_id,title,genre,year,user_id,rating,date
1000204,3952,"Contender, The (2000)",Drama|Thriller,2000,5812,4,2001-06-09 07:34:59
1000205,3952,"Contender, The (2000)",Drama|Thriller,2000,5831,3,2001-04-02 14:52:05
1000206,3952,"Contender, The (2000)",Drama|Thriller,2000,5837,4,2002-01-24 20:04:16
1000207,3952,"Contender, The (2000)",Drama|Thriller,2000,5927,1,2001-01-18 21:15:37
1000208,3952,"Contender, The (2000)",Drama|Thriller,2000,5998,4,2001-09-29 16:30:44


In [11]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 7 columns):
 #   Column    Non-Null Count    Dtype         
---  ------    --------------    -----         
 0   movie_id  1000209 non-null  int64         
 1   title     1000209 non-null  object        
 2   genre     1000209 non-null  object        
 3   year      1000209 non-null  object        
 4   user_id   1000209 non-null  int64         
 5   rating    1000209 non-null  int64         
 6   date      1000209 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 61.0+ MB


In [12]:
# add two columns that holds number of ratings and average rating per movie

rating_count = master_df.groupby('movie_id')['rating'].count()
rating_average = master_df.groupby('movie_id')['rating'].mean()

master_df = master_df.merge(rating_count, on = 'movie_id').merge(rating_average, on = 'movie_id')

master_df.sample(5)

Unnamed: 0,movie_id,title,genre,year,user_id,rating_x,date,rating_y,rating
990586,3873,Cat Ballou (1965),Comedy|Western,1965,2225,2,2000-11-19 23:37:48,280,3.667857
245732,1019,"20,000 Leagues Under the Sea (1954)",Adventure|Children's|Fantasy|Sci-Fi,1954,602,5,2000-12-03 21:12:22,575,3.702609
786722,2926,Hairspray (1988),Comedy|Drama,1988,2411,4,2000-11-18 23:36:09,445,3.350562
799438,2985,Robocop (1987),Action|Crime|Sci-Fi,1987,606,2,2000-12-03 18:22:07,1229,3.451587
873787,3273,Scream 3 (2000),Horror|Mystery|Thriller,2000,2576,1,2000-11-10 23:13:31,577,2.733102


In [13]:
#final touches ✨
master_df.rename(columns = {'rating_x' : 'rating', 'rating_y' : 'rating_count', 'rating' : 'rating_average'}, inplace = True)

master_df = master_df[['movie_id', 'title', 'year', 'genre', 'rating_average', 'rating_count', 'user_id', 'rating', 'date']]

master_df.sample()

Unnamed: 0,movie_id,title,year,genre,rating_average,rating_count,user_id,rating,date
348948,1258,"Shining, The (1980)",1980,Horror,4.104876,1087,409,5,2000-12-08 15:50:22


# Rank RS:

> *Rank-Based is a simple recommender system that recommends most popular movies according to some criteria.*

In [14]:
rank_df = master_df.copy()

In [15]:
# mean rating of all movies
c = rank_df.rating_average.mean() # average rating of all movies
c

3.5815644530324406

In [16]:
# only consider top 20% rated movies  
m = rank_df.rating_count.quantile(0.8)
rank_df = rank_df[rank_df.rating_count >= m]

rank_df.shape

(200187, 9)

In [17]:
def rating_score(df, c = c, m = m):
    '''
    Calculating a weighted rating score that considers both number_of_votes and average_rating per movie
    
    Input: a movies dataframe df, a threshold for number of ratings q
    
    return: the weighted rating score for each movie
    '''
    v = df.rating_count # number of ratings per movie
    r = df.rating_average # average rating per movie
    
    return (v * r / (v + m)) + (m * c / (v + m)) # IMBD weighted rating formula

In [18]:
rank_df['score'] = rank_df.apply(rating_score, axis = 1)

rank_df.sample()

Unnamed: 0,movie_id,title,year,genre,rating_average,rating_count,user_id,rating,date,score
284249,1136,Monty Python and the Holy Grail (1974),1974,Comedy,4.33521,1599,323,4,2000-12-16 13:32:10,4.002626


In [19]:
def ranked_rs(df):
    '''
      Rank-based recommender system
      
      Input: movies dataframe df
      
      Return: Top 10 movies according to chosen criteria
      
      '''
    ranked = ['rating_count', 'rating_average', 'score']
    choice = int(input('press 0 for most popular movies, 1 for top rated, 2 for weighted rating: '))
    
    df = df.sort_values(ranked[choice], ascending = False).drop_duplicates(subset = 'title').head(10)
    
    return df

In [20]:
ranked_rs(rank_df)

press 0 for most popular movies, 1 for top rated, 2 for weighted rating: 2


Unnamed: 0,movie_id,title,year,genre,rating_average,rating_count,user_id,rating,date,score
86727,318,"Shawshank Redemption, The (1994)",1994,Drama,4.554558,2227,6040,4,2000-04-25 23:10:57,4.20244
68259,260,Star Wars: Episode IV - A New Hope (1977),1977,Action|Adventure|Fantasy|Sci-Fi,4.453694,2991,1685,4,2000-11-20 08:45:46,4.194762
208267,858,"Godfather, The (1972)",1972,Action|Crime|Drama,4.524966,2223,4172,5,2000-08-03 20:26:00,4.183166
143211,527,Schindler's List (1993),1993,Drama|War,4.510417,2304,3984,4,2000-08-07 01:36:49,4.18153
303847,1198,Raiders of the Lost Ark (1981),1981,Action|Adventure,4.477725,2514,5627,5,2000-05-22 23:58:52,4.178056
23495,50,"Usual Suspects, The (1995)",1995,Crime|Thriller,4.517106,1783,724,5,2000-11-29 18:43:33,4.129191
747799,2762,"Sixth Sense, The (1999)",1999,Thriller,4.406263,2459,5593,4,2000-05-25 03:35:20,4.126415
768371,2858,American Beauty (1999),1999,Comedy|Drama,4.317386,3428,3023,3,2000-10-02 07:26:30,4.119274
166183,593,"Silence of the Lambs, The (1991)",1991,Drama|Thriller,4.351823,2578,2009,5,2000-11-19 23:41:26,4.098546
555811,2028,Saving Private Ryan (1998),1998,Action|Drama|War,4.337354,2653,1938,3,2000-11-20 04:34:41,4.093594


# Knowladge based:

In [21]:
genres_df = master_df.copy()

In [22]:
#expand genres:

genres_df['genres'] = genres_df.genre.str.split('|')
genres_df.drop(columns = ['genre'], inplace = True)
genres_df = genres_df.explode("genres")

genres_df.head()

Unnamed: 0,movie_id,title,year,rating_average,rating_count,user_id,rating,date,genres
0,1,Toy Story (1995),1995,4.146846,2077,1,5,2001-01-06 23:37:48,Animation
0,1,Toy Story (1995),1995,4.146846,2077,1,5,2001-01-06 23:37:48,Children's
0,1,Toy Story (1995),1995,4.146846,2077,1,5,2001-01-06 23:37:48,Comedy
1,1,Toy Story (1995),1995,4.146846,2077,6,4,2000-12-31 04:30:08,Animation
1,1,Toy Story (1995),1995,4.146846,2077,6,4,2000-12-31 04:30:08,Children's


In [23]:
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2101815 entries, 0 to 1000208
Data columns (total 9 columns):
 #   Column          Dtype         
---  ------          -----         
 0   movie_id        int64         
 1   title           object        
 2   year            object        
 3   rating_average  float64       
 4   rating_count    int64         
 5   user_id         int64         
 6   rating          int64         
 7   date            datetime64[ns]
 8   genres          object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(3)
memory usage: 160.4+ MB


In [24]:
genres_df.year = genres_df.year.astype('int')

genres_df.year.dtype

dtype('int64')

In [25]:
def knowladge_rs(df):
    '''
    Knowledge-Based Recommender System

    Return: Top 10 movies (IMDB Formula) according to user's preferences
    
    '''
    genre = input('Enter genres space seprated: ').title().split()
    f_year = int(input('Enter earliest year: '))
    l_year = int(input('Enter latest year: '))
    
    # filter the datframe based on user's prefrences
    knowladge_df = df.query('genres == @genre & year >= @f_year & year <= @l_year')
    
    ck = knowladge_df.rating_average.mean()
    mk = knowladge_df.rating_count.quantile(0.8)
    knowladge_df = knowladge_df[knowladge_df.rating_count >= mk]
    
    #calculating weighted score for each movie
    knowladge_df['score'] = knowladge_df.apply(rating_score, args = (mk, ck), axis = 1)
    
    knowladge_df = knowladge_df.sort_values('score', ascending = False).drop_duplicates(subset = 'title').head(10)
    
    return knowladge_df

In [26]:
knowladge_rs(genres_df)

Enter genres space seprated: thriller
Enter earliest year: 1990
Enter latest year: 2000


Unnamed: 0,movie_id,title,year,rating_average,rating_count,user_id,rating,date,genres,score
187952,733,"Rock, The (1996)",1996,3.723134,1340,4353,4,2000-08-02 05:03:33,Thriller,7.167586
24944,50,"Usual Suspects, The (1995)",1995,4.517106,1783,5572,5,2000-05-26 00:42:26,Thriller,7.105877
454236,1610,"Hunt for Red October, The (1990)",1990,4.052058,1652,3985,4,2000-08-07 04:55:26,Thriller,6.846656
441144,1573,Face/Off (1997),1997,3.401126,1421,83,4,2000-12-25 04:07:53,Thriller,6.650496
122737,457,"Fugitive, The (1993)",1993,4.103258,1995,53,5,2000-12-28 03:08:17,Thriller,6.418125
724032,2699,Arachnophobia (1990),1990,3.002926,1367,2761,4,2000-11-02 06:52:48,Thriller,6.381337
106713,377,Speed (1994),1994,3.565455,1650,324,4,2000-12-10 00:05:02,Thriller,6.364452
747373,2762,"Sixth Sense, The (1999)",1999,4.406263,2459,4725,4,2000-07-12 01:03:51,Thriller,6.284515
456837,1617,L.A. Confidential (1997),1997,4.219406,2288,1727,5,2000-11-21 06:00:44,Thriller,6.238105
166455,593,"Silence of the Lambs, The (1991)",1991,4.351823,2578,2677,5,2000-11-05 03:51:16,Thriller,6.143565


# Content Based RS:

In [27]:
content_df = movies_df.copy()

In [28]:
content_df['decades'] = content_df.year.str.replace('\d$', '0')

content_df.sample(3)

Unnamed: 0,movie_id,title,genre,year,decades
1812,1881,Quest for Camelot (1998),Adventure|Animation|Children's|Fantasy,1998,1990
3159,3228,Wirey Spindell (1999),Comedy,1999,1990
3296,3365,"Searchers, The (1956)",Western,1956,1950


In [29]:
content_df.decades.nunique()

10

In [30]:
#create a soup with released decade and genres
content_df['soup'] = content_df.decades + '|' + content_df.genre

content_df.soup = content_df.soup.str.split('|').str.join(' ')

content_df.drop(columns = ['genre', 'year', 'decades'], inplace = True)

content_df.sample(3)

Unnamed: 0,movie_id,title,soup
119,121,"Boys of St. Vincent, The (1993)",1990 Drama
2990,3059,British Intelligence (1940),1940 Drama
2462,2531,Battle for the Planet of the Apes (1973),1970 Action Sci-Fi


In [31]:
#calculate similarities between movies
count_matrix = CountVectorizer().fit_transform(content_df['soup'])
cos_sim = cosine_similarity(count_matrix)

In [32]:
# construct a reverse mapping of indices and movie titles
indices = pd.Series(content_df.index, index=content_df['title']).drop_duplicates()

In [35]:
def content_RS(title, df, indices, sim = cos_sim):
    '''
    Content-Based Recommender System
    
    
    Return: top 10 most similar movies to the movie in interest
    '''
    title = title.title()
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # Get the scores of the 10 most similar movies.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [34]:
content_RS('Aladdin (1992)', content_df, indices)

1526                           Hercules (1997)
0                             Toy Story (1995)
360                      Lion King, The (1994)
547     Nightmare Before Christmas, The (1993)
591                Beauty and the Beast (1991)
626             All Dogs Go to Heaven 2 (1996)
655           James and the Giant Peach (1996)
773        Hunchback of Notre Dame, The (1996)
1050    Aladdin and the King of Thieves (1996)
1459                   Cats Don't Dance (1997)
Name: title, dtype: object

# User Based:

In [36]:
u_df = ratings_df.copy()
u_df.drop('date', axis = 1, inplace = True)

u_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [37]:
#split the data to train and test and make sure all users are in both dataframes
X = u_df.copy()
y = u_df['user_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

In [38]:
def rmse(rs_model):
    '''
    Return: root mean square error between the actual and predicted rating
    '''
    
    user_movie = zip(X_test['user_id'], X_test['movie_id'])
    
    y_true = np.array(X_test['rating'])
    y_pred = np.array([rs_model(user, movie) for (user, movie) in user_movie])

    
    return np.sqrt(mean_squared_error(y_true, y_pred)).round(3)

In [39]:
# create a user-movie rating matrix
ratings_matrix = X_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')

ratings_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [40]:
full_ratings = ratings_matrix.copy().fillna(0)

In [41]:
# calculate cosine similarity scores between users' ratings
cosine_sim = cosine_similarity(full_ratings)
sim_df = pd.DataFrame(cosine_sim, index = ratings_matrix.index, columns = ratings_matrix.index)

sim_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.10776,0.094128,0.045003,0.049,0.051284,0.0,0.075052,0.109756,0.183971,...,0.051259,0.048967,0.0,0.047553,0.068173,0.094513,0.076372,0.0,0.126885,0.103564
2,0.10776,1.0,0.082368,0.129317,0.072259,0.055591,0.2176,0.091786,0.157404,0.142573,...,0.072738,0.104999,0.140085,0.020244,0.119957,0.168602,0.158391,0.066138,0.06827,0.198302
3,0.094128,0.082368,1.0,0.058687,0.073171,0.059447,0.095596,0.046361,0.092577,0.170794,...,0.0138,0.017028,0.078618,0.0,0.039828,0.113156,0.056035,0.125033,0.050743,0.056792
4,0.045003,0.129317,0.058687,1.0,0.060382,0.0,0.069956,0.036942,0.043679,0.071279,...,0.138557,0.084669,0.191695,0.0,0.024483,0.120547,0.106744,0.039852,0.025315,0.096358
5,0.049,0.072259,0.073171,0.060382,1.0,0.032767,0.125457,0.213262,0.199934,0.08431,...,0.048598,0.022248,0.046222,0.01823,0.140546,0.222567,0.086867,0.006807,0.010809,0.190026


In [42]:
def user_rs(user_id, movie_id):
    '''
    User-Based Recommender System
    
    Return: predicted weighted mean rating a user would give to a movie
    '''
    # check if the movie exists in the ratings' matrix
    if movie_id in full_ratings:
        
        #get similar users to the user in question
        sim_users = sim_df[user_id]
        # get all ratings given to that movie
        rij = full_ratings[movie_id]
        
        # drop the nulls from movie rating matrix and corresponding cosine scores from users' similarity matrix
        idx = rij[rij.isna()].index
        rij = rij.dropna()
        sim_users = sim_users.drop(idx)
        
        pred_rating = np.dot(sim_users, rij)/ sim_users.sum()
        
    else:
        # default rating for new users/movies
        pred_rating = 3.0
        
    return pred_rating

In [43]:
rmse(user_rs)

3.252

# KNN:

In [44]:
data = Dataset.load_from_df(u_df, Reader())
knn = KNNBasic()

cross_validate(knn, data, measures=['RMSE'], verbose = True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9233  0.9230  0.9204  0.9236  0.9238  0.9228  0.0012  
Fit time          18.74   19.22   18.78   19.55   20.21   19.30   0.54    
Test time         116.36  307.01  116.55  118.81  116.05  154.96  76.03   


{'test_rmse': array([0.92327138, 0.92295404, 0.92043755, 0.92358852, 0.92381164]),
 'fit_time': (18.74090313911438,
  19.221827030181885,
  18.776645183563232,
  19.55140995979309,
  20.209928035736084),
 'test_time': (116.36449098587036,
  307.0106689929962,
  116.54613780975342,
  118.80823993682861,
  116.05457997322083)}

# Baseline:
> Algorithm predicting the baseline estimate for given user and item.

In [45]:
baseline = BaselineOnly()

cross_validate(baseline, data, measures=['RMSE'], verbose = True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9095  0.9083  0.9094  0.9063  0.9093  0.9086  0.0012  
Fit time          1.56    1.44    1.63    1.47    1.71    1.56    0.10    
Test time         1.45    1.31    1.61    1.35    1.35    1.41    0.11    


{'test_rmse': array([0.90953323, 0.90832289, 0.90942513, 0.90633322, 0.90933774]),
 'fit_time': (1.5596332550048828,
  1.4422521591186523,
  1.6278040409088135,
  1.4690301418304443,
  1.7100520133972168),
 'test_time': (1.4482052326202393,
  1.3082489967346191,
  1.6084768772125244,
  1.3458728790283203,
  1.351815938949585)}

# Funk SVD:
> The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize.

In [46]:
svd = SVD()

cross_validate(svd, data, measures=['RMSE'], verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8763  0.8741  0.8749  0.8749  0.8730  0.8746  0.0011  
Fit time          7.84    7.86    8.39    8.78    8.76    8.33    0.41    
Test time         3.01    3.27    3.34    3.42    3.00    3.21    0.17    


{'test_rmse': array([0.87629293, 0.87412451, 0.87491867, 0.8748974 , 0.87297311]),
 'fit_time': (7.8443262577056885,
  7.859412908554077,
  8.394261121749878,
  8.775669813156128,
  8.759827852249146),
 'test_time': (3.0055947303771973,
  3.2685129642486572,
  3.3389570713043213,
  3.417869806289673,
  2.9977097511291504)}

# SVD++:
> The SVD++ algorithm, an extension of SVD taking into account implicit ratings.

In [47]:
svdpp = SVDpp()

cross_validate(svdpp, data, measures=['RMSE'], verbose = True)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8657  0.8589  0.8616  0.8599  0.8599  0.8612  0.0024  
Fit time          264.60  262.71  264.49  261.88  265.47  263.83  1.32    
Test time         67.53   60.16   64.15   64.71   65.41   64.39   2.41    


{'test_rmse': array([0.86565073, 0.85885306, 0.86157663, 0.85987202, 0.85990188]),
 'fit_time': (264.6012978553772,
  262.7112431526184,
  264.49145770072937,
  261.88131308555603,
  265.4696159362793),
 'test_time': (67.52817225456238,
  60.15641498565674,
  64.15143489837646,
  64.70717716217041,
  65.41272497177124)}