In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, SVD


import warnings; warnings.simplefilter('ignore')

The dataset consists of the following files:

movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.

keywords.csv: Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.

credits.csv: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.

links.csv: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.

links_small.csv: Contains the TMDB and IMDB IDs of a small subset of 9,000 movies of the Full Dataset.

ratings_small.csv: The subset of 100,000 ratings from 700 users on 9,000 movies.

#### let's start with the main file: movies_metadata.csv

In [2]:
# read our dataset and save it in pandas dataframe object.
orig_md = pd.read_csv('data/recmmendation_system/movies_metadata.csv')

Let's take a look at our dataset.

In [3]:
orig_md.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


it seems that some of the features has json values connected to other files. we will come back to this later. \
let's check our columns.

In [4]:
orig_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

let's make some data cleaning.

In [5]:
# let's delete all rows with non integer id.
tmp = len(orig_md[~orig_md["id"].str.match("^\d+$")])
print(f'The number of deleted rows with non integer id is:{tmp}')
orig_md = orig_md[orig_md['id'].str.match('^\d+$')]

The number of deleted rows with non integer id is:3


let's take a look at genres feature.

In [6]:
for i in orig_md['genres'][:2]:
    print(i)

[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]


let's replace each element with the value of the name key.

In [7]:
# fill null values with empty list
# evaluete the string so we can process it as list
# replace each element by the name 
orig_md['genres']= orig_md['genres'].fillna('[]').apply(literal_eval) \
                                    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else []) 

In [8]:
for i in orig_md['genres'][:2]:
    print(i)

['Animation', 'Comedy', 'Family']
['Adventure', 'Fantasy', 'Family']


In [9]:
supported_genre= list(set([genre for movie_genres in orig_md['genres'].values.tolist() for genre in movie_genres]))
print(supported_genre)

['Drama', 'Western', 'History', 'Romance', 'Horror', 'Family', 'Documentary', 'Fantasy', 'TV Movie', 'War', 'Comedy', 'Action', 'Adventure', 'Mystery', 'Music', 'Science Fiction', 'Foreign', 'Thriller', 'Crime', 'Animation']


now it's easier to deal with.

## Base Recommendation System
This System will recommend movies based on their popularity and genre (if user specified any). The basic rule is that movies that are popular are more have more (high) probability of being liked by the average user.


In [10]:
md = orig_md.copy()

$$ Weighted Rating (WR) = (\frac{v}{v+m} * R)+(\frac{m}{v+m} * C) $$

where:
- v is the number of votes for the movie
- m is the minimum votes required to be listed in the chart
- R is the average rating of the movie
- C is the mean vote across the whole report


In [11]:
def weighted_rating(x, m, C):
    """Calculating 𝑊𝑒𝑖𝑔ℎ𝑡𝑒𝑑𝑅𝑎𝑡𝑖𝑛𝑔 of a movie.

    Calculating 𝑊𝑒𝑖𝑔ℎ𝑡𝑒𝑑𝑅𝑎𝑡𝑖𝑛𝑔(𝑊𝑅) of a movie using the formula: 
            𝑊𝑅 =(𝑣 / (𝑣+𝑚) ∗ 𝑅) + (𝑚 / (𝑣+𝑚) ∗𝐶)

    Args:
        x (record): the feature of the movie.
        m (float) : is the minimum votes required to be listed in the chart
        C (flaot) : is the mean vote across the whole report

    Returns:
        𝑊𝑅 (flaot)
    """
    v = x['vote_count'] # the number of votes for the movie
    R = x['vote_average'] # the average rating of the movie
    return (v/(v+m) * R) + (m/(m+v) * C)

In [12]:
def get_high_score_movies(genre, percentile=0.95):
    """get a list of recommendation.

    this movie will return a list of movies depending on thier popularity and their genres.
    first will get the movies that have the genres that our user requested. 

    Args:
        x (record): the feature of the movie.
        m (float) : is the minimum votes required to be listed in the chart
        C (flaot) : is the mean vote across the whole report

    Returns:
        𝑊𝑅 (flaot)
    """
    df = md[[set(genre).issubset(md['genres'].iloc[i])  for i in range(len(md['genres']))]]
    
    if len(df) == 0:
        print('We did not find any movie')
        return pd.DataFrame()
    
    # calculate the mean vote across the whole report. (ignore the null values)
    C =  df[~df['vote_average'].isna()]['vote_average'].mean()

    # calculate the minimum votes required to be cinsidered in the results. (ignore the null values)
    # here we use quantile to get for example the top 5% popular movies.
    m = df[~df['vote_count'].isna()]['vote_count'].quantile(percentile)
    
    considered = df[(df['vote_count'] >= m) # have votes more than m
                    & (~ df['vote_count'].isna()) # asd 
                    & (~ df['vote_average'].isna())]
    if len(considered) == 0:
        print("There is no movies with satisfy your input")
        return pd.DataFrame()
    
    # get just there attributes to show.
    considered = considered[['title', 'release_date', 'genres', 'vote_count', 'vote_average', 'popularity']]
    
    # calculate weighted_rating for each record depening on the above function.
    considered['weighted rating'] = considered.apply(weighted_rating, args=(m, C), axis=1)
    
    # sort our values descending and get top 100
    considered = considered.sort_values('weighted rating', ascending=False).head(100)
    return considered

In [13]:
print(f'please chose one of these supported genres: {supported_genre}')

please chose one of these supported genres: ['Drama', 'Western', 'History', 'Romance', 'Horror', 'Family', 'Documentary', 'Fantasy', 'TV Movie', 'War', 'Comedy', 'Action', 'Adventure', 'Mystery', 'Music', 'Science Fiction', 'Foreign', 'Thriller', 'Crime', 'Animation']


In [14]:
get_high_score_movies(['TV Movie', 'Family'], percentile=0.95).head(3)

Unnamed: 0,title,release_date,genres,vote_count,vote_average,popularity,weighted rating
45258,Descendants 2,2017-07-21,"[TV Movie, Family, Action, Comedy, Music, Adve...",171.0,7.5,15.842073,6.546735
11034,High School Musical,2006-01-20,"[Comedy, Drama, Family, Music, TV Movie]",1048.0,6.1,10.187478,6.023472
13090,Camp Rock,2008-06-20,"[Comedy, Drama, Family, Music, TV Movie]",432.0,6.0,5.840379,5.870415


In [15]:
get_high_score_movies([], percentile=0.95).head(3)

Unnamed: 0,title,release_date,genres,vote_count,vote_average,popularity,weighted rating
314,The Shawshank Redemption,1994-09-23,"[Drama, Crime]",8358.0,8.5,51.645403,8.357746
834,The Godfather,1972-03-14,"[Drama, Crime]",6024.0,8.5,41.109264,8.306334
12481,The Dark Knight,2008-07-16,"[Drama, Action, Crime, Thriller]",12269.0,8.3,123.167259,8.208376


# Content Based Recommender
First we will use the small dataset so we don't waste much resources.

In [16]:
links_small = pd.read_csv('data/recmmendation_system/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

read keywords and credits datasets: \

keywords.csv: Contains the movie plot (the narrative sequence of events that determine the outcome of the characters) keywords for our MovieLens movies. Available in the form of a stringified JSON Object.

credits.csv: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.

In [17]:
credits = pd.read_csv('data/recmmendation_system/credits.csv')
keywords = pd.read_csv('data/recmmendation_system/keywords.csv')

we need to understand what are the credits and keywords and their architecture. \

starting from credits

In [18]:
credits[:2]

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


let's check what are the keys in cast and crew fields

In [19]:
rec  = credits.iloc[0]
cast = literal_eval(rec['cast'])
print('cast attribute are:', list(cast[0].keys()))

crew = literal_eval(rec['crew'])
print('crew attribute are:', list(crew[0].keys()))

cast attribute are: ['cast_id', 'character', 'credit_id', 'gender', 'id', 'name', 'order', 'profile_path']
crew attribute are: ['credit_id', 'department', 'gender', 'id', 'job', 'name', 'profile_path']


let's dig deeper a little bit and print the cast and crew members.

In [20]:
rec  = credits.iloc[0]
print("Cast members are:")
cast = literal_eval(rec['cast'])
for c in cast[:3]:
    print({"name" : c['name'], "character": c['character']})

print("\nCrew Members are:")
crew = literal_eval(rec['crew'])
for c in crew[:3]:
    print({"name" : c['name'], "job": c['job']})

Cast members are:
{'name': 'Tom Hanks', 'character': 'Woody (voice)'}
{'name': 'Tim Allen', 'character': 'Buzz Lightyear (voice)'}
{'name': 'Don Rickles', 'character': 'Mr. Potato Head (voice)'}

Crew Members are:
{'name': 'John Lasseter', 'job': 'Director'}
{'name': 'Joss Whedon', 'job': 'Screenplay'}
{'name': 'Andrew Stanton', 'job': 'Screenplay'}


now we need to understand keywords dataset:

In [21]:
keywords[:2]

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [22]:
# printing the keywords of the first record.
rec  = keywords.iloc[0]
rec_keywords = literal_eval(rec['keywords'])
print(rec_keywords)

[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]


so the keywords of each record are a list of dicts. later we need to process that and keep just the name of the keywords.\ 


now let's starting working on merging credits and keywords with out main dataset, then we will do some data preprocessing, and finally build a content base recommendation system.

In [23]:
md = orig_md.copy()

# get movies that exist in the small dataset.
md = md[md['id'].astype('int').isin(links_small)]

In [24]:
# cast the id of our datasets to int.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [25]:
# Merge our data set with both credits and keywords based of id.
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [26]:
print(md.columns.values)

['adult' 'belongs_to_collection' 'budget' 'genres' 'homepage' 'id'
 'imdb_id' 'original_language' 'original_title' 'overview' 'popularity'
 'poster_path' 'production_companies' 'production_countries'
 'release_date' 'revenue' 'runtime' 'spoken_languages' 'status' 'tagline'
 'title' 'video' 'vote_average' 'vote_count' 'cast' 'crew' 'keywords']


we can see that crew and cast of each movie has been added to the main dataset. now let's apply some cleaning.

In [27]:
# apply literal_eval to cast and crew to evaluate string to list of dicts.
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)

# # save the cast and crew size
# md['cast_size'] = md['cast'].apply(lambda x: len(x))
# md['crew_size'] = md['crew'].apply(lambda x: len(x))

In [28]:
def get_director(x):
    """get the name of the director.

    take a movie record from our data set, and return 
    the name of the director if it's exits else return null.

    Args:
        x (record): the feature of the movie.

    Returns:
        director name (string) or null.
    """
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [29]:
def get_producer(x):
    """get the name of the producer.

    take a movie record from our data set, and return 
    the name of the producer if it's exits else return null.

    Args:
        x (record): the feature of the movie.

    Returns:
        producer name (string) or null.
    """
    for i in x:
        if i['job'] == 'Producer':
            return i['name']
    return np.nan

In [30]:
# add two new features: the name of the director and the name of the producer
md['director'] = md['crew'].apply(get_director)
md['producer'] = md['crew'].apply(get_producer)

In [31]:
# get the name of each one of the cast.
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# keep just 5 of the whold cast.
md['cast'] = md['cast'].apply(lambda x: x[:5] if len(x) >=5 else x)

# put underscore between first and last name to make the model not confuse Kate Hudson & Katy Perry.
md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "_")) for i in x])

In [32]:
# get the name of each keyword without the id.
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# get all keywords and stack them in a pandas series object.
keywords_series = md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)

# calculate the count of each different value in s.
keywords_series = keywords_series.value_counts()

# keep keywords that occurred more than 1. 
keywords_series = keywords_series[keywords_series > 1]

# define an English Stemmer.
stemmer = SnowballStemmer('english')

In [33]:
def filter_keywords(x): #keywords_series
    """filter keywords and keep whose in keywords_series.

    this function iterate through the keywords of x and keep just keywords that exist in keywords_series. 
    save the result in words list.

    Args:
        x (record): the keywords of a movie.
        s (pandas.core.series.Series): has our list of keywords.

    Returns:
        words (list): the list of keywords that we kept.
    """
    words = []
    # for word in x
    for word in x:
        if word in keywords_series:
            words.append(word)
    return words

In [34]:
# filter words and just the one in s.
md['keywords'] = md['keywords'].apply(filter_keywords)

# get the stem of each word of the keywords
md['keywords'] = md['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

# convert keyword to lowecase and replace space with underscore.
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "_")) for i in x])

In [35]:
# replace the space with _
md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "_")))

# repaeat 3 times to increase the importance of the Director
md['director'] = md['director'].apply(lambda x: [x,x,x])

# replace the space with _
md['producer'] = md['producer'].astype('str').apply(lambda x: str.lower(x.replace(" ", "_")))

# repaeat 3 times to increase the importance of the producer
md['producer'] = md['producer'].apply(lambda x: [x,x])

In [36]:
# create now our content from the keywords 
# and the name of the cast and the name of director, producer and the genres. 
md['content'] = md['keywords'] + md['cast'] + md['director'] + md['producer'] + md['genres']
md['content'] = md['content'].apply(lambda x: ' '.join(x))

In [37]:
# calculate the count of each token in each content.
count = CountVectorizer(analyzer='word',ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(md['content'])

In [38]:
# calculate the cosine similarity between different contents. 
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [39]:
# reset the index to access records easily
md = md.reset_index()

In [40]:
# keep the title to get the index of the movie from its title.
titles = md['title']

# it's imoprtant to make the index of title is the index of our dataset.
indices = pd.Series(md.index, index=md['title'])

In [41]:
def get_content_base_recommendations(title):
    """get the recommendation based on the content of the movie.

    this function use the similarity score between our movie and other movies in our dataset,
    to get movies that are more similar to our movies, then it use the vote count and vote average to
    get red of ones with low vote count and low and to weight resulting movies to return them sorted.  

    Args:
        title (string): title of a movie to find similar ones to it.

    Returns:
        considered (pd.dataframe): dataframe has the similar movies to the input.
    """
        
    # get the index of our movie
    idx = indices[title]

    # build a list of tuple, each tuple has (index, sim) for our movie.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort according to similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get first 20 results.
    sim_scores = sim_scores[1:21]

    # get the indices of the similar movies. (the first element of the tuple)
    movie_indices = [i[0] for i in sim_scores]

    # get the movie record usign their indecis and keep just these cols.
    movies = md.iloc[movie_indices][['title', 'vote_count', 'vote_average']]

    # get vote_counts of the movies and get rid of movies that has no count.
    vote_counts = movies[~movies['vote_count'].isna()]['vote_count'].astype('int')

    # get average vote of the movies and get rid of movies that has no average vote.
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')

    # calculate the mean of the votes.
    average_vote = vote_averages.mean()

    # calculate the minimum_vote_count to get the top 40% movies.
    minimum_vote_count = vote_counts.quantile(0.60)

    # get movies with vote more that 
    considered = movies[(movies['vote_count'] >= minimum_vote_count) & 
                        (~ movies['vote_count'].isna()) & 
                        (~ movies['vote_average'].isna())]

    # order the movie according to weighted rating.

    # get the vote count and average of just the movie that we want to consider.
    considered['vote_count'] = considered['vote_count'].astype('int')
    considered['vote_average'] = considered['vote_average'].astype('int')

    # calculate weighted rating according to the previous formula.
    considered['weighted rating'] = considered.apply(weighted_rating, args=(minimum_vote_count, average_vote), axis=1)

    # sort the results descendingly.
    considered = considered.sort_values('weighted rating', ascending=False).head(10)
    
    return considered

In [42]:
get_content_base_recommendations('The Prestige')

Unnamed: 0,title,vote_count,vote_average,weighted rating
7648,Inception,14075,8,7.87335
6981,The Dark Knight,12269,8,7.856283
8613,Interstellar,11187,8,7.843661
3381,Memento,4168,8,7.636653
8031,The Dark Knight Rises,9263,7,6.923711
6218,Batman Begins,7511,7,6.908248
4145,Insomnia,1181,6,6.146889
8419,Man of Steel,6462,6,6.044753


# Collaborative Filtering Recommendation System.

The assumption of this approach is that people who have liked an item in the past will also like the same in future. This approach builds a model based on the past behaviour of users. The user behaviour may include previously watched videos, purchased items, given ratings on items. In this way, the model finds an association between the users and the items. The model is then used to predict the item or a rating for the item in which the user may be interested.

read the small rating dataset.

In [43]:
ratings = pd.read_csv('data/recmmendation_system/ratings_small.csv')

# let's find out what our dataset has.
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


The Singular Value Decomposition (SVD), a method from linear algebra that has been generally used as a dimensionality reduction technique in machine learning. SVD is a matrix factorisation technique, which reduces the number of features of a dataset by reducing the space dimension from N-dimension to K-dimension (where K<N).

The factorisation of this matrix is done by the singular value decomposition. It finds factors of matrices from the factorisation of a high-level (user-item-rating) matrix. The singular value decomposition is a method of decomposing a matrix into three other matrices as given below: 

$ A = U S V^T$


Where $A$ is a $m*n$ utility matrix, $U$ is a $m*r$ orthogonal left singular matrix, which represents the relationship between users and latent factors, $S$ is a $r*r$ diagonal matrix, which describes the strength of each latent factor and $V$ is a $r*n$ diagonal right singular matrix, which indicates the similarity between items and latent factors. The latent factors here are the characteristics of the items, for example, the genre of the movie. 

The SVD decreases the dimension of the utility matrix $A$ by extracting its latent factors. It maps each user and each item into a r-dimensional latent space. This mapping facilitates a clear representation of relationships between users and items.

it's easy to implement this algorithm, but I prefer to use surprise library as they can handle large datasets very well. Anyway, I will leave a snippet of the main code idea.

In [44]:
# # Create the rating matrix with rows as movies and columns as users and values are rating.
# movie_data = mini_data[['userId', 'movieId', 'rating']]
# ratings = np.ndarray( shape=(np.max(data.movieId.values), np.max(data.userId.values)), dtype=np.uint8)
# ratings[data.movieId.values-1, data.userId.values-1] = data.rating.values

# U, S, V = np.linalg.svd(ratings_mat)

In [45]:
# build a suprise dataset from our pandas dataframe.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], Reader())

In [46]:
# Use the famous SVD algorithm
algo = SVD()

test the model.

In [47]:
# Run 5-fold cross-validation and then print results
res = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8940  0.8913  0.9025  0.9044  0.8951  0.8975  0.0051  
MAE (testset)     0.6880  0.6885  0.6925  0.6964  0.6917  0.6914  0.0030  
Fit time          5.48    5.48    5.49    6.67    7.26    6.08    0.75    
Test time         0.27    0.16    0.26    0.25    0.15    0.22    0.05    


In [48]:
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4c58694820>

let's test the try the model on our dataset. 

In [49]:
user_id = 10

In [50]:
ratings[ratings['userId'] == user_id][:5]

Unnamed: 0,userId,movieId,rating,timestamp
744,10,50,5.0,942766420
745,10,152,4.0,942766793
746,10,318,4.0,942766515
747,10,344,3.0,942766603
748,10,345,4.0,942766603


In [51]:
movie_id = 152
print(f"The predicted rating is: {algo.predict(user_id, movie_id)[3] :2.2f}")

The predicted rating is: 3.77
