### Familiarization

We followed the DataCamp recommender-systems tutorial for this assignment:  
https://www.datacamp.com/community/tutorials/recommender-systems-python

In [1]:
# Import Pandas
import pandas as pd

# Load Movies Metadata
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [2]:
# Calculate mean of vote average column
# C is the default `C` of weighted_rating(): the value a movie's score is
# pulled towards when it has few votes.
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [3]:
# Calculate the minimum number of votes required to be in the chart, m
# m is the 90th percentile of vote_count; it is also the default `m` of
# weighted_rating().
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [4]:
# Filter out all qualified movies into a new DataFrame
# Filter first, then copy: only the qualifying rows are duplicated, and
# q_movies is still an independent frame that can receive new columns
# without touching `metadata`.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape

(4555, 24)

In [5]:
# Shape of the unfiltered dataset, for comparison with q_movies.shape
metadata.shape

(45466, 24)

In [6]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating of ``x``.

    ``x`` is indexed with ``'vote_count'`` and ``'vote_average'``. The result
    blends the movie's own average with ``C``: the own average is weighted by
    ``v / (v + m)`` and ``C`` by ``m / (v + m)``, so a movie with many votes
    stays close to its own average while one with few votes moves towards
    ``C``.

    Parameters
    ----------
    x : mapping-like
        Object providing ``x['vote_count']`` and ``x['vote_average']``.
    m : number, optional
        Vote-count weight given to ``C``; defaults to the module-level ``m``
        captured when the function is defined.
    C : number, optional
        Rating the score is pulled towards; defaults to the module-level
        ``C`` captured when the function is defined.
    """
    votes = x['vote_count']
    average = x['vote_average']
    total = votes + m

    # Calculation based on the IMDB formula
    own_share = votes / total
    prior_share = m / total
    return (own_share * average) + (prior_share * C)

In [7]:
# Define a new feature 'score' and calculate its value with 'weighted_rating()'
# weighted_rating() only does arithmetic on the two vote columns, so it can be
# evaluated on the whole frame at once instead of row by row with apply().
q_movies['score'] = weighted_rating(q_movies)

In [8]:
# Sort movies based on score calculated above
q_movies = q_movies.sort_values('score',ascending=False)

# Print the top 20 movies
q_movies[['title','vote_count','vote_average','score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


Content-Based Recommender

In [9]:
# Print plot overviews of the first 5 movies
# (the overview text is what the TF-IDF matrix is later built from)
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
#Print plot overviews of the first 5 movies
# NOTE(review): this cell repeats the previous cell verbatim; one of the two
# can be removed.
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [11]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the','a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
# NOTE: this overwrites the 'overview' column of `metadata` in place, so any
# earlier display of that column no longer reflects the current values.
metadata['overview'] = metadata['overview'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
# (one row per overview in `metadata`, one column per vocabulary term)
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape


(45466, 75827)

In [12]:
# Array mapping from feature integer indices to feature name.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# list(...) keeps the displayed result a plain list on both code paths
list(feature_names[5000:5010])

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

**Step 1: simple recommender**

In [13]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

We commented out the code above because running it crashes the notebook: the full similarity matrix would have 45466 × 45466 entries (roughly 16 GB as dense float64 values), which needs too much memory. To avoid this problem, we compute the cosine similarity between the whole matrix and a single row at a time, starting with the first row of the matrix.

**Step 2: content-based recommender**

In [14]:
# Compute the cosine similarity of every movie with the first movie (row 0)
# of the TF-IDF matrix. The result is a single column of scores, not the full
# movie-by-movie similarity matrix.
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix[0])

In [15]:
# Compare the shape of the full TF-IDF matrix with that of a single row
print(tfidf_matrix.shape)
print(tfidf_matrix[0].shape)


(45466, 75827)
(1, 75827)


In [16]:
# Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title'])

# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed a repeated title. De-duplicate on the
# index labels instead, keeping the first movie for each title, so that
# indices[title] always yields a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [17]:
# First ten entries of the title -> row index map
indices.head(10)

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [18]:
# Function that takes in movie title as input and outputs most similar movies
import numpy as np
def get_recommendations(title, n_candidates=30, top_n=10, include_query=True):
    """Recommend well-voted movies whose overview resembles that of ``title``.

    The ``n_candidates`` movies with the highest TF-IDF cosine similarity to
    ``title`` are selected, those with fewer than ``m`` votes are dropped, and
    the remaining ones are ranked by ``weighted_rating``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``. If several movies share the
        title, the first match is used.
    n_candidates : int, default 30
        Number of most-similar movies kept before the popularity filter.
    top_n : int, default 10
        Maximum number of rows returned.
    include_query : bool, default True
        Whether the queried movie itself may appear among the candidates.
        ``True`` keeps the original behaviour; pass ``False`` to get only
        other movies.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``score``, sorted by descending score. It has
        fewer than ``top_n`` rows (possibly none) when few candidates reach
        the vote threshold.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A repeated title returns every match; a single row is required below.
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie.
    # cosine_similarity returns one column of scores; flatten it to 1-D.
    sim_scores = cosine_similarity(tfidf_matrix, tfidf_matrix[idx]).ravel()

    # Rank the movies from most to least similar. A stable sort on the negated
    # scores keeps ties in row order, exactly like sorted(..., reverse=True).
    ranked = np.argsort(-sim_scores, kind='stable')
    if not include_query:
        ranked = ranked[ranked != idx]

    # Get the indices of the most similar movies
    movie_indices = ranked[:n_candidates]

    # Apply the popularity filter. Copy only the few surviving rows so that a
    # column can be added without touching `metadata`.
    sim_movies = metadata.iloc[movie_indices]
    sim_movies2 = sim_movies.loc[sim_movies['vote_count'] >= m].copy()

    # weighted_rating() only does arithmetic on the vote columns, so it is
    # evaluated on the whole frame at once; this also yields an empty column
    # (rather than failing) when no candidate passed the filter.
    sim_movies2['score'] = weighted_rating(sim_movies2)

    # Return the best-scored similar movies
    return sim_movies2[['title','score']].sort_values(by='score', ascending=False).head(top_n)




In [19]:
# Popularity-filtered, content-based recommendations for a sample title
get_recommendations('The Dark Knight Rises')

Unnamed: 0,title,score
12481,The Dark Knight,8.265477
18252,The Dark Knight Rises,7.56635
10122,Batman Begins,7.46075
20232,"Batman: The Dark Knight Returns, Part 2",7.276985
19792,"Batman: The Dark Knight Returns, Part 1",7.115637
15511,Batman: Under the Red Hood,7.087743
41976,The Lego Batman Movie,7.045017
585,Batman,6.904084
3095,Batman: Mask of the Phantasm,6.645802
18035,Batman: Year One,6.528706


In [20]:
# Same recommender on a second sample title
get_recommendations('The Godfather')

Unnamed: 0,title,score
834,The Godfather,8.425439
1178,The Godfather: Part II,8.180076
1914,The Godfather: Part III,6.964444
41491,Live by Night,6.038391
23126,Blood Ties,5.817651


**Step 3 : Improvement**

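We use dimensionality reduction for this step. Rather than PCA itself, the code below uses truncated SVD, a closely related decomposition that can be applied directly to the sparse TF-IDF matrix.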

In [21]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix onto 100 components. The fixed random_state keeps
# the decomposition reproducible between runs.
svdT = TruncatedSVD(
    n_components=100,
    n_iter=15,
    random_state=42,
)
svdTFit = svdT.fit_transform(tfidf_matrix)

In [22]:
# One row per movie, one column per SVD component
svdTFit.shape

(45466, 100)

In [23]:
# Sum of the per-component explained variance ratios of the fitted SVD
explained_total = np.sum(svdT.explained_variance_ratio_)
print(f"Total variance explained: {explained_total:.2f}")

Total variance explained: 0.08


In [24]:
# A single reduced row reshaped to 2-D (1, n_components), the form in which
# the query row is passed to cosine_similarity in get_recommendations2
svdTFit[0].reshape(1,-1).shape

(1, 100)

In [25]:
# Function that takes in movie title as input and outputs most similar movies
import numpy as np
def get_recommendations2(title, top_n=10):
    """Return the titles of the movies most similar to ``title`` in SVD space.

    Similarity is the cosine similarity between the rows of ``svdTFit`` (the
    reduced TF-IDF matrix). The queried movie itself is never returned.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``. If several movies share the
        title, the first match is used.
    top_n : int, default 10
        Number of titles returned.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first,
        indexed by their row label in ``metadata``.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A repeated title returns every match; a single row is required below.
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie.
    # cosine_similarity returns one column of scores; flatten it to 1-D.
    sim_scores = cosine_similarity(svdTFit, svdTFit[idx].reshape(1, -1)).ravel()

    # Rank the movies from most to least similar. A stable sort on the negated
    # scores keeps ties in row order, exactly like sorted(..., reverse=True).
    ranked = np.argsort(-sim_scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it ranks first:
    # a movie with an identical overview and a lower row index ties with it
    # and sorts ahead, and a movie whose vector is all zeros ties with
    # every other movie.
    movie_indices = ranked[ranked != idx][:top_n]

    # Return the most similar movies
    return metadata['title'].iloc[movie_indices]




In [26]:
# Recommendations computed from the SVD-reduced vectors for a sample title
get_recommendations2('The Godfather')

44030    The Godfather Trilogy: 1972-1990
15716                        Urban Menace
30487      Bonnie and Clyde Italian Style
26293                  Beck 28 - Familjen
28161                  Counselor at Crime
18322                     The Outside Man
26176                    Samurai Vendetta
11489               Don't Drink the Water
22897                         Family tree
17170                               Wanda
Name: title, dtype: object