# Simple Recommenders

In [1]:
import pandas as pd

metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [2]:
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [3]:
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [4]:
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

(4555, 24)

In [5]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [6]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [7]:
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content-Based Recommender

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [9]:
tfidf.get_feature_names()[5000:5010]

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

In [10]:
from sklearn.metrics.pairwise import linear_kernel

# Trying to reduce features using TruncatedSVD -- Does not work
# from sklearn.decomposition import TruncatedSVD
# tsvd = TruncatedSVD(1000)
# datatr = tsvd.fit_transform(tfidf_matrix)
# cosine_sim = linear_kernel(datatr, datatr)

# Compute the cosine similarity matrix -- crashes the kernel
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [11]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [77]:
from time import time
from sklearn.decomposition import TruncatedSVD

def get_recommendations(title, sizem=10, reduce=0):
    # Get the index of the movie that matches the title
    idx = indices[title]
    cosine_sim = 0
    
    if reduce > 0 :
        starttsvd = time()
        tsvd = TruncatedSVD(reduce, random_state=42)
        datatr = tsvd.fit_transform(tfidf_matrix)
        endtsvd = time() - starttsvd
        print("TruncatedSVD time:", endtsvd)
        cosine_sim = linear_kernel(datatr, datatr[idx].reshape(1, -1))
    else:
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix[idx])
    sim_scores = list(enumerate(cosine_sim))


    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 'sizem' most similar movies
    sim_scores = sim_scores[1:sizem+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

Testing without PCA:

In [30]:
start = time()
res = get_recommendations('GoldenEye')
end = time() - start
print("time: ",  end)
res

for i,j in enumerate(res):
    print(i+1, j)


time:  0.29311037063598633
1 Licence to Kill
2 Dream Work
3 Live and Let Die
4 Octopussy
5 You Only Live Twice
6 Doctor X
7 Never Say Never Again
8 Johnny Stool Pigeon
9 Casino Royale
10 猛龍過江


Testing with PCA (TruncatedSVD set to 500):

In [17]:
start = time()
res = get_recommendations('GoldenEye', reduce=500)
end = time() - start
print("time: ",  end)
print(res)

TruncatedSVD time: 67.66810345649719
time:  68.3633680343628
2874                                       Licence to Kill
2875                                      Live and Let Die
7329                                   You Only Live Twice
7330                                             Octopussy
5658                                         Casino Royale
22657                                     Beyond Loch Ness
5798     Come Back to the 5 & Dime, Jimmy Dean, Jimmy Dean
7333                                 Never Say Never Again
7057                                 My Darling Clementine
12109    The Assassination of Jesse James by the Coward...
Name: title, dtype: object


In [78]:
def weighted_recommendations(film):
    filmscores = {}
    scores = []
    result = {}
    m = metadata['vote_count'].quantile(0.90)
    C = metadata['vote_average'].mean()

    for i, j in enumerate(get_recommendations(film, sizem=30)):
        x = metadata[metadata['original_title'] == j]
        v = x['vote_count']
        R = x['vote_average']
        score = (v/(v+m) * R) + (m/(m+v) * C)
        for v, w in enumerate(score):
            filmscores[j] = w
            scores.append(w)
    
    scores.sort()
    scores.reverse()
    for sc in scores:
        for key in filmscores:
            if filmscores[key] == sc:
                result[key] = sc
        if len(result) == 10:
            break
            
    return result

In [82]:
weighted_recommendations('You Only Live Twice')

{'Casino Royale': 7.234208595213073,
 'Dr. No': 6.715735089327467,
 'From Russia with Love': 6.6801855888761725,
 'The Spy Who Loved Me': 6.367278747291065,
 "On Her Majesty's Secret Service": 6.273899285931843,
 'Live and Let Die': 6.221304506316386,
 'Diamonds Are Forever': 6.148910186179321,
 'For Your Eyes Only': 6.133962183289908,
 'Octopussy': 6.065869098590014,
 'Capricorn One': 5.974534538848536}

In [83]:
get_recommendations('You Only Live Twice')

3859                Diamonds Are Forever
2875                    Live and Let Die
7330                           Octopussy
3511     On Her Majesty's Secret Service
7333               Never Say Never Again
2833                              Dr. No
5658                       Casino Royale
43970          To The Stars By Hard Ways
3513                The Spy Who Loved Me
3516                           Moonraker
Name: title, dtype: object

# Credits, Genres, and Keywords Based Recommender