In [4]:
import pandas as pd
import numpy as np
import pickle

In [5]:
df = pd.read_csv("./data/meta_final.csv")

In [6]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords,director,soup
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","['jealousy', 'toy', 'boy']",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","['boardgame', 'disappearance', ""basedonchildre...",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"['waltermatthau', 'jacklemmon', 'ann-margret']","[{'credit_id': '52fe466a9251416c75077a89', 'de...","['fishing', 'bestfriend', 'duringcreditsstinger']",howarddeutch,fishing bestfriend duringcreditsstinger walter...
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"['whitneyhouston', 'angelabassett', 'lorettade...","[{'credit_id': '52fe44779251416c91011acb', 'de...","['basedonnovel', 'interracialrelationship', 's...",forestwhitaker,basedonnovel interracialrelationship singlemot...
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862,"['stevemartin', 'dianekeaton', 'martinshort']","[{'credit_id': '52fe44959251416c75039ed7', 'de...","['baby', 'midlifecrisis', 'confidence']",charlesshyer,baby midlifecrisis confidence stevemartin dian...


In [7]:
# Keep only titles whose runtime is between 45 and 300 minutes (inclusive)
# and that have received at least 50 votes.
df_small = df.loc[
    (df['runtime'] >= 45)
    & (df['runtime'] <= 300)
    & (df['vote_count'] >= 50)
]

In [8]:
df_small.shape

(9105, 13)

In [9]:
# Renumber the surviving rows 0..n-1 so that positional lookups line up with the labels.
df_small = df_small.reset_index(drop=True)

In [10]:
# CountVectorizer builds bag-of-words term-count vectors from text.
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary on the 'soup' column (one space-separated metadata string
# per movie) and produce one count vector per movie; common English stop words
# are ignored. `count_matrix` has one row per row of df_small.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_small['soup'])

In [11]:
# Reverse lookup: movie title -> row label of df_small (used to index cosine_sim).
# NOTE(review): titles are not guaranteed to be unique; for a duplicated title
# indices[title] yields a Series of several positions rather than one integer.
indices = pd.Series(df_small.index, index=df_small['title'])

In [12]:
#Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the count vectors of every pair of movies.
# With the second argument omitted, the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [13]:
cosine_sim.shape

(9105, 9105)

In [16]:
#Build the SVD based Collaborative filter
from surprise import SVD, Reader, Dataset

# The ratings in this file run from 0.5 to 5.0, whereas Reader() defaults to a
# 1-5 scale; declare the real scale so predictions are clipped to the right range.
reader = Reader(rating_scale=(0.5, 5.0))
ratings = pd.read_csv('./data/ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# The model is trained on every available rating, so no cross-validation folds
# are created here (the former data.split(n_folds=5) call was unused and the
# method is no longer available in current Surprise releases).
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22ba6c082e8>

In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [18]:
svd.predict(1, 318).est

3.5584847491759173

In [19]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [20]:
def hybrid(userId, title, top_n=10, n_candidates=25):
    """Recommend movies similar to ``title``, ranked by the user's predicted rating.

    The ``n_candidates`` movies most similar to ``title`` according to
    ``cosine_sim`` are scored with ``svd.predict`` for ``userId``; the
    ``top_n`` highest predicted ratings are returned.

    Parameters
    ----------
    userId
        User identifier forwarded to ``svd.predict``.
    title : str
        Movie title; must be present in the index of ``indices``.
    top_n : int, default 10
        Number of recommendations to return.
    n_candidates : int, default 25
        Number of most-similar movies to score before ranking.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``id`` and ``est`` (predicted rating), sorted by
        ``est`` in decreasing order, at most ``top_n`` rows.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    #Extract the cosine_sim index of the movie
    idx = indices[title]

    #Several movies can share a title, in which case the lookup yields a Series;
    #fall back to the first match so that idx is always a single row position
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    #Pair every other movie with its similarity score. The query movie is
    #excluded explicitly: when another movie ties with it at the top score,
    #"drop the first sorted entry" could discard the wrong row.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    #Sort the (index, score) tuples in decreasing order of similarity scores
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    #Store the cosine_sim indices of the most similar movies in a list
    movie_indices = [i for i, _ in sim_scores[:n_candidates]]

    #Extract the metadata of the aforementioned movies (copy, so that adding a
    #column below does not write into a slice of df_small)
    movies = df_small.iloc[movie_indices][['title', 'id']].copy()

    #Compute the predicted ratings using the SVD filter
    #NOTE(review): the SVD model is trained on the ratings file's 'movieId' while
    #the metadata 'id' is passed here - confirm both columns share one id space.
    movies['est'] = [svd.predict(userId, movie_id).est for movie_id in movies['id']]

    #Return the top_n movies, in decreasing order of predicted rating
    return movies.sort_values('est', ascending=False).head(top_n)



In [21]:
hybrid(1, 'The Terminator')

Unnamed: 0,title,id,est
5143,Terminator Salvation,534,3.351536
2938,Terminator 3: Rise of the Machines,296,3.297817
4500,Sunshine,1272,3.267622
2454,Impostor,4965,3.134484
318,Terminator 2: Judgment Day,280,3.111394
3079,The Matrix Revolutions,605,2.814478
904,Armageddon,95,2.758907
3480,"I, Robot",2048,2.742914
575,Aliens,679,2.721447
1845,The Running Man,865,2.72092


In [22]:
hybrid(2, 'The Terminator')

Unnamed: 0,title,id,est
2938,Terminator 3: Rise of the Machines,296,3.978498
5143,Terminator Salvation,534,3.977032
318,Terminator 2: Judgment Day,280,3.966424
4500,Sunshine,1272,3.940828
2454,Impostor,4965,3.724153
5088,Far Cry,7916,3.654515
3480,"I, Robot",2048,3.582317
1310,The Matrix,603,3.533383
6097,Arena,71254,3.532521
1845,The Running Man,865,3.51957


In [54]:
# save indices to file
# header=False pins the header-less layout this cell has been producing; without
# an explicit value the output depends on the pandas version's default (the
# truncated warning captured under this cell is consistent with that notice).
# NOTE(review): index=False drops the titles, so the file contains only the row
# positions - confirm the consumer does not need the title -> position mapping.
indices.to_csv('./models/indices.csv', index=False, header=False)

  """Entry point for launching an IPython kernel.


In [61]:
# Write the dense similarity matrix as comma-separated text.
# NOTE(review): cosine_sim is 9105 x 9105, so this text dump is very large and slow
# to reload; a binary format would be more compact if the consumer can be changed.
np.savetxt("./models/cosine.csv", cosine_sim, delimiter=",")

In [24]:
# Serialize the fitted SVD model so that it can be reloaded without retraining.
# NOTE(review): unlike the other artifacts, this is written to the working
# directory rather than ./models/ - confirm that this is intended.
with open("svd.pkl", "wb") as model_file:
    pickle.dump(svd, model_file)