In [33]:
import numpy as np
import pandas as pd
import surprise
from surprise import Reader, Dataset, SVD
import re

### Import Dataset

In [14]:
# Load the combined movie metadata table and preview its first rows.
movies = pd.read_csv(filepath_or_buffer='combined_metadata_table.csv')
movies.head(n=5)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,id,overview,popularity,revenue,tagline
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,12/26/06,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,6.1,537,,7.0,7.0,20105,Just as Fritz Lang’s Metropolis (1927) is test...,0.290549,0.0,"The Most Sensational, The Most Thrilling and I..."
1,tt0002130,L'Inferno,L'Inferno,1911,3/6/11,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,7.0,2019,,28.0,14.0,70512,Loosely adapted from Dante's Divine Comedy and...,0.801412,0.0,
2,tt0002101,Cleopatra,Cleopatra,1912,11/13/12,"Drama, History",100,USA,English,Charles L. Gaskill,...,5.2,420,,24.0,3.0,71266,The fabled queen of Egypt's affair with Roman ...,0.142542,0.0,
3,tt0002461,Richard III,Richard III,1912,10/15/12,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,5.5,211,,7.0,1.0,46758,Shakespeare's tragedy of the hump-backed Duke ...,0.05131,0.0,
4,tt0003471,Traffic in Souls,Traffic in Souls,1913,11/24/13,"Crime, Drama",88,USA,English,George Loane Tucker,...,6.1,527,,13.0,10.0,96128,"A woman, with the aid of her police officer sw...",0.113363,0.0,


In [34]:
# Read only the first 100000 rows of the ratings file to keep computation fast.
ratings = pd.read_csv(
    filepath_or_buffer='ratings.csv',
    nrows=100000,
)
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [57]:
# Attach movie metadata to each rating so the ratings are human-readable.
# NOTE(review): this joins ratings.movieId to movies.id -- confirm both columns
# use the same id scheme (e.g. that movieId is not a MovieLens id while id is a
# TMDB id); if they differ, ratings get attached to the wrong titles.
rating_columns = ['userId', 'rating', 'movieId']
movie_columns = ['imdb_title_id', 'title', 'id', 'year']
source = (
    ratings[rating_columns]
    .merge(movies[movie_columns], how='inner', left_on='movieId', right_on='id')
    # The two join keys are no longer needed once the rows are matched.
    .drop(columns=['movieId', 'id'])
)

In [58]:
# Summary statistics for the numeric columns of the merged frame.
source.describe()

Unnamed: 0,userId,rating,year
count,40999.0,40999.0,40999.0
mean,523.889656,3.524501,1987.096417
std,287.793614,1.027681,21.012805
min,1.0,0.5,1914.0
25%,294.0,3.0,1974.0
50%,524.0,3.5,1995.0
75%,775.0,4.0,2003.0
max,1014.0,5.0,2015.0


In [59]:
# 'year' was only needed for the summary statistics; drop it and preview.
source = source.drop(columns=['year'])
source.head(n=5)

Unnamed: 0,userId,rating,imdb_title_id,title
0,1,1.0,tt0111495,Trois couleurs: Rouge
1,11,3.5,tt0111495,Trois couleurs: Rouge
2,22,5.0,tt0111495,Trois couleurs: Rouge
3,24,5.0,tt0111495,Trois couleurs: Rouge
4,29,3.0,tt0111495,Trois couleurs: Rouge


### Find Your Movie

In [40]:
def identify_movie(your_pick, whole_df):
    """Return the rows of ``whole_df`` whose ``title`` matches ``your_pick``.

    Parameters
    ----------
    your_pick : str
        Search pattern. It is interpreted as a regular expression and matched
        case-insensitively anywhere inside the title.
    whole_df : pandas.DataFrame
        Frame with a ``title`` column to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``whole_df`` (empty when nothing matches).
    """
    # na=False treats missing titles as "no match"; without it the mask
    # contains NA and boolean indexing raises ValueError.
    title_matches = whole_df['title'].str.contains(
        your_pick, flags=re.IGNORECASE, regex=True, na=False
    )
    return whole_df[title_matches]

In [48]:
# Search term for the title lookup; matching is case-insensitive.
your_pick = 'Toy story'
# Count the ratings per matching movie.
(
    identify_movie(your_pick, source)
    .groupby(by=['imdb_title_id', 'title'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,userId,rating
imdb_title_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1


### Create Dummy User Ratings (Ashish)

In [17]:
# Hand-written ratings for a dummy user (userId -1) who will receive recommendations.
new_user = {'userId': [-1, -1, -1, -1],
            'rating': [5, 4, 4, 2],
            'imdb_title_id': ['tt0095016', 'tt0099423', 'tt0349903', 'tt0117705'],
            'title': ['Die Hard', 'Die Hard 2', 'Ocean\'s Twelve', 'Space Jam']}
new_user_df = pd.DataFrame(new_user)
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Any rows already stored for userId -1 are dropped first so that re-running
# this cell replaces the dummy user's ratings instead of duplicating them.
source = pd.concat([source[source['userId'] != -1], new_user_df])
new_user_df

Unnamed: 0,userId,rating,imdb_title_id,title
0,-1,5,tt0095016,Die Hard
1,-1,4,tt0099423,Die Hard 2
2,-1,4,tt0349903,Ocean's Twelve
3,-1,2,tt0117705,Space Jam


In [None]:
# Hand-written ratings for a dummy user (userId -1) who will receive recommendations.
new_user = {'userId': [-1, -1, -1, -1],
            'rating': [5, 4, 4, 2],
            'imdb_title_id': ['tt0095016', 'tt0099423', 'tt0349903', 'tt0117705'],
            'title': ['Die Hard', 'Die Hard 2', 'Ocean\'s Twelve', 'Space Jam']}
new_user_df = pd.DataFrame(new_user)
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Any rows already stored for userId -1 are dropped first, so running this cell
# when the dummy user is already in `source` does not add a second copy of the
# same ratings.
source = pd.concat([source[source['userId'] != -1], new_user_df])
new_user_df

In [50]:
# Pick a movie below (optimize later)
# The loop that would print every unique title is disabled by wrapping it in a
# string literal; the notebook echoes that literal as this cell's output.
'''
for name in source['title'].unique():
    print(name)
'''

"\nfor name in source['title'].unique():\n    print(name)\n"

### Collaborative Filtering Using SVD

In [19]:
# Set up SVD
# The ratings in `source` range from 0.5 to 5.0 (see source.describe()), but
# Reader() defaults to a (1, 5) scale. Predictions are clipped to the reader's
# scale, so the real bounds are passed explicitly.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(source[['userId', 'imdb_title_id', 'rating']], reader)
# SVD initialises its factors randomly; a fixed seed makes the recommendations
# reproducible across kernel restarts.
svd = SVD(random_state=42)

In [20]:
# Train model
# Build the training set from every rating in `data`; no hold-out split is made here.
trainset = data.build_full_trainset()
# fit() is the cell's last expression, so its return value is echoed as the output.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12e26a6a0>

In [21]:
# Show the rows that belong to the dummy user (userId -1).
source.loc[source['userId'].eq(-1)]

Unnamed: 0,userId,rating,imdb_title_id,title
0,-1,5.0,tt0095016,Die Hard
1,-1,4.0,tt0099423,Die Hard 2
2,-1,4.0,tt0349903,Ocean's Twelve
3,-1,2.0,tt0117705,Space Jam


### Give Recommendations

In [22]:
# Predict the dummy user's (userId -1) rating for every movie in `source`.
# Keeping the first row per movie gives one (id, title) pair per movie, in
# order of first appearance. This replaces a full-frame scan per movie id.
first_rows = source.drop_duplicates(subset='imdb_title_id')
movie_ids = first_rows['imdb_title_id'].tolist()
movie_titles = first_rows['title'].tolist()
# Prediction is a namedtuple; `.est` is the estimated rating. No true rating
# (r_ui) is passed because none is known for these pairs and it does not
# influence the estimate.
ratings_list = [svd.predict(-1, movie_id).est for movie_id in movie_ids]

In [26]:
# Collect the predictions into a frame ordered from highest to lowest estimate.
result = dict(IMDBid=movie_ids, Ratings=ratings_list, Title=movie_titles)
result_df = pd.DataFrame(result).sort_values(by=['Ratings'], ascending=False)

In [28]:
# Show the ten movies with the highest predicted rating for the dummy user.
result_df.head(10)

Unnamed: 0,IMDBid,Ratings,Title
311,tt0281820,4.532614,The Good Thief
224,tt0061512,4.487076,Cool Hand Luke
2,tt0108160,4.478551,Sleepless in Seattle
328,tt0363547,4.46069,Dawn of the Dead
903,tt0367959,4.455774,Hannibal Rising
231,tt0134119,4.443424,The Talented Mr. Ripley
62,tt0120753,4.421599,The Million Dollar Hotel
1380,tt0063240,4.418351,The Lost Continent
1090,tt0498380,4.411239,Letters from Iwo Jima
146,tt0069293,4.392198,Solaris


### Next Week

In [13]:
# Runtime: scale to a larger ratings set
# Create a function that takes 5 movies and gives recommendations
# Figure out a way to assess the quality of the recommendations
# Implement a year range
# Try to include ratings for more movies