In [58]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate
from surprise.model_selection import cross_validate

In [70]:
m = pd.read_csv('movies_metadata.csv')
m.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [11]:
m['genres'].fillna('[]').apply(literal_eval)[0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [12]:
m['genres'] = m['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [14]:
m['genres'][:2]

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
Name: genres, dtype: object

In [16]:
m['year'] = pd.to_datetime(m['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [22]:
M = m[m['vote_count'].notnull()]['vote_count'].astype('int').quantile(0.95)
C = m[m['vote_average'].notnull()]['vote_average'].astype('int').mean()

In [20]:
M

434.0

In [23]:
C

5.618207215134184

# General recommendation - IMDB weighted rating score 

In [24]:
narrowed = m[(m['vote_count'] >= C) & (m['vote_count'].notnull()) & (m['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
narrowed['vote_count'] = narrowed['vote_count'].astype('int')
narrowed['vote_average'] = narrowed['vote_average'].astype('int')
narrowed.shape

(28801, 6)

In [25]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+M) * R) + (M/(M+v) * C)

In [104]:
narrowed['weighted_rating'] = narrowed.apply(weighted_rating, axis=1)
narrowed = narrowed.sort_values('weighted_rating', ascending=False)

In [105]:
narrowed.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


# Recommend from one genre

In [106]:
s = m.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'

In [107]:
m_genre = m.drop('genres', axis=1).join(s)

In [108]:
def recom_genre(genre, percentile=0.85):
    df = m_genre[m_genre['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [109]:
recom_genre('Comedy').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.463024
351,Forrest Gump,1994,8147,8,48.3072,7.963363
1225,Back to the Future,1985,6239,8,25.7785,7.952358
18465,The Intouchables,2011,5410,8,16.0869,7.945207
22841,The Grand Budapest Hotel,2014,4644,8,14.442,7.936384
2211,Life Is Beautiful,1997,3643,8,39.395,7.91943
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964,1472,8,9.80398,7.809073
3342,Modern Times,1936,881,8,8.15956,7.695554
883,Some Like It Hot,1959,835,8,11.8451,7.680781
1236,The Great Dictator,1940,756,8,9.24175,7.651762


# Content based recommendation - vector embedding

In [34]:
links_small = pd.read_csv('links_small.csv')

In [35]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [83]:
m['id'][:5]

0      862
1     8844
2    15602
3    31357
4    11862
Name: id, dtype: object

In [84]:
m[m['id'].apply(len) > 8]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [85]:
m = m.drop([19730, 29503, 35587])
m['id'] = m['id'].astype('int')

In [101]:
m_link = m[m['id'].isin(links_small)]
m_link.shape

(9099, 24)

In [102]:
m_link['tagline'] = m_link['tagline'].fillna('')
m_link['description'] = m_link['overview'] + m_link['tagline']
m_link['description'] = m_link['description'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [103]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(m_link['description'])
tfidf_matrix.shape

(9099, 268124)

In [42]:
tfidf_matrix

<9099x268124 sparse matrix of type '<class 'numpy.float64'>'
	with 540591 stored elements in Compressed Sparse Row format>

In [43]:
# dot product 
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [44]:
cosine_sim

array([[1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
        0.        ],
       [0.00680476, 1.        , 0.01531062, ..., 0.00357057, 0.00762326,
        0.        ],
       [0.        , 0.01531062, 1.        , ..., 0.        , 0.00286535,
        0.00472155],
       ...,
       [0.        , 0.00357057, 0.        , ..., 1.        , 0.07811616,
        0.        ],
       [0.00344913, 0.00762326, 0.00286535, ..., 0.07811616, 1.        ,
        0.        ],
       [0.        , 0.        , 0.00472155, ..., 0.        , 0.        ,
        1.        ]])

In [104]:
m_link = m_link.reset_index()
indices = pd.Series(m_link.index, index=m_link['title'])

In [105]:
m_link.head(2)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ..."
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,When siblings Judy and Peter discover an encha...


In [107]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:30]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [108]:
get_recommendations('The Godfather')

973            The Godfather: Part II
8387                       The Family
3509                             Made
4196               Johnny Dangerously
29                     Shanghai Triad
5667                             Fury
2412                   American Movie
1582          The Godfather: Part III
4221                          8 Women
2159                    Summer of Sam
618                           Thinner
3609                    Harlem Nights
8816                    Run All Night
3288                Jaws: The Revenge
2192                 The Color Purple
5406                  The Kid Brother
3715                         3 Ninjas
7657                The Tillman Story
3607                  Family Business
6398                      Renaissance
7591                          Machete
7760                    Henry's Crime
5593                           Eulogy
227                    The Jerky Boys
3560                Moon Over Parador
8931                     Afro Samurai
5271        

# Collaborative recommendation - SVD

In [51]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [52]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [54]:
# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9083  0.8909  0.8925  0.8980  0.8950  0.8969  0.0061  
MAE (testset)     0.6994  0.6857  0.6870  0.6939  0.6890  0.6910  0.0050  
Fit time          4.51    4.61    4.31    4.25    4.26    4.39    0.15    
Test time         0.15    0.14    0.12    0.12    0.19    0.14    0.02    


{'test_rmse': array([0.90825953, 0.89091681, 0.89253911, 0.8979593 , 0.89503328]),
 'test_mae': array([0.69938583, 0.6856617 , 0.68703425, 0.69391716, 0.68902903]),
 'fit_time': (4.514296770095825,
  4.60882306098938,
  4.314519882202148,
  4.24683690071106,
  4.255422115325928),
 'test_time': (0.14603209495544434,
  0.14206218719482422,
  0.11700320243835449,
  0.11931991577148438,
  0.186323881149292)}

In [55]:
trainset = data.build_full_trainset()

In [60]:
algo.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a249041d0>

In [69]:
# validate on prediction of userid 23
ratings[ratings['userId'] == 23].head(5)

Unnamed: 0,userId,movieId,rating,timestamp
4008,23,1,3.0,1148729853
4009,23,6,3.5,1148730128
4010,23,11,3.5,1166728170
4011,23,16,4.0,1148672550
4012,23,19,2.0,1148669114


In [66]:
algo.predict(23, 1)

Prediction(uid=23, iid=1, r_ui=None, est=3.5503451802734878, details={'was_impossible': False})

In [68]:
algo.predict(23, 6)

Prediction(uid=23, iid=6, r_ui=None, est=3.757104313350674, details={'was_impossible': False})

In [None]:
# kinda close!