In [311]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline


In [312]:
# read movie metadata
movie_md = pd.read_csv('./data/movies_metadata.csv', low_memory=False)


In [313]:
df = movie_md.copy()
df.shape


(45466, 24)

In [314]:
df.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [315]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [316]:
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [317]:
from ast import literal_eval
def process_genres(df):
    '''
    process genres column using liter_eval to convert string to python object
    extract the genres info
    '''
    df['genres'] = df['genres'].fillna('[]').apply(literal_eval)
    # extract the 'name' value of genres
    df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x])
    
print('before processing genres: \n', df['genres'][0])
process_genres(df)
print('after processeding genres: \n', df['genres'][0])


before processing genres: 
 [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
after processeding genres: 
 ['Animation', 'Comedy', 'Family']


In [318]:
# process release_date to show year only
df['year'] = df['release_date'].apply(lambda x: str(x)[:4] if x != 'NaN' else np.nan)
print('the release_data column:')
print(df['release_date'].head())
print('the year column:')
print(df['year'].head())

the release_data column:
0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object
the year column:
0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object


In [319]:
def process_vote(df, vote_count_cutoff_percentile=0.95):
    '''
    calculate weighted rating instead of row rating
    weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

    Where: 
    R = average for the movie (mean) = (rating)
    v = number of votes for the movie = (votes)
    m = minimum votes required to be listed in the Top Rated list (default: 95 percentile of vote_count)
    C = the mean vote across the whole report
    '''
    df = df.dropna(subset=['vote_count', 'vote_average'])
    df.vote_count = df.vote_count.astype('int')
    df.vote_average = df.vote_average.astype('int')
    
    mean_vote_average = df.vote_average.mean()
    vote_count_cutoff = df.vote_count.quantile(vote_count_cutoff_percentile)
    df = df.loc[df['vote_count'] >= vote_count_cutoff]
    df['weighted_rating'] = (df.vote_average * df.vote_count/(df.vote_count + vote_count_cutoff)) + \
                            (mean_vote_average * vote_count_cutoff/(df.vote_count + vote_count_cutoff))
    df = df.sort_values('weighted_rating', ascending=False)
    return df

In [320]:
df = process_vote(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [321]:
FEATURES = ['title', 'year', 'vote_count', 'vote_average', 'popularity', \
            'genres', 'weighted_rating', 'tagline', 'overview', 'id']
df = df[FEATURES]
df.shape

(2274, 10)

In [322]:
def get_tops_by_genres(df, *genres, intersect=True, top=10):
    if not genres:
        return df.head(top)
    elif not intersect:
        return df[df['genres'].apply(lambda x: not set(genres).isdisjoint(x))].head(top)
    else:
        return df[df['genres'].apply(lambda x: set(genres).issubset(x))].head(top)

In [323]:
get_tops_by_genres(df, 'Family').head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
1225,Back to the Future,1985,6239,8,25.778509,"[Adventure, Comedy, Science Fiction, Family]",7.820813,He's the only kid ever to get into trouble bef...,Eighties teenager Marty McFly is accidentally ...,105
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
926,It's a Wonderful Life,1946,1103,8,15.031588,"[Drama, Family, Fantasy]",7.222046,It's a wonderful laugh! It's a wonderful love!,George Bailey has spent his entire life giving...,1585


In [324]:
get_tops_by_genres(df, 'Family', 'Animation').head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
19901,Paperman,2012,734,8,7.198633,"[Animation, Family, Romance]",6.976272,"Delicate, charming and sweet.",An urban office worker finds that paper airpla...,140420
13724,Up,2009,7048,7,19.330884,"[Animation, Comedy, Family, Adventure]",6.898194,,Carl Fredricksen spent his entire life dreamin...,14160


In [325]:
get_tops_by_genres(df, 'Family', 'Animation', intersect=True).head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
19901,Paperman,2012,734,8,7.198633,"[Animation, Family, Romance]",6.976272,"Delicate, charming and sweet.",An urban office worker finds that paper airpla...,140420
13724,Up,2009,7048,7,19.330884,"[Animation, Comedy, Family, Adventure]",6.898194,,Carl Fredricksen spent his entire life dreamin...,14160


In [326]:
def get_tops_by_year(df, year, top=10):
    return df[df.year == str(year)].head(top)

In [327]:
get_tops_by_year(df,2013).head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
22131,The Wolf of Wall Street,2013,6768,7,16.382422,"[Crime, Drama, Comedy]",6.894236,EARN. SPEND. PARTY.,A New York stockbroker refuses to cooperate in...,106646
22058,The Hunger Games: Catching Fire,2013,6656,7,25.309139,"[Adventure, Action, Science Fiction]",6.892565,Every revolution begins with a spark.,Katniss Everdeen has returned home safe after ...,101299
21592,Gravity,2013,5879,7,18.50194,"[Science Fiction, Thriller, Drama]",6.879342,Don't Let Go,"Dr. Ryan Stone, a brilliant medical engineer o...",49047
21025,Now You See Me,2013,5635,7,17.852022,"[Thriller, Crime]",6.874491,4 amazing magicians. 3 impossible heists. 1 bi...,An FBI agent and an Interpol detective track a...,75656
22110,Frozen,2013,5440,7,24.248243,"[Animation, Adventure, Family]",6.870324,Only the act of true love will thaw a frozen h...,Young princess Anna of Arendelle dreams about ...,109445


In [328]:
df.columns

Index(['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres',
       'weighted_rating', 'tagline', 'overview', 'id'],
      dtype='object')

In [329]:
# Content-based recommendation
df.tagline.fillna('', inplace=True)
df.overview.fillna('', inplace=True)

In [330]:
df['description'] = df.overview + df.tagline
df['description'].head(1).values

array(['Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious.Your mind is the scene of the crime.'],
      dtype=object)

In [331]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
def cal_similarity_matrix(data):    
    tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(data)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_sim

In [332]:
cosine_sim_description = cal_similarity_matrix(df['description'])


In [333]:
title_to_idx = {title: idx for title, idx in zip(df.title, range(df.shape[0]))}
title_to_idx['Inception']

0

In [334]:
def get_recommendation_by_title(df, title, cosine_sim, top=10):
    idx = title_to_idx[title]
    scores = sorted(list(enumerate(cosine_sim[idx])), key=lambda x: x[1], reverse=True)
    movie_indices = list(map(lambda x: x[0], scores[1:top+1]))
    return df.iloc[movie_indices].sort_values('weighted_rating', ascending=False)
    

In [335]:
get_recommendation_by_title(df, 'Toy Story', cosine_sim_description).head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description
15348,Toy Story 3,2010,4710,7,16.96647,"[Animation, Family, Comedy]",6.851922,No toy gets left behind.,"Woody, Buzz, and the rest of Andy's toys haven...",10193,"Woody, Buzz, and the rest of Andy's toys haven..."
2997,Toy Story 2,1999,3914,7,17.547693,"[Animation, Comedy, Family]",6.824813,The toys are back!,"Andy heads off to Cowboy Camp, leaving his toy...",863,"Andy heads off to Cowboy Camp, leaving his toy..."
11007,The Devil Wears Prada,2006,3198,7,13.102384,"[Comedy, Drama, Romance]",6.790277,Meet Andy Sachs. A million girls would kill to...,The Devil Wears Prada is about a young journal...,350,The Devil Wears Prada is about a young journal...
10585,Match Point,2005,1134,7,9.020372,"[Drama, Thriller, Crime, Romance]",6.514212,Passion Temptation Obsession,Match Point is Woody Allen’s satire of the Bri...,116,Match Point is Woody Allen’s satire of the Bri...
1199,Manhattan,1979,600,7,12.050759,"[Comedy, Drama, Romance]",6.263332,Woody Allen's New Comedy Hit,The life of a divorced television writer datin...,696,The life of a divorced television writer datin...


In [336]:
get_recommendation_by_title(df, 'The Dark Knight', cosine_sim_description).head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description
18252,The Dark Knight Rises,2012,9263,7,20.58258,"[Action, Crime, Drama, Thriller]",6.921448,The Legend Ends,Following the death of District Attorney Harve...,49026,Following the death of District Attorney Harve...
18258,Sherlock Holmes: A Game of Shadows,2011,3971,7,18.695329,"[Adventure, Action, Crime, Mystery]",6.827079,The game is afoot.,There is a new criminal mastermind at large (P...,58574,There is a new criminal mastermind at large (P...
585,Batman,1989,2145,7,19.10673,"[Fantasy, Action]",6.704647,Have you ever danced with the devil in the pal...,The Dark Knight of Gotham City begins his war ...,268,The Dark Knight of Gotham City begins his war ...
14301,Law Abiding Citizen,2009,1522,7,16.639047,"[Drama, Crime, Thriller]",6.610575,The System Must Pay.,A frustrated man decides to take justice into ...,22803,A frustrated man decides to take justice into ...
41976,The Lego Batman Movie,2017,1473,7,17.070748,"[Action, Animation, Comedy, Family, Fantasy]",6.600569,Always be yourself. Unless you can be Batman.,In the irreverent spirit of fun that made “The...,324849,In the irreverent spirit of fun that made “The...


In [337]:
# add more features to calculate similarity between movies
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')

# remove duplicates
credits = credits.drop_duplicates(subset='id')
keywords = keywords.drop_duplicates(subset='id')

In [338]:
df['id'] = df['id'].astype(int)
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

In [339]:
# extract names of the top 3 actors from the cast column
df['cast'] = df['cast'].fillna('[]').apply(lambda x: [str.lower(i['name'].replace(" ","")) for i in literal_eval(x)][:3])

In [340]:
def get_director(x):
    '''
    extract director's name from crew column of x
    '''
    for i in x:
        if i['job'] == 'Director':
            return str.lower(i['name'].replace(' ', ''))
    return np.nan


In [341]:
df['director'] = df['crew'].fillna('[]').apply(lambda x: get_director(literal_eval(x)))

In [342]:
# process keywords
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: [str.lower(stemmer.stem(i['name'].replace(' ', ''))) for i in literal_eval(x)])

In [344]:
df.head()[['cast', 'director', 'keywords']]

Unnamed: 0,cast,director,keywords
0,"[leonardodicaprio, josephgordon-levitt, ellenp...",christophernolan,"[lossoflov, dream, kidnap, sleep, subconsci, h..."
1,"[christianbale, michaelcaine, heathledger]",christophernolan,"[dccomic, crimefight, secretident, scarecrow, ..."
2,"[matthewmcconaughey, jessicachastain, annehath...",christophernolan,"[savingtheworld, artificialintellig, fatherson..."
3,"[edwardnorton, bradpitt, meatloaf]",davidfincher,"[supportgroup, dualident, nihil, rageandh, ins..."
4,"[elijahwood, ianmckellen, cateblanchett]",peterjackson,"[elv, dwarv, orc, middle-earth(tolkien), hobbi..."


In [346]:
# use title, genres, top 3 actors, director (* 3 to make director a more significant factor), 
# and keyword to calculate similarity
df['mixed_credits'] = df['title'].apply(lambda x: [x]) + df['genres'] + df['cast'] + df['director'].apply(lambda x: [x]) * 3 + df['keywords']
print(df['mixed_credits'].head())
df['mixed_credits'] = df['mixed_credits'].apply(lambda x: ' '.join(x))
print(df['mixed_credits'].head())

0    [Inception, Action, Thriller, Science Fiction,...
1    [The Dark Knight, Drama, Action, Crime, Thrill...
2    [Interstellar, Adventure, Drama, Science Ficti...
3    [Fight Club, Drama, edwardnorton, bradpitt, me...
4    [The Lord of the Rings: The Fellowship of the ...
Name: mixed_credits, dtype: object
0    Inception Action Thriller Science Fiction Myst...
1    The Dark Knight Drama Action Crime Thriller ch...
2    Interstellar Adventure Drama Science Fiction m...
3    Fight Club Drama edwardnorton bradpitt meatloa...
4    The Lord of the Rings: The Fellowship of the R...
Name: mixed_credits, dtype: object


In [347]:
# tf2 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
# tfidf_matrix2 = tf2.fit_transform(df['mixed_credits'])
# tfidf_matrix2.shape

In [348]:
cosine_sim_mixed = cal_similarity_matrix(df['mixed_credits'])

In [400]:
get_recommendation_by_title(df, 'The Dark Knight', cosine_sim_mixed)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description,cast,crew,keywords,director,mixed_credits
0,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es...",27205,"Cobb, a skilled thief who commits corporate es...","[leonardodicaprio, josephgordon-levitt, ellenp...","[{'credit_id': '56e8462cc3a368408400354c', 'de...","[lossoflov, dream, kidnap, sleep, subconsci, h...",christophernolan,Inception Action Thriller Science Fiction Myst...
2,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,157336,Interstellar chronicles the adventures of a gr...,"[matthewmcconaughey, jessicachastain, annehath...","[{'credit_id': '54cba75b925141678e014d1a', 'de...","[savingtheworld, artificialintellig, fatherson...",christophernolan,Interstellar Adventure Drama Science Fiction m...
20,The Prestige,2006,4510,8,16.94556,"[Drama, Mystery, Thriller]",7.758148,Are You Watching Closely?,A mysterious story of two magicians whose inte...,1124,A mysterious story of two magicians whose inte...,"[hughjackman, christianbale, michaelcaine]","[{'credit_id': '52fe42e8c3a36847f802bef9', 'de...","[competit, secret, obsess, magic, dyinganddeat...",christophernolan,The Prestige Drama Mystery Thriller hughjackma...
24,Memento,2000,4168,8,15.450789,"[Mystery, Thriller]",7.740175,Some memories are best forgotten.,Suffering short-term memory loss after a head ...,77,Suffering short-term memory loss after a head ...,"[guypearce, carrie-annemoss, joepantoliano]","[{'credit_id': '52fe4214c3a36847f80024cb', 'de...","[individu, insulin, tattoo, waitress, amnesia,...",christophernolan,Memento Mystery Thriller guypearce carrie-anne...
75,The Dark Knight Rises,2012,9263,7,20.58258,"[Action, Crime, Drama, Thriller]",6.921448,The Legend Ends,Following the death of District Attorney Harve...,49026,Following the death of District Attorney Harve...,"[christianbale, michaelcaine, garyoldman]","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[dccomic, crimefight, terrorist, secretident, ...",christophernolan,The Dark Knight Rises Action Crime Drama Thril...
83,Batman Begins,2005,7511,7,28.505341,"[Action, Crime, Drama]",6.904127,Evil fears the knight.,"Driven by tragedy, billionaire Bruce Wayne ded...",272,"Driven by tragedy, billionaire Bruce Wayne ded...","[christianbale, michaelcaine, liamneeson]","[{'credit_id': '52fe4230c3a36847f800ac6d', 'de...","[himalaya, martialart, dccomic, crimefight, se...",christophernolan,Batman Begins Action Crime Drama christianbale...
252,Dunkirk,2017,2712,7,30.938854,"[Action, Drama, History, Thriller, War]",6.757878,The event that shaped our world,The miraculous evacuation of Allied soldiers f...,374720,The miraculous evacuation of Allied soldiers f...,"[fionnwhitehead, tomglynn-carney, jacklowden]","[{'credit_id': '598138b5925141519b008a5e', 'de...","[franc, beach, worldwarii, evacu, german, pilo...",christophernolan,Dunkirk Action Drama History Thriller War fion...
776,Batman: Under the Red Hood,2010,459,7,7.039325,"[Action, Animation]",6.147016,Dare to Look Beneath the Hood.,Batman faces his ultimate challenge as the mys...,40662,Batman faces his ultimate challenge as the mys...,"[brucegreenwood, jensenackles, neilpatrickharris]","[{'credit_id': '589f8b1ac3a3684fe40031cb', 'de...","[martialart, dccomic, vigilant, joker, superhe...",brandonvietti,Batman: Under the Red Hood Action Animation br...
1041,Batman Returns,1992,1706,6,15.001681,"[Action, Fantasy]",5.846862,"The Bat, the Cat, the Penguin.","Having defeated the Joker, Batman now faces th...",364,"Having defeated the Joker, Batman now faces th...","[michaelkeaton, dannydevito, michellepfeiffer]","[{'credit_id': '52fe423cc3a36847f800e513', 'de...","[holiday, corrupt, doublelif, dccomic, crimefi...",timburton,Batman Returns Action Fantasy michaelkeaton da...
1204,Insomnia,2002,1181,6,11.424974,"[Crime, Mystery, Thriller]",5.797081,A tough cop. A brilliant killer. An unspeakabl...,Two Los Angeles homicide detectives are dispat...,320,Two Los Angeles homicide detectives are dispat...,"[alpacino, robinwilliams, hilaryswank]","[{'credit_id': '52fe4237c3a36847f800ced5', 'de...","[detect, confess, fbi, homicid, blackmail, sus...",christophernolan,Insomnia Crime Mystery Thriller alpacino robin...


In [366]:
from surprise import Reader, Dataset, SVD, evaluate, NormalPredictor
from surprise.model_selection import cross_validate

In [352]:
svd = SVD()

In [387]:
ratings = pd.read_csv('./data/ratings_small.csv')

In [388]:
print(ratings.shape)
# ratings_small = ratings.sample(frac=0.1, random_state=1)


(100004, 4)


In [389]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [390]:
cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.89851619, 0.89890566, 0.89223992, 0.8920031 , 0.89562752]),
 'test_mae': array([0.69141734, 0.6903456 , 0.68783327, 0.68717523, 0.69059357]),
 'fit_time': (6.602980852127075,
  6.42930006980896,
  6.5155792236328125,
  6.4863221645355225,
  6.680490016937256),
 'test_time': (0.2040572166442871,
  0.19883179664611816,
  0.20395994186401367,
  0.20009589195251465,
  0.20800995826721191)}

In [391]:
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a3d2c8860>

In [396]:
print(ratings_small[ratings_small['userId'] == 1])
svd.predict(1, 302)
# help(svd.predict)

    userId  movieId  rating   timestamp
0        1       31     2.5  1260759144
1        1     1029     3.0  1260759179
2        1     1061     3.0  1260759182
3        1     1129     2.0  1260759185
4        1     1172     4.0  1260759205
5        1     1263     2.0  1260759151
6        1     1287     2.0  1260759187
7        1     1293     2.0  1260759148
8        1     1339     3.5  1260759125
9        1     1343     2.0  1260759131
10       1     1371     2.5  1260759135
11       1     1405     1.0  1260759203
12       1     1953     4.0  1260759191
13       1     2105     4.0  1260759139
14       1     2150     3.0  1260759194
15       1     2193     2.0  1260759198
16       1     2294     2.0  1260759108
17       1     2455     2.5  1260759113
18       1     2968     1.0  1260759200
19       1     3671     3.0  1260759117


Prediction(uid=1, iid=302, r_ui=None, est=2.6777265506516774, details={'was_impossible': False})

In [397]:
svd.predict(1, 1029)

Prediction(uid=1, iid=1029, r_ui=None, est=2.9411632209301097, details={'was_impossible': False})

In [401]:
def hybrid(df, u_id, movies):
    cosine_sim = cal_similarity_matrix(df['mixed_credits'])
    recommended_indices = set([])
    for movie in movies:
        rec_movies = get_recommendation_by_title(df, movie, cosine_sim, 10)
        for movie_id in rec_movies['id']:
            recommended_indices.add(movie_id)
    recommended = df[np.isin(df['id'], list(recommended_indices))]
    recommended['est'] = recommended['id'].apply(lambda x: SVDtuned.predict(u_id, x).est)
    recommended = recommended.sort_values('est', ascending=False)
    return recommended[['title', 'year', 'est', 'weighted_rating']]

In [402]:
hybrid(df, 1, ['Avatar'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,title,year,est,weighted_rating
166,Terminator 2: Judgment Day,1991,3.190379,6.838208
154,Alien,1979,3.044287,6.847596
159,Star Trek Into Darkness,2013,2.675936,6.844959
574,The Abyss,1989,2.675936,6.393539
1054,Alien³,1992,2.675936,5.843797
1216,True Lies,1994,2.675936,5.79153
2220,Alien: Covenant,2017,2.675936,5.034164
213,Aliens,1986,2.648211,6.795018
169,The Terminator,1984,2.501843,6.835908
82,Titanic,1997,2.356975,6.907153


In [406]:
hybrid(df, 300, ['Avatar'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,title,year,est,weighted_rating
154,Alien,1979,4.171326,6.847596
166,Terminator 2: Judgment Day,1991,4.164268,6.838208
213,Aliens,1986,4.071627,6.795018
159,Star Trek Into Darkness,2013,3.909307,6.844959
574,The Abyss,1989,3.909307,6.393539
1054,Alien³,1992,3.909307,5.843797
1216,True Lies,1994,3.909307,5.79153
2220,Alien: Covenant,2017,3.909307,5.034164
169,The Terminator,1984,3.828091,6.835908
82,Titanic,1997,3.804173,6.907153


In [407]:
hybrid(df, 2, ['Toy Story'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,title,year,est,weighted_rating
848,Cars,2006,3.93825,5.92594
1273,Big,1988,3.553033,5.774921
146,Toy Story 3,2010,3.459827,6.851922
182,Toy Story 2,1999,3.459827,6.824813
226,The Lego Movie,2014,3.459827,6.786095
283,Hugo,2011,3.459827,6.710485
935,A Bug's Life,1998,3.459827,5.8835
1332,Monster House,2006,3.459827,5.756527
103,"Monsters, Inc.",2001,3.334047,6.884308
2189,Cars 2,2011,3.16977,5.042143


Unnamed: 0,userId,movieId,rating


In [109]:
SVDtuned.predict(1, 1029)

Prediction(uid=1, iid=1029, r_ui=None, est=2.803786237121764, details={'was_impossible': False})

In [217]:
## hybrid
def hybrid(df, u_id, movies):  
    cosine_sim = cal_similarity_matrix(df['mixed_credits'])
    recommended_indices = set([])
    for movie in movies:
        rec_movies = get_recommendation_by_title(df, movie, cosine_sim, 20)
        for movie_id in rec_movies['id']:
            recommended_indices.add(movie_id)
    recommended = df[np.isin(df['id'], list(recommended_indices))]
    recommended['est'] = recommended['id'].apply(lambda x: SVDtuned.predict(u_id, x).est)
    recommended['true-rating'] = recommended['id'].\
        apply(lambda x: ratings[np.logical_and(ratings['userId'] == u_id, ratings['movieId'] == x)].rating)
    return recommended.sort_values('est', ascending=False)[['title','year', 'est', 'true-rating', 'weighted_rating']]

In [221]:
hybrid(df, 3, ['Toy Story']).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


ValueError: Wrong number of items passed 0, placement implies 1

In [194]:
get_recommendation_by_title(df, 'Toy Story', cosine_sim_mixed)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description,cast,crew,keywords,mixed_credits
103,"Monsters, Inc.",2001,6150,7,26.419962,"[Animation, Comedy, Family]",6.884308,We Scare Because We Care.,"James Sullivan and Mike Wazowski are monsters,...",585,"James Sullivan and Mike Wazowski are monsters,...","[johngoodman, billycrystal, marygibbs]",[petedocter],"[monster, infant, energysuppli, compani, rival...",Animation Comedy Family johngoodman billycryst...
147,Toy Story 3,2010,4710,7,16.96647,"[Animation, Family, Comedy]",6.851922,No toy gets left behind.,"Woody, Buzz, and the rest of Andy's toys haven...",10193,"Woody, Buzz, and the rest of Andy's toys haven...","[tomhanks, timallen, nedbeatty]",[leeunkrich],"[hostag, colleg, toy, barbi, anim, escap, dayc...",Animation Family Comedy tomhanks timallen nedb...
183,Toy Story 2,1999,3914,7,17.547693,"[Animation, Comedy, Family]",6.824813,The toys are back!,"Andy heads off to Cowboy Camp, leaving his toy...",863,"Andy heads off to Cowboy Camp, leaving his toy...","[tomhanks, timallen, joancusack]",[johnlasseter],"[museum, prosecut, identitycrisi, airplan, fle...",Animation Comedy Family tomhanks timallen joan...
227,The Lego Movie,2014,3127,7,16.418133,"[Adventure, Animation, Comedy, Family, Fantasy]",6.786095,The story of a nobody who saved everybody.,"An ordinary Lego mini-figure, mistakenly thoug...",137106,"An ordinary Lego mini-figure, mistakenly thoug...","[chrispratt, willferrell, elizabethbanks]",[phillord],"[fathersonrelationship, creativ, friendship, p...",Adventure Animation Comedy Family Fantasy chri...
285,Hugo,2011,2197,7,14.046164,"[Adventure, Drama, Family]",6.710485,One of the most legendary directors of our tim...,Hugo is an orphan boy living in the walls of a...,44826,Hugo is an orphan boy living in the walls of a...,"[benkingsley, sachabaroncohen, asabutterfield]",[martinscorsese],"[librari, clock, filmdirector, key, toy, boy, ...",Adventure Drama Family benkingsley sachabaronc...
339,Hachi: A Dog's Tale,2009,1769,7,8.621359,"[Drama, Family]",6.654237,"A true story of faith, devotion and undying love.",A drama based on the true story of a college p...,28178,A drama based on the true story of a college p...,"[richardgere, joanallen, jasonalexander]",[lassehallström],"[japanes, loyalti, humananimalrelationship, fr...",Drama Family richardgere joanallen jasonalexan...
852,Cars,2006,3991,6,18.907948,"[Animation, Adventure, Comedy, Family]",5.92594,Ahhh... it's got that new movie smell.,"Lightning McQueen, a hotshot rookie race car d...",920,"Lightning McQueen, a hotshot rookie race car d...","[owenwilson, paulnewman, bonniehunt]",[johnlasseter],"[carrac, carjourney, auto, route66, wrecker, p...",Animation Adventure Comedy Family owenwilson p...
940,A Bug's Life,1998,2379,6,16.869209,"[Adventure, Animation, Comedy, Family]",5.8835,An epic presentation of miniature proportions.,"On behalf of ""oppressed bugs everywhere,"" an i...",9487,"On behalf of ""oppressed bugs everywhere,"" an i...","[kevinspacey, julialouis-dreyfus, haydenpanett...",[johnlasseter],"[winter, fight, ant, invent, collector, ant-hi...",Adventure Animation Comedy Family kevinspacey ...
1279,Big,1988,1022,6,9.562292,"[Fantasy, Drama, Comedy, Romance, Family]",5.774921,You're Only Young Once But For Josh It Might J...,"A young boy, Josh Baskin makes a wish at a car...",2280,"A young boy, Josh Baskin makes a wish at a car...","[tomhanks, elizabethperkins, robertloggia]",[pennymarshall],"[basebal, co-work, bronx, pinballmachin, toyma...",Fantasy Drama Comedy Romance Family tomhanks e...
2205,Cars 2,2011,2088,5,13.693002,"[Animation, Family, Adventure, Comedy]",5.042143,Ka-ciao!,Star race car Lightning McQueen and his pal Ma...,49013,Star race car Lightning McQueen and his pal Ma...,"[owenwilson, larrythecableguy, michaelcaine]",[johnlasseter],"[carrac, sequel, comedi, anthropomorph, bestfr...",Animation Family Adventure Comedy owenwilson l...
