In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline


In [3]:
# read movie metadata
movie_md = pd.read_csv('./data/movies_metadata.csv', low_memory=False)


In [4]:
df = movie_md.copy()
df.shape


(45466, 24)

In [5]:
df.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [7]:
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [8]:
from ast import literal_eval
def process_genres(df):
    '''
    process genres column using liter_eval to convert string to python object
    extract the genres info
    '''
    df['genres'] = df['genres'].fillna('[]').apply(literal_eval)
    # extract the 'name' value of genres
    df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x])
    
print(df['genres'][0])
process_genres(df)
print(df['genres'][0])


[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
['Animation', 'Comedy', 'Family']


In [10]:
# process release_date
df['year'] = df['release_date'].apply(lambda x: str(x)[:4] if x != 'NaN' else np.nan)
df['year'].head(1)

0    1995
Name: year, dtype: object

In [12]:
def process_vote(df, vote_count_cutoff_percentile=0.95):
    '''
    calculate weighted rating instead of row rating
    weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

    Where: 
    R = average for the movie (mean) = (rating)
    v = number of votes for the movie = (votes)
    m = minimum votes required to be listed in the Top Rated list (default: 95 percentile of vote_count)
    C = the mean vote across the whole report
    '''
    df = df.dropna(subset=['vote_count', 'vote_average'])
    df.vote_count = df.vote_count.astype('int')
    df.vote_average = df.vote_average.astype('int')
    
    mean_vote_average = df.vote_average.mean()
    vote_count_cutoff = df.vote_count.quantile(vote_count_cutoff_percentile)
    df = df.loc[df['vote_count'] >= vote_count_cutoff]
    df['weighted_rating'] = (df.vote_average * df.vote_count/(df.vote_count + vote_count_cutoff)) + \
                            (mean_vote_average * vote_count_cutoff/(df.vote_count + vote_count_cutoff))
    df = df.sort_values('weighted_rating', ascending=False)
    return df

In [13]:
df = process_vote(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [14]:
FEATURES = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres', 'weighted_rating', 'tagline', 'overview', 'id']
df = df[FEATURES]
df.shape

(2274, 10)

In [15]:
def get_tops_by_genres(df, *genres, intersect=True, top=10):
    if not genres:
        return df.head(top)
    elif not intersect:
        return df[df['genres'].apply(lambda x: not set(genres).isdisjoint(x))].head(top)
    else:
        return df[df['genres'].apply(lambda x: set(genres).issubset(x))].head(top)

In [16]:
get_tops_by_genres(df, 'Family')

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
1225,Back to the Future,1985,6239,8,25.778509,"[Adventure, Comedy, Science Fiction, Family]",7.820813,He's the only kid ever to get into trouble bef...,Eighties teenager Marty McFly is accidentally ...,105
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
926,It's a Wonderful Life,1946,1103,8,15.031588,"[Drama, Family, Fantasy]",7.222046,It's a wonderful laugh! It's a wonderful love!,George Bailey has spent his entire life giving...,1585
19901,Paperman,2012,734,8,7.198633,"[Animation, Family, Romance]",6.976272,"Delicate, charming and sweet.",An urban office worker finds that paper airpla...,140420
4766,Harry Potter and the Philosopher's Stone,2001,7188,7,38.187238,"[Adventure, Fantasy, Family]",6.900064,Let the Magic Begin.,Harry Potter has lived under the stairs at his...,671
13724,Up,2009,7048,7,19.330884,"[Animation, Comedy, Family, Adventure]",6.898194,,Carl Fredricksen spent his entire life dreamin...,14160
30315,Inside Out,2015,6737,7,23.985587,"[Drama, Comedy, Animation, Family]",6.893778,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",150540
15472,Despicable Me,2010,6595,7,22.274502,"[Animation, Family]",6.891633,Superbad. Superdad.,Villainous Gru lives up to his reputation as a...,20352


In [17]:
get_tops_by_genres(df, 'Family', 'Animation')

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
19901,Paperman,2012,734,8,7.198633,"[Animation, Family, Romance]",6.976272,"Delicate, charming and sweet.",An urban office worker finds that paper airpla...,140420
13724,Up,2009,7048,7,19.330884,"[Animation, Comedy, Family, Adventure]",6.898194,,Carl Fredricksen spent his entire life dreamin...,14160
30315,Inside Out,2015,6737,7,23.985587,"[Drama, Comedy, Animation, Family]",6.893778,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",150540
15472,Despicable Me,2010,6595,7,22.274502,"[Animation, Family]",6.891633,Superbad. Superdad.,Villainous Gru lives up to his reputation as a...,20352
12704,WALL·E,2008,6439,7,16.088366,"[Animation, Family]",6.889173,An adventure beyond the ordinar-E.,WALL·E is the last robot left on an Earth that...,10681
6232,Finding Nemo,2003,6292,7,25.497794,"[Animation, Family]",6.886751,There are 3.7 trillion fish in the ocean. They...,"Nemo, an adventurous young clownfish, is unexp...",12
24455,Big Hero 6,2014,6289,7,213.849907,"[Adventure, Family, Animation, Action, Comedy]",6.8867,From the creators of Wreck-it Ralph and Frozen,The special bond that develops between plus-si...,177572


In [18]:
get_tops_by_genres(df, 'Family', 'Animation', intersect=True)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
359,The Lion King,1994,5520,8,21.605761,"[Family, Animation, Drama]",7.799175,Life's greatest adventure is finding your plac...,A young lion cub named Simba can't wait to be ...,8587
5481,Spirited Away,2001,3968,8,41.048867,"[Fantasy, Adventure, Animation, Family]",7.72837,The tunnel led Chihiro to a mysterious town...,A ten year old girl who wanders away from her ...,129
5833,My Neighbor Totoro,1988,1730,8,13.507299,"[Fantasy, Animation, Family]",7.447452,These strange creatures still exist in Japan. ...,Two sisters move to the country with their fat...,8392
19901,Paperman,2012,734,8,7.198633,"[Animation, Family, Romance]",6.976272,"Delicate, charming and sweet.",An urban office worker finds that paper airpla...,140420
13724,Up,2009,7048,7,19.330884,"[Animation, Comedy, Family, Adventure]",6.898194,,Carl Fredricksen spent his entire life dreamin...,14160
30315,Inside Out,2015,6737,7,23.985587,"[Drama, Comedy, Animation, Family]",6.893778,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",150540
15472,Despicable Me,2010,6595,7,22.274502,"[Animation, Family]",6.891633,Superbad. Superdad.,Villainous Gru lives up to his reputation as a...,20352
12704,WALL·E,2008,6439,7,16.088366,"[Animation, Family]",6.889173,An adventure beyond the ordinar-E.,WALL·E is the last robot left on an Earth that...,10681
6232,Finding Nemo,2003,6292,7,25.497794,"[Animation, Family]",6.886751,There are 3.7 trillion fish in the ocean. They...,"Nemo, an adventurous young clownfish, is unexp...",12
24455,Big Hero 6,2014,6289,7,213.849907,"[Adventure, Family, Animation, Action, Comedy]",6.8867,From the creators of Wreck-it Ralph and Frozen,The special bond that develops between plus-si...,177572


In [19]:
def get_tops_by_year(df, year, top=10):
    return df[df.year == str(year)].head(top)

In [20]:
get_tops_by_year(df,2013)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id
22131,The Wolf of Wall Street,2013,6768,7,16.382422,"[Crime, Drama, Comedy]",6.894236,EARN. SPEND. PARTY.,A New York stockbroker refuses to cooperate in...,106646
22058,The Hunger Games: Catching Fire,2013,6656,7,25.309139,"[Adventure, Action, Science Fiction]",6.892565,Every revolution begins with a spark.,Katniss Everdeen has returned home safe after ...,101299
21592,Gravity,2013,5879,7,18.50194,"[Science Fiction, Thriller, Drama]",6.879342,Don't Let Go,"Dr. Ryan Stone, a brilliant medical engineer o...",49047
21025,Now You See Me,2013,5635,7,17.852022,"[Thriller, Crime]",6.874491,4 amazing magicians. 3 impossible heists. 1 bi...,An FBI agent and an Interpol detective track a...,75656
22110,Frozen,2013,5440,7,24.248243,"[Animation, Adventure, Family]",6.870324,Only the act of true love will thaw a frozen h...,Young princess Anna of Arendelle dreams about ...,109445
21161,Despicable Me 2,2013,4729,7,24.82355,"[Animation, Comedy, Family]",6.852467,Back 2 Work,Gru is recruited by the Anti-Villain League to...,93456
22059,The Hobbit: The Desolation of Smaug,2013,4633,7,20.644776,"[Adventure, Fantasy]",6.849671,Beyond darkness... beyond desolation... lies t...,"The Dwarves, Bilbo and Gandalf have successful...",57158
20922,Star Trek Into Darkness,2013,4479,7,15.78129,"[Action, Adventure, Science Fiction]",6.844959,Earth Will Fall,When the crew of the Enterprise is called back...,54138
22168,Her,2013,4215,7,13.829515,"[Romance, Science Fiction, Drama]",6.836155,A Spike Jonze Love Story,"In the not so distant future, Theodore, a lone...",152601
20910,The Great Gatsby,2013,3885,7,17.598936,"[Drama, Romance]",6.823636,Reserving judgments is a matter of infinite ho...,An adaptation of F. Scott Fitzgerald's Long Is...,64682


In [21]:
df.columns

Index(['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres',
       'weighted_rating', 'tagline', 'overview', 'id'],
      dtype='object')

In [287]:
# Content-based recommendation
df.tagline.fillna('', inplace=True)
df.overview.fillna('', inplace=True)

In [288]:
df['description'] = df.overview + df.tagline
df['description'].head(1).values

array(['Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious.Your mind is the scene of the crime.'],
      dtype=object)

In [289]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'])
tfidf_matrix.shape

(2274, 75685)

In [290]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim_description = linear_kernel(tfidf_matrix, tfidf_matrix)

In [291]:
title_to_idx = {title: idx for title, idx in zip(df.title, range(df.shape[0]))}
title_to_idx['Inception']

0

In [318]:
def get_recommendation_by_title(df, title, cosine_sim, top=10):
    idx = title_to_idx[title]
    scores = sorted(list(enumerate(cosine_sim[idx])), key=lambda x: x[1], reverse=True)
    movie_indices = list(map(lambda x: x[0], scores[1:top+1]))
    return df.iloc[movie_indices].sort_values('weighted_rating', ascending=False)
    

In [319]:
get_recommendation_by_title(df, 'Toy Story', cosine_sim_description)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description,cast,crew,keywords,mixed_credits
146,Despicable Me 2,2013,4729,7,24.82355,"[Animation, Comedy, Family]",6.852467,Back 2 Work,Gru is recruited by the Anti-Villain League to...,93456,Gru is recruited by the Anti-Villain League to...,"[stevecarell, kristenwiig, benjaminbratt]",[pierrecoffin],"[secretag, bakeri, fallinginlov, fatherdaughte...",Animation Comedy Family stevecarell kristenwii...
182,Catch Me If You Can,2002,3917,7,19.833077,"[Drama, Crime]",6.824933,The true story of a real fake.,"A true story about Frank Abagnale Jr. who, bef...",640,"A true story about Frank Abagnale Jr. who, bef...","[leonardodicaprio, tomhanks, christopherwalken]",[stevenspielberg],"[conman, biographi, fbiagent, overheadcamerash...",Drama Crime leonardodicaprio tomhanks christop...
218,The Secret Life of Walter Mitty,2013,3213,7,8.684892,"[Adventure, Comedy, Drama, Fantasy]",6.791139,"Stop Dreaming, Start Living",A timid magazine photo manager who lives life ...,116745,A timid magazine photo manager who lives life ...,"[benstiller, kristenwiig, pattonoswalt]",[benstiller],"[himalaya, photograph, magazin, iceland, daydr...",Adventure Comedy Drama Fantasy benstiller kris...
458,The Bucket List,2007,1138,7,9.070643,"[Drama, Comedy]",6.515449,Find the joy.,Corporate billionaire Edward Cole and working ...,7350,Corporate billionaire Edward Cole and working ...,"[jacknicholson, morganfreeman, seanhayes]",[robreiner],"[africa, himalaya, braintumor, wifehusbandrela...",Drama Comedy jacknicholson morganfreeman seanh...
688,John Q,2002,604,7,8.867562,"[Drama, Thriller, Crime]",6.266171,Give a father no options and you leave him no ...,John Quincy Archibald is a father and husband ...,8470,John Quincy Archibald is a father and husband ...,"[denzelwashington, robertduvall, anneheche]",[nickcassavetes],"[fathersonrelationship, chicago, heartattack, ...",Drama Thriller Crime denzelwashington robertdu...
783,Once,2007,457,7,10.357064,"[Drama, Music, Romance]",6.145101,How often do you find the right person?,A vacuum repairman moonlights as a street musi...,5723,A vacuum repairman moonlights as a street musi...,"[glenhansard, markétairglová, hughwalsh]",[johncarney],"[rockandrol, pop, musicstyl, loveofone'slif, f...",Drama Music Romance glenhansard markétairglová...
977,Percy Jackson & the Olympians: The Lightning T...,2010,2079,6,9.785895,"[Adventure, Fantasy, Family]",5.869592,Worlds Collide,"Accident prone teenager, Percy discovers he's ...",32657,"Accident prone teenager, Percy discovers he's ...","[loganlerman, brandont.jackson, alexandradadda...",[chriscolumbus],"[monster, greekmytholog, god, poseidon , light...",Adventure Fantasy Family loganlerman brandont....
1639,You Only Live Twice,1967,541,6,10.359669,"[Action, Thriller, Adventure]",5.663882,You Only Live Twice...and Twice is the only wa...,A mysterious space craft kidnaps a Russian and...,667,A mysterious space craft kidnaps a Russian and...,"[seanconnery, akikowakabayashi, karindor]",[lewisgilbert],"[londonengland, japan, england, assassin, heli...",Action Thriller Adventure seanconnery akikowak...
1774,Nanny McPhee and the Big Bang,2010,450,6,8.983275,"[Comedy, Fantasy]",5.629282,You'll Believe That Pigs Can Fly!,Nanny McPhee appears at the door of a harried ...,35019,Nanny McPhee appears at the door of a harried ...,"[emmathompson, asabutterfield, ralphfiennes]",[susannawhite],"[nanni, fantasi, children, aftercreditssting, ...",Comedy Fantasy emmathompson asabutterfield ral...
1843,Garfield: A Tail of Two Kitties,2006,477,5,7.217377,"[Animation, Comedy, Family]",5.116669,The Ego has landed.,Garfield is back and this time Garfield and hi...,9513,Garfield is back and this time Garfield and hi...,"[billmurray, jenniferlovehewitt, billyconnolly]",[timhill],"[londonengland, cat, mistakeinperson, luxuri, ...",Animation Comedy Family billmurray jenniferlov...


In [320]:
get_recommendation_by_title(df, 'The Dark Knight', cosine_sim_description)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description,cast,crew,keywords,mixed_credits
75,The Dark Knight Rises,2012,9263,7,20.58258,"[Action, Crime, Drama, Thriller]",6.921448,The Legend Ends,Following the death of District Attorney Harve...,49026,Following the death of District Attorney Harve...,"[christianbale, michaelcaine, garyoldman]",[christophernolan],"[dccomic, crimefight, terrorist, secretident, ...",Action Crime Drama Thriller christianbale mich...
175,Die Hard,1988,4005,7,16.640522,"[Action, Thriller]",6.828404,40 Stories. Twelve Terrorists. One Cop.,"NYPD cop, John McClane's plan to reconcile wit...",562,"NYPD cop, John McClane's plan to reconcile wit...","[brucewillis, alanrickman, alexandergodunov]",[johnmctiernan],"[helicopt, journalist, basedonnovel, terrorist...",Action Thriller brucewillis alanrickman alexan...
288,Hidden Figures,2016,2178,7,16.816834,[Drama],6.708379,"Meet the women you don't know, behind the miss...","The untold story of Katherine G. Johnson, Doro...",381284,"The untold story of Katherine G. Johnson, Doro...","[tarajip.henson, octaviaspencer, janellemonae]",[theodoremelfi],"[nasa, sexism, biographi, mathemat, racialsegr...",Drama tarajip.henson octaviaspencer janellemon...
382,The Aviator,2004,1526,7,9.86738,[Drama],6.61137,"For some men, the sky was the limit. For him, ...",A biopic depicting the life of filmmaker and a...,2567,A biopic depicting the life of filmmaker and a...,"[leonardodicaprio, cateblanchett, katebeckinsale]",[martinscorsese],"[ladykil, pilot, biographi, woman, aviat, phob...",Drama leonardodicaprio cateblanchett katebecki...
393,Collateral,2004,1476,7,13.455112,"[Drama, Crime, Thriller]",6.601196,It started like any other night.,Cab driver Max picks up a man who offers him $...,1538,Cab driver Max picks up a man who offers him $...,"[tomcruise, jamiefoxx, jadapinkettsmith]",[michaelmann],"[california, taxi, assassin, hostag, taxidriv,...",Drama Crime Thriller tomcruise jamiefoxx jadap...
740,The Hundred-Foot Journey,2014,516,7,14.466789,[Drama],6.198195,Life's greatest journey begins with the first ...,A story centered around an Indian family who m...,228194,A story centered around an Indian family who m...,"[helenmirren, manishdayal, ompuri]",[lassehallström],"[franc, basedonnovel, indianlead, restaur, fam...",Drama helenmirren manishdayal ompuri lassehall...
776,The Count of Monte Cristo,2002,463,7,10.765924,"[Action, Adventure, Drama, Thriller]",6.15082,Prepare for adventure. Count on revenge.,Edmond Dantés's life and plans to marry the be...,11362,Edmond Dantés's life and plans to marry the be...,"[jimcaviezel, guypearce, richardharris]",[kevinreynolds],"[lossoflov, lover(female), ex-lov, tortur, nap...",Action Adventure Drama Thriller jimcaviezel gu...
1041,The Transporter,2002,1724,6,13.217421,"[Action, Crime, Thriller]",5.84814,Rules are made to be broken.,"Former Special Forces officer, Frank Martin wi...",4108,"Former Special Forces officer, Frank Martin wi...","[jasonstatham, shuqi, françoisberléand]",[louisleterrier],"[carjourney, transport, auto, humantraffick]",Action Crime Thriller jasonstatham shuqi franç...
1724,Star Trek IV: The Voyage Home,1986,490,6,12.596956,"[Science Fiction, Adventure]",5.64533,The key to saving the future can only be found...,Fugitives of the Federation for their daring r...,168,Fugitives of the Federation for their daring r...,"[williamshatner, leonardnimoy, deforestkelley]",[leonardnimoy],"[savingtheworld, sanfrancisco, ussenterprise-a...",Science Fiction Adventure williamshatner leona...
2158,Dirty Grandpa,2016,1429,5,16.290586,[Comedy],5.057051,This is Jason. He's a little worried about his...,Jason Kelly is one week away from marrying his...,291870,Jason Kelly is one week away from marrying his...,"[zacefron, robertdeniro, juliannehough]",[danmazer],"[grandfathergrandsonrelationship, grandfath, r...",Comedy zacefron robertdeniro juliannehough dan...


In [22]:
# add more features to calculate similarity between movies
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')

In [23]:
df['id'] = df['id'].astype(int)
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

In [24]:
# extract names of the top 3 actors from the cast column
df['cast'] = df['cast'].fillna('[]').apply(lambda x: [str.lower(i['name'].replace(" ","")) for i in literal_eval(x)][:3])

In [25]:
def get_director(x):
    '''
    extract director's name from crew column of x
    '''
    for i in x:
        if i['job'] == 'Director':
            return [str.lower(i['name'].replace(' ', ''))]
    return np.nan


In [26]:
df['crew'] = df['crew'].fillna('[]').apply(lambda x: get_director(literal_eval(x)))

In [27]:
# process keywords
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: [str.lower(stemmer.stem(i['name'].replace(' ', ''))) for i in literal_eval(x)])

In [30]:
df.head()[['cast', 'crew', 'keywords']]

Unnamed: 0,cast,crew,keywords
0,"[leonardodicaprio, josephgordon-levitt, ellenp...",[christophernolan],"[lossoflov, dream, kidnap, sleep, subconsci, h..."
1,"[christianbale, michaelcaine, heathledger]",[christophernolan],"[dccomic, crimefight, secretident, scarecrow, ..."
2,"[matthewmcconaughey, jessicachastain, annehath...",[christophernolan],"[savingtheworld, artificialintellig, fatherson..."
3,"[edwardnorton, bradpitt, meatloaf]",[davidfincher],"[supportgroup, dualident, nihil, rageandh, ins..."
4,"[elijahwood, ianmckellen, cateblanchett]",[peterjackson],"[elv, dwarv, orc, middle-earth(tolkien), hobbi..."


In [311]:
# use genres, top 3 actors, director (* 3 to make director a more significant factor), 
# and keyword to calculate similarity
df['mixed_credits'] = df['genres'] + df['cast'] + df['crew'] * 3 + df['keywords']
df.head(1)
df['mixed_credits'] = df['mixed_credits'].apply(lambda x: ' '.join(x))

In [312]:
tf2 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix2 = tf2.fit_transform(df['mixed_credits'])
tfidf_matrix2.shape

(2291, 41948)

In [313]:
cosine_sim_mixed = cosine_similarity(tfidf_matrix2, tfidf_matrix2)

In [321]:
get_recommendation_by_title(df, 'The Dark Knight', cosine_sim_mixed)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating,tagline,overview,id,description,cast,crew,keywords,mixed_credits
0,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es...",27205,"Cobb, a skilled thief who commits corporate es...","[leonardodicaprio, josephgordon-levitt, ellenp...",[christophernolan],"[lossoflov, dream, kidnap, sleep, subconsci, h...",Action Thriller Science Fiction Mystery Advent...
2,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,157336,Interstellar chronicles the adventures of a gr...,"[matthewmcconaughey, jessicachastain, annehath...",[christophernolan],"[savingtheworld, artificialintellig, fatherson...",Adventure Drama Science Fiction matthewmcconau...
20,The Prestige,2006,4510,8,16.94556,"[Drama, Mystery, Thriller]",7.758148,Are You Watching Closely?,A mysterious story of two magicians whose inte...,1124,A mysterious story of two magicians whose inte...,"[hughjackman, christianbale, michaelcaine]",[christophernolan],"[competit, secret, obsess, magic, dyinganddeat...",Drama Mystery Thriller hughjackman christianba...
24,Memento,2000,4168,8,15.450789,"[Mystery, Thriller]",7.740175,Some memories are best forgotten.,Suffering short-term memory loss after a head ...,77,Suffering short-term memory loss after a head ...,"[guypearce, carrie-annemoss, joepantoliano]",[christophernolan],"[individu, insulin, tattoo, waitress, amnesia,...",Mystery Thriller guypearce carrie-annemoss joe...
75,The Dark Knight Rises,2012,9263,7,20.58258,"[Action, Crime, Drama, Thriller]",6.921448,The Legend Ends,Following the death of District Attorney Harve...,49026,Following the death of District Attorney Harve...,"[christianbale, michaelcaine, garyoldman]",[christophernolan],"[dccomic, crimefight, terrorist, secretident, ...",Action Crime Drama Thriller christianbale mich...
83,Batman Begins,2005,7511,7,28.505341,"[Action, Crime, Drama]",6.904127,Evil fears the knight.,"Driven by tragedy, billionaire Bruce Wayne ded...",272,"Driven by tragedy, billionaire Bruce Wayne ded...","[christianbale, michaelcaine, liamneeson]",[christophernolan],"[himalaya, martialart, dccomic, crimefight, se...",Action Crime Drama christianbale michaelcaine ...
253,Dunkirk,2017,2712,7,30.938854,"[Action, Drama, History, Thriller, War]",6.757878,The event that shaped our world,The miraculous evacuation of Allied soldiers f...,374720,The miraculous evacuation of Allied soldiers f...,"[fionnwhitehead, tomglynn-carney, jacklowden]",[christophernolan],"[franc, beach, worldwarii, evacu, german, pilo...",Action Drama History Thriller War fionnwhitehe...
780,Batman: Under the Red Hood,2010,459,7,7.039325,"[Action, Animation]",6.147016,Dare to Look Beneath the Hood.,Batman faces his ultimate challenge as the mys...,40662,Batman faces his ultimate challenge as the mys...,"[brucegreenwood, jensenackles, neilpatrickharris]",[brandonvietti],"[martialart, dccomic, vigilant, joker, superhe...",Action Animation brucegreenwood jensenackles n...
1046,Batman Returns,1992,1706,6,15.001681,"[Action, Fantasy]",5.846862,"The Bat, the Cat, the Penguin.","Having defeated the Joker, Batman now faces th...",364,"Having defeated the Joker, Batman now faces th...","[michaelkeaton, dannydevito, michellepfeiffer]",[timburton],"[holiday, corrupt, doublelif, dccomic, crimefi...",Action Fantasy michaelkeaton dannydevito miche...
1210,Insomnia,2002,1181,6,11.424974,"[Crime, Mystery, Thriller]",5.797081,A tough cop. A brilliant killer. An unspeakabl...,Two Los Angeles homicide detectives are dispat...,320,Two Los Angeles homicide detectives are dispat...,"[alpacino, robinwilliams, hilaryswank]",[christophernolan],"[detect, confess, fbi, homicid, blackmail, sus...",Crime Mystery Thriller alpacino robinwilliams ...
