In [73]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
import tensorflow_hub as hub

DATA_PATH = './data'

In [74]:
os.listdir(DATA_PATH)

['glove.twitter.27B.100d.txt',
 'keywords.csv',
 'links.csv',
 'glove.6B.50d.txt',
 'clean.csv',
 'ratings.csv',
 'glove.twitter.27B.50d.txt',
 'glove.twitter.27B.25d.txt',
 'glove.twitter.27B.200d.txt',
 'crawl-300d-2M.vec',
 'credits.csv',
 'ratings.csv.bin',
 'ratings_small.csv',
 'links_small.csv',
 'movies_metadata.csv']

In [62]:
df = pd.read_csv(os.path.join(DATA_PATH, 'clean.csv'))

In [63]:
df.columns

Index(['id', 'budget', 'original_language', 'popularity', 'revenue', 'runtime',
       'title', 'vote_average', 'vote_count', 'year', 'month', 'day_of_week',
       'success', 'country_name', 'franchise_name', 'companie_name',
       'crew_Director', 'crew_Screenplay', 'crew_Producer',
       'crew_Executive Producer', 'crew_Music', 'crew_Director of Photography',
       'cast_1', 'cast_gender_1', 'cast_2', 'cast_gender_2', 'cast_3',
       'cast_gender_3', 'genres_1', 'genres_2', 'genres_3', 'cast_count',
       'crew_count', 'country_count', 'company_count', 'description',
       'keywords'],
      dtype='object')

#### To work with vote_count and vote_average features we will use weighted rating
### $ rating = \frac{vote\_count}{vote\_count\  +\  m}\  vote\_average + \frac{m}{vote\_count\  +\  m}\  M$
#### where M is mean of vote_average for all movies and m is parameter which is control how we relieve on users scores 
* #### If m equal vote_count when rating is average between M and vote_average
* #### If m is big comapred to vote_count when rating is almost M
* #### If m is small comapred to vote_count when rating is almost vote_average

In [64]:
# mean user's score of all movies
M = df['vote_average'].mean()

# take 85 percentile of vote_counts as a reliable amount of votes
m = df['vote_count'].quantile(0.75)

In [65]:
def weighted_rating(x):
    v = x['vote_count']
    return (v / (v+m) * x['vote_average']) + (m / (m+v) * M)

In [66]:
df['wr'] = df.apply(weighted_rating, axis=1)
df = df.sort_values('wr', ascending=False)

In [67]:
# there is no movies with high score and small number of votes, as expected 
df[['title', 'wr', 'vote_average', 'vote_count']].head()

Unnamed: 0,title,wr,vote_average,vote_count
10275,Dilwale Dulhania Le Jayenge,8.926098,9.1,661.0
311,The Shawshank Redemption,8.488081,8.5,8358.0
818,The Godfather,8.48349,8.5,6024.0
40111,Your Name.,8.40607,8.5,1030.0
12444,The Dark Knight,8.292439,8.3,12269.0


In [70]:
# remove bad movies
df = df.loc[df['wr'] >= 5.75]
# scale from 0 to 1
df['wr'] = (df['wr'] - df['wr'].mean()) / (df['wr'].max() - df['wr'].min())
# number of movie left
df.shape

In [13]:
# drop na
df = df.loc[df['crew_Director'].notna() \
            & df['cast_1'].notna() & df['cast_2'].notna() & df['cast_3'].notna() \
            & df['cast_gender_1'].notna() & df['cast_gender_2'].notna() & df['cast_gender_3'].notna()
            & df['genres_1'].notna()]

# add key words from keywords.csv file
df['description'] = df['description'] + df['keywords'].fillna('')

In [14]:
# create Series
df = df.reset_index()
titles = df['title']
indices = pd.Series(df.index, index=df['title'])

In [15]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.85)
tfidf_matrix = tfidf.fit_transform(df['description'])
delattr(tfidf, 'stop_words_')
# tf-idf dict
dictionary = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

cosine_sim_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)
del tfidf_matrix, tfidf
gc.collect()

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.85)
count_matrix = vectorizer.fit_transform(df['description'])
delattr(vectorizer, 'stop_words_')

cosine_sim_count = cosine_similarity(count_matrix, count_matrix)
del count_matrix, vectorizer
gc.collect()

In [17]:
[file for file in os.listdir('data') if file.startswith('glove') or file.startswith('crawl')]

['glove.twitter.27B.100d.txt',
 'glove.6B.50d.txt',
 'glove.twitter.27B.50d.txt',
 'glove.twitter.27B.25d.txt',
 'glove.twitter.27B.200d.txt',
 'crawl-300d-2M.vec']

In [18]:
emb_name = 'glove.twitter.27B.50d.txt'

# all embedding
embeddings_index = {}
# embedding multiplied by tf-idf weight
embeddings_index_w = {}
with open(os.path.join(DATA_PATH, emb_name), encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        
        weight = dictionary.get(word)
        if weight:
            embeddings_index_w[word] = np.asarray(values[1:], dtype='float32') * weight

In [19]:
def find_mean_embbeding(x, emb):
    mean_emb = []
    for word in x.split(' '):
        e = emb.get(word)
        if e is not None:
            mean_emb.append(e)
    return sum(mean_emb) / len(mean_emb) if len(mean_emb) else embeddings_index['none']

In [20]:
df['mean_embbeding'] = df['description'].apply(find_mean_embbeding, emb=embeddings_index)
df['mean_embbeding_w'] = df['description'].apply(find_mean_embbeding, emb=embeddings_index_w)

del embeddings_index, embeddings_index_w
gc.collect()

In [21]:
embed_matrix = np.zeros(shape=(len(df['mean_embbeding']), df['mean_embbeding'].iloc[0].shape[0]))
for i, vector in enumerate(df['mean_embbeding']):
    embed_matrix[i, :] = vector.T
    
cosine_sim_emb = cosine_similarity(embed_matrix, embed_matrix)
del embed_matrix
gc.collect()

embed_matrix = np.zeros(shape=(len(df['mean_embbeding_w']), df['mean_embbeding_w'].iloc[0].shape[0]))
for i, vector in enumerate(df['mean_embbeding_w']):
    embed_matrix[i, :] = vector.T
    
cosine_sim_emb_w = cosine_similarity(embed_matrix, embed_matrix)
del embed_matrix
gc.collect()

In [72]:
# universalsentence encoder embeddings DAN 512 
tf.logging.set_verbosity(tf.logging.ERROR)
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
embeddings = embed(df['description'].values)

In [23]:
init = tf.global_variables_initializer()
table_init = tf.tables_initializer()

with tf.Session() as sess:
    sess.run([init, table_init])
    embeddings_dan = sess.run(embeddings)

cosine_sim_emb_dan = cosine_similarity(embeddings_dan, embeddings_dan)
del embeddings_dan
gc.collect()

CPU times: user 40.4 s, sys: 4.62 s, total: 45 s
Wall time: 44 s


In [27]:
def recom_based_on_description(title=None, index=None):
    if title:
        idx = indices[title]
    else:
        idx = index

    result = None 
    l = ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']
    for i, sim in enumerate([cosine_sim_tfidf, cosine_sim_count, cosine_sim_emb, cosine_sim_emb_w, 
                             cosine_sim_emb_dan]):
        sim_scores = list(enumerate(sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:]
        movie_indices = [i[0] for i in sim_scores]
        movie_scores = [i[1] for i in sim_scores]

        if result is None:
            result = df.loc[movie_indices].copy()
            result['sim_%s' % l[i]] = movie_scores
        else:
            result2 = df.loc[movie_indices, ['index']].copy()
            result2['sim_%s' % l[i]] = movie_scores
            result = result.merge(result2, on='index')
    del result2
    
    return result

def show_top_n(result, n=5):
    l = ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']
    return pd.concat([result[['title', 'sim_%s' % col]]\
                      .sort_values(by='sim_%s' % col, ascending=False)\
                      .head(n).reset_index().rename(columns={'title': 'title_%s' % col})['title_%s' % col] 
                      for col in l], axis=1)

In [28]:
result = recom_based_on_description('Pulp Fiction')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Vares: Private Eye,How to Make Money Selling Drugs,Pink Flamingos,Kill!,Tango & Cash
1,Shanghai Kiss,Wrong Turn at Tahoe,Kill!,Pink Flamingos,Le Cercle Rouge
2,Artists and Models,Chiko,The Maltese Falcon,The Maltese Falcon,The Outsider
3,The Confession,Go,Mystery Team,Mr. & Mrs. Smith,Dillinger
4,On the Outs,The French Connection,Cast a Deadly Spell,Cast a Deadly Spell,Bugsy


In [29]:
result = recom_based_on_description('Alien')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Moonraker,The War in Space,The Mysterians,The Mysterians,Forbidden Planet
1,Star Wreck: In the Pirkinning,The War in Space,It! The Terror from Beyond Space,Space Battleship Yamato,It! The Terror from Beyond Space
2,Aliens,Aliens,Space Battleship Yamato,Forbidden Planet,Aliens
3,Rogue One: A Star Wars Story,Star Wreck: In the Pirkinning,Forbidden Planet,Dead Space: Downfall,Alien³
4,Alien³,Moonraker,Dead Space: Downfall,It! The Terror from Beyond Space,Lifeforce


In [30]:
result = recom_based_on_description('Jaws')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Jaws 2,Jaws 2,The Land Unknown,The Land Unknown,Jaws 2
1,The Last Shark,The Last Shark,Jaws 2,Last Cannibal World,The Last Shark
2,The Shallows,The Shallows,The Lone Ranger,The Lone Ranger,The Shallows
3,Shark Tale,Shark Tale,Tremors 2: Aftershocks,Amazonia,The Cove
4,Sharkwater,Shark Kill,Amazonia,Godzilla,Shark Kill


In [31]:
result = recom_based_on_description('Looper')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,The Time Machine,The Time Machine,Homefront,Homefront,The Dead Zone
1,Joe Strummer: The Future Is Unwritten,Predestination,The Two Jakes,The Two Jakes,Thieves Like Us
2,Terminator Genisys,Back in Time,Resolution,Resolution,Repo Man
3,Back in Time,Joe Strummer: The Future Is Unwritten,The Italian Connection,Overheard 3,The Collection
4,Kim Possible: A Sitch in Time,Terminator Genisys,Leon: The Professional,Tango & Cash,Wrong Turn at Tahoe


### There isn't clear winner between embeding.

In [33]:
def get_n_recommendations(title=None, index=None, n=10,
                         weight={'w_wr': 0.05, 'genres': 1, 'cast': 0.35, 'cast_gender': 0.05, 'director': 0.5,
                                 'w_lang': 0.2, 'tfidf': 10, 'count': 10, 'emb': 10, 'dan': 10}):
    if title:
        idx = indices[title]
    else:
        idx = index
    
    # 
    result = recom_based_on_description(title=title, index=index)
    scaler = MinMaxScaler((0, 1))
    for col in ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']:
        result['sim_%s' % col] = scaler.fit_transform(result['sim_%s' % col].values.reshape(-1, 1))
        result['sim_%s' % col] -= result['sim_%s' % col].mean()
    
    original = df.loc[idx].copy()
    result['wr'] = result['wr'] * weight['w_wr'] + result['sim_tfidf'] * weight['tfidf'] +\
    result['sim_count'] * weight['count'] + result['sim_emb'] * weight['emb'] +\
    result['sim_emb_w'] * weight['emb'] + result['sim_emb_dan'] * weight['dan']
    
    
    for col in ['genres', 'cast', 'cast_gender']:
        list_col = ['{}_{}'.format(col, i) for i in range(1, 4)]
        # retrieve unique values from columns of movie we seek recommendation
        uniq_values = set(original[list_col])
        # remove Nan
        if np.nan in uniq_values:
            uniq_values.remove(np.nan)
        
        result['wr'] += result[list_col].isin(uniq_values).astype('int').sum(axis=1) * weight[col]
        
    # retrieve unique values from columns of movie we seek recommendation
    uniq_values = original['crew_Director']
    # remove Nan
    if uniq_values is not np.nan:
        result['wr'] += result['crew_Director'].isin([uniq_values]).astype('int') * weight['director']
        
    # retrieve unique values from columns of movie we seek recommendation
    uniq_values = original['original_language']
    # remove Nan
    if uniq_values is not np.nan:
        result['wr'] += result['original_language'].isin([uniq_values]).astype('int') * weight['w_lang']  
    
    return result.sort_values('wr', ascending=False)

In [34]:
default_weight = {'w_wr': 0.05, 'genres': 1, 'cast': 0.1, 'cast_gender': 0.25, 'director': 0.25,
                                 'w_lang': 0.2, 'tfidf': 5, 'count': 5, 'emb': 5, 'dan': 10}

In [35]:
get_n_recommendations('A Fistful of Dollars', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
4,Bandolero!,15.32241
2,For a Few Dollars More,15.239547
0,The Return of Ringo,14.144288
12,"The Good, the Bad and the Ugly",13.748889
20,One Man's Hero,13.090968
6,Johnny Yuma,13.017887
5,The Young Land,12.933311
1,¡Three Amigos!,12.637672
9,A Bullet for the General,12.212913
7,A Pistol for Ringo,12.211923


In [36]:
get_n_recommendations('Mad Max', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
3,Mad Max 2: The Road Warrior,16.939692
0,Mad Foxes,15.457569
4,Any Which Way You Can,13.53238
2,The Wild One,12.978371
9,Homefront,12.8909
13,Death Race,12.501351
5,Mad Max Beyond Thunderdome,12.236307
18,Mad Max: Fury Road,12.053067
1,Hells Angels Forever,12.030961
10,The Glory Stompers,11.776829


In [37]:
get_n_recommendations('Akira', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
7,Tokyo Tribe,14.183776
3,Mad Max,13.363085
12,Neo Tokyo,13.221195
0,The Wild One,13.134999
1,Tetsuo: The Iron Man,12.545131
5,Hells Angels Forever,11.296611
6,The Warriors,10.788384
132,Appleseed,10.768853
4,Mad Foxes,10.638804
8,The Glory Stompers,10.578152


In [38]:
get_n_recommendations('Die Hard', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
0,Die Hard 2,18.226368
1,Die Hard: With a Vengeance,15.609363
2,Live Free or Die Hard,14.520618
7,Face/Off,11.433297
42,Spooks: The Greater Good,10.832713
4,American Hostage,10.431141
104,Sleepless,10.141519
37,Air Force One,10.018218
571,The Fugitive,9.874324
20,Ransom,9.804224


In [39]:
df.loc[df['title'] == 'The Thing', ['index', 'cast_1', 'cast_2', 'cast_3']]

Unnamed: 0,index,cast_1,cast_2,cast_3
239,2156,Kurt Russell,Keith David,Wilford Brimley
14517,17835,Mary Elizabeth Winstead,Joel Edgerton,Ulrich Thomsen


In [40]:
get_n_recommendations(index=124, weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
3,Boy & the World,13.950384
1,Shipwrecked,12.519015
7,The Grass Harp,11.142716
5,Extremely Loud & Incredibly Close,11.128871
0,Air Mater,10.948423
6,A Summer at Grandpa's,10.7293
37,Pastoral: To Die in the Country,10.660008
16,The Red Balloon,10.634477
36,Mon oncle Antoine,10.438491
46,Phantom Boy,10.120918


In [41]:
get_n_recommendations('Pulp Fiction', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
8,The French Connection,13.759317
0,Vares: Private Eye,13.38133
11,Wrong Turn at Tahoe,12.817098
29,Fresh,11.424177
53,Killer Joe,11.400755
22,Trainspotting,11.080565
7,A Rage in Harlem,10.896285
70,The Outsider,10.732918
25,De Dominee,10.670628
16,Go,10.594491


In [42]:
get_n_recommendations('The Silence of the Lambs', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
0,Red Dragon,18.295182
2,Manhunter,17.032758
1,Hannibal,16.908767
3,Hannibal Rising,16.778686
4,Mindhunters,15.94887
7,Solace,15.191401
9,Faces in the Crowd,14.04336
11,Copycat,13.946152
6,Double Vision,13.496433
5,Switchback,13.169903


In [43]:
get_n_recommendations('Fargo', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
12,Breakdown,14.618338
1,The Disappearance of Alice Creed,14.488764
8,Room of Death,13.85173
103,Hell or High Water,13.330868
97,Charley Varrick,13.058062
30,Mean Dreams,12.869095
133,Appaloosa,12.807264
27,Gone in 60 Seconds,12.590495
58,U Turn,12.561524
7,The Mighty Quinn,12.290572


In [44]:
get_n_recommendations('Looper', weight=default_weight)[['title', 'wr']].head(10)

Unnamed: 0,title,wr
2,Terminator Genisys,14.721505
6,Minority Report,14.198851
5,Predestination,14.05685
0,The Time Machine,13.494719
36,In Time,12.138368
77,Trancers 3: Deth Lives,12.07417
4,Kim Possible: A Sitch in Time,11.444245
20,A Scanner Darkly,11.352135
48,Dredd,11.239949
8,X-Men: Days of Future Past,11.227212


### Recommendation seems to be reasonable.