In [1]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
import tensorflow_hub as hub

DATA_PATH = './data'

  return f(*args, **kwds)


In [2]:
os.listdir(DATA_PATH)

['glove.twitter.27B.100d.txt',
 'keywords.csv',
 'links.csv',
 'glove.6B.50d.txt',
 'clean.csv',
 'ratings.csv',
 'glove.twitter.27B.50d.txt',
 'glove.twitter.27B.25d.txt',
 'glove.twitter.27B.200d.txt',
 'crawl-300d-2M.vec',
 'credits.csv',
 'ratings.csv.bin',
 'ratings_small.csv',
 'links_small.csv',
 'movies_metadata.csv']

In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, 'clean.csv'))

In [4]:
df.columns

Index(['id', 'budget', 'original_language', 'popularity', 'revenue', 'runtime',
       'title', 'vote_average', 'vote_count', 'year', 'month', 'day_of_week',
       'success', 'country_name', 'franchise_name', 'companie_name',
       'crew_Director', 'crew_Screenplay', 'crew_Producer',
       'crew_Executive Producer', 'crew_Music', 'crew_Director of Photography',
       'cast_1', 'cast_gender_1', 'cast_2', 'cast_gender_2', 'cast_3',
       'cast_gender_3', 'genres_1', 'genres_2', 'genres_3', 'cast_count',
       'crew_count', 'country_count', 'company_count', 'description',
       'keywords'],
      dtype='object')

#### To work with vote_count and vote_average features we will use weighted rating
### $ rating = \frac{vote\_count}{vote\_count\  +\  m}\  vote\_average + \frac{m}{vote\_count\  +\  m}\  M$
#### where M is mean of vote_average for all movies and m is parameter which is control how we relieve on users scores 
* #### If m equal vote_count when rating is average between M and vote_average
* #### If m is big comapred to vote_count when rating is almost M
* #### If m is small comapred to vote_count when rating is almost vote_average

In [5]:
# mean user's score of all movies
M = df['vote_average'].mean()

# take 85 percentile of vote_counts as a reliable amount of votes
m = df['vote_count'].quantile(0.75)

In [6]:
def weighted_rating(x):
    v = x['vote_count']
    return (v / (v+m) * x['vote_average']) + (m / (m+v) * M)

In [7]:
df['wr'] = df.apply(weighted_rating, axis=1)
df = df.sort_values('wr', ascending=False)

In [8]:
# there is no movies with high score and small number of votes, as expected 
df[['title', 'wr', 'vote_average', 'vote_count']].head()

Unnamed: 0,title,wr,vote_average,vote_count
10275,Dilwale Dulhania Le Jayenge,8.926098,9.1,661.0
311,The Shawshank Redemption,8.488081,8.5,8358.0
818,The Godfather,8.48349,8.5,6024.0
40111,Your Name.,8.40607,8.5,1030.0
12444,The Dark Knight,8.292439,8.3,12269.0


In [9]:
# remove bad movies
df = df.loc[df['wr'] >= 5.75]
# scale from 0 to 1
df['wr'] = (df['wr'] - df['wr'].mean()) / (df['wr'].max() - df['wr'].min())
# number of movie left
df.shape

(19487, 38)

In [10]:
# drop na
df = df.loc[df['crew_Director'].notna() \
            & df['cast_1'].notna() & df['cast_2'].notna() & df['cast_3'].notna() \
            & df['cast_gender_1'].notna() & df['cast_gender_2'].notna() & df['cast_gender_3'].notna()
            & df['genres_1'].notna()]

# add key words from keywords.csv file
df['description'] = df['description'] + df['keywords'].fillna('')

In [11]:
# create Series
df = df.reset_index()
titles = df['title']
indices = pd.Series(df.index, index=df['title'])

In [12]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.85)
tfidf_matrix = tfidf.fit_transform(df['description'])
delattr(tfidf, 'stop_words_')
# tf-idf dict
dictionary = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

cosine_sim_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)
del tfidf_matrix, tfidf
gc.collect()

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.85)
count_matrix = vectorizer.fit_transform(df['description'])
delattr(vectorizer, 'stop_words_')

cosine_sim_count = cosine_similarity(count_matrix, count_matrix)
del count_matrix, vectorizer
gc.collect();

In [13]:
# all of them give pretty the same result
[file for file in os.listdir('data') if file.startswith('glove') or file.startswith('crawl')]

['glove.twitter.27B.100d.txt',
 'glove.6B.50d.txt',
 'glove.twitter.27B.50d.txt',
 'glove.twitter.27B.25d.txt',
 'glove.twitter.27B.200d.txt',
 'crawl-300d-2M.vec']

In [14]:
emb_name = 'glove.twitter.27B.50d.txt'

# all embedding
embeddings_index = {}
# embedding multiplied by tf-idf weight
embeddings_index_w = {}
with open(os.path.join(DATA_PATH, emb_name), encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        
        weight = dictionary.get(word)
        if weight:
            embeddings_index_w[word] = np.asarray(values[1:], dtype='float32') * weight

In [15]:
def find_mean_embbeding(x, emb):
    mean_emb = []
    for word in x.split(' '):
        e = emb.get(word)
        if e is not None:
            mean_emb.append(e)
    return sum(mean_emb) / len(mean_emb) if len(mean_emb) else embeddings_index['none']

In [16]:
df['mean_embbeding'] = df['description'].apply(find_mean_embbeding, emb=embeddings_index)
df['mean_embbeding_w'] = df['description'].apply(find_mean_embbeding, emb=embeddings_index_w)

del embeddings_index, embeddings_index_w
gc.collect();

In [17]:
embed_matrix = np.zeros(shape=(len(df['mean_embbeding']), df['mean_embbeding'].iloc[0].shape[0]))
for i, vector in enumerate(df['mean_embbeding']):
    embed_matrix[i, :] = vector.T
    
cosine_sim_emb = cosine_similarity(embed_matrix, embed_matrix)
del embed_matrix
gc.collect()

embed_matrix = np.zeros(shape=(len(df['mean_embbeding_w']), df['mean_embbeding_w'].iloc[0].shape[0]))
for i, vector in enumerate(df['mean_embbeding_w']):
    embed_matrix[i, :] = vector.T
    
cosine_sim_emb_w = cosine_similarity(embed_matrix, embed_matrix)
del embed_matrix
gc.collect();

In [18]:
# universalsentence encoder embeddings DAN 512 
tf.logging.set_verbosity(tf.logging.ERROR)
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
embeddings = embed(df['description'].values)

In [19]:
init = tf.global_variables_initializer()
table_init = tf.tables_initializer()

with tf.Session() as sess:
    sess.run([init, table_init])
    embeddings_dan = sess.run(embeddings)

cosine_sim_emb_dan = cosine_similarity(embeddings_dan, embeddings_dan)
del embeddings_dan
gc.collect();

In [20]:
def recom_based_on_description(title=None, index=None):
    if title:
        idx = indices[title]
    else:
        idx = index

    result = None 
    l = ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']
    for i, sim in enumerate([cosine_sim_tfidf, cosine_sim_count, cosine_sim_emb, cosine_sim_emb_w, 
                             cosine_sim_emb_dan]):
        sim_scores = list(enumerate(sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:]
        movie_indices = [i[0] for i in sim_scores]
        movie_scores = [i[1] for i in sim_scores]

        if result is None:
            result = df.loc[movie_indices].copy()
            result['sim_%s' % l[i]] = movie_scores
        else:
            result2 = df.loc[movie_indices, ['index']].copy()
            result2['sim_%s' % l[i]] = movie_scores
            result = result.merge(result2, on='index')
    del result2
    
    return result

def show_top_n(result, n=5):
    l = ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']
    return pd.concat([result[['title', 'sim_%s' % col]]\
                      .sort_values(by='sim_%s' % col, ascending=False)\
                      .head(n).reset_index().rename(columns={'title': 'title_%s' % col})['title_%s' % col] 
                      for col in l], axis=1)

In [21]:
result = recom_based_on_description('Pulp Fiction')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Vares: Private Eye,How to Make Money Selling Drugs,Pink Flamingos,Kill!,Tango & Cash
1,Artists and Models,Wrong Turn at Tahoe,Kill!,Pink Flamingos,Le Cercle Rouge
2,The Confession,Chiko,The Maltese Falcon,The Maltese Falcon,The Outsider
3,How to Make Money Selling Drugs,Go,Mystery Team,Cast a Deadly Spell,Dillinger
4,The Wackness,The French Connection,Cast a Deadly Spell,Mr. & Mrs. Smith,Bugsy


In [22]:
result = recom_based_on_description('Alien')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Star Wreck: In the Pirkinning,Aliens,It! The Terror from Beyond Space,Space Battleship Yamato,Forbidden Planet
1,Aliens,Star Wreck: In the Pirkinning,Space Battleship Yamato,Forbidden Planet,It! The Terror from Beyond Space
2,Moonraker,Moonraker,Forbidden Planet,Dead Space: Downfall,Aliens
3,Rogue One: A Star Wars Story,Rogue One: A Star Wars Story,Dead Space: Downfall,It! The Terror from Beyond Space,Alien³
4,Passengers,Alien³,Oblivion,Waterworld,Lifeforce


In [23]:
result = recom_based_on_description('Jaws')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,Jaws 2,Jaws 2,The Land Unknown,The Land Unknown,Jaws 2
1,The Last Shark,The Last Shark,Jaws 2,Last Cannibal World,The Last Shark
2,The Shallows,The Shallows,Tremors 2: Aftershocks,Amazonia,The Shallows
3,Shark Tale,Shark Tale,Amazonia,Godzilla,The Cove
4,Sharkwater,Shark Kill,Last Cannibal World,Mothra,Shark Kill


In [24]:
result = recom_based_on_description('Looper')
show_top_n(result)

Unnamed: 0,title_tfidf,title_count,title_emb,title_emb_w,title_emb_dan
0,The Time Machine,The Time Machine,Homefront,Homefront,The Dead Zone
1,Joe Strummer: The Future Is Unwritten,Predestination,The Two Jakes,The Two Jakes,Thieves Like Us
2,Terminator Genisys,Back in Time,Resolution,Resolution,Repo Man
3,Back in Time,Terminator Genisys,The Italian Connection,Overheard 3,The Collection
4,Kim Possible: A Sitch in Time,Joe Strummer: The Future Is Unwritten,Leon: The Professional,Tango & Cash,Wrong Turn at Tahoe


### There isn't clear winner between embeding.

In [25]:
def get_n_recommendations(title=None, index=None, n=10,
                         weight={'w_wr': 0.05, 'genres': 1, 'cast': 0.35, 'cast_gender': 0.05, 'director': 0.5,
                                 'w_lang': 0.2, 'tfidf': 10, 'count': 10, 'emb': 10, 'dan': 10}):
    if title:
        idx = indices[title]
    else:
        idx = index
    
    # 
    result = recom_based_on_description(title=title, index=index)
    scaler = MinMaxScaler((0, 1))
    for col in ['tfidf', 'count', 'emb', 'emb_w', 'emb_dan']:
        result['sim_%s' % col] = scaler.fit_transform(result['sim_%s' % col].values.reshape(-1, 1))
        result['sim_%s' % col] -= result['sim_%s' % col].mean()
    
    original = df.loc[idx].copy()
    result['wr'] = result['wr'] * weight['w_wr'] + result['sim_tfidf'] * weight['tfidf'] +\
    result['sim_count'] * weight['count'] + result['sim_emb'] * weight['emb'] +\
    result['sim_emb_w'] * weight['emb'] + result['sim_emb_dan'] * weight['dan']
    
    
    for col in ['genres', 'cast', 'cast_gender']:
        list_col = ['{}_{}'.format(col, i) for i in range(1, 4)]
        # retrieve unique values from columns of movie we seek recommendation
        uniq_values = set(original[list_col])
        # remove Nan
        if np.nan in uniq_values:
            uniq_values.remove(np.nan)
        
        result['wr'] += result[list_col].isin(uniq_values).astype('int').sum(axis=1) * weight[col]
        
    # retrieve unique values from columns of movie we seek recommendation
    uniq_values = original['crew_Director']
    # remove Nan
    if uniq_values is not np.nan:
        result['wr'] += result['crew_Director'].isin([uniq_values]).astype('int') * weight['director']
        
    # retrieve unique values from columns of movie we seek recommendation
    uniq_values = original['original_language']
    # remove Nan
    if uniq_values is not np.nan:
        result['wr'] += result['original_language'].isin([uniq_values]).astype('int') * weight['w_lang']  
    
    return result.sort_values('wr', ascending=False)

In [26]:
def save_table(df, title, folder='recommendations'):
    """Save DataFrame df as png image in directory folder with name title"""
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_frame_on(False)

    plt.title(title, size=16)
    tabla = pd.plotting.table(ax, df, loc='center')
    tabla.auto_set_font_size(False)
    tabla.set_fontsize(12)
    tabla.scale(1.2, 2)

    plt.savefig(os.path.join(folder, title + '.png'))
    plt.close()

In [27]:
default_weight = {'w_wr': 0.05, 'genres': 1, 'cast': 0.1, 'cast_gender': 0.25, 'director': 0.25,
                                 'w_lang': 0.2, 'tfidf': 5, 'count': 5, 'emb': 5, 'dan': 10}

### Check and save some recommendation

In [28]:
movie_title = 'A Fistful of Dollars'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,For a Few Dollars More,15.4
1,Bandolero!,15.2
2,The Return of Ringo,14.2
3,"The Good, the Bad and the Ugly",13.8
4,One Man's Hero,13.0
5,¡Three Amigos!,12.6
6,A Bullet for the General,12.2
7,The Five Man Army,12.1
8,Death Rides a Horse,11.4
9,Keoma,11.4


In [29]:
movie_title = 'Mad Max'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Mad Max 2: The Road Warrior,16.8
1,Mad Foxes,15.4
2,Any Which Way You Can,13.4
3,The Wild One,13.0
4,Homefront,13.0
5,Death Race,12.4
6,Mad Max Beyond Thunderdome,12.2
7,Mad Max: Fury Road,12.0
8,The Glory Stompers,11.7
9,Turbo Kid,11.5


In [30]:
movie_title = 'Akira'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Tokyo Tribe,14.1
1,Mad Max,13.1
2,The Wild One,13.1
3,Neo Tokyo,13.0
4,Tetsuo: The Iron Man,12.2
5,Appleseed,10.7
6,The Warriors,10.7
7,Mad Foxes,10.5
8,The Glory Stompers,10.4
9,Any Which Way You Can,10.2


In [31]:
movie_title = 'Die Hard'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Die Hard 2,18.3
1,Die Hard: With a Vengeance,15.8
2,Live Free or Die Hard,14.7
3,Face/Off,11.5
4,American Hostage,11.0
5,Spooks: The Greater Good,10.9
6,Sleepless,10.2
7,Air Force One,10.1
8,The Fugitive,9.9
9,Lethal Weapon,9.9


In [32]:
# there are two movies wit name 'The Thing' so we will use index instead of title
df.loc[df['title'] == 'The Thing', ['index', 'cast_1', 'cast_2', 'cast_3']]

Unnamed: 0,index,cast_1,cast_2,cast_3
154,2156,Kurt Russell,Keith David,Wilford Brimley
9509,17835,Mary Elizabeth Winstead,Joel Edgerton,Ulrich Thomsen


In [33]:
movie_title = 'The Thing'
_temp = get_n_recommendations(index=154, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,The Thing,17.8
1,Alien Nation,16.3
2,Alien Abduction: Incident in Lake County,15.9
3,It! The Terror from Beyond Space,15.9
4,Attraction,15.9
5,The Hidden,15.1
6,Invasion of the Body Snatchers,14.8
7,The Thing from Another World,14.7
8,Aliens,14.5
9,Invasion of the Body Snatchers,14.4


In [34]:
movie_title = 'Pulp Fiction'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,The French Connection,13.7
1,Vares: Private Eye,13.3
2,Wrong Turn at Tahoe,12.9
3,Fresh,11.4
4,Killer Joe,11.3
5,Trainspotting,11.0
6,De Dominee,10.7
7,The Outsider,10.6
8,Go,10.5
9,The Sting,10.5


In [35]:
movie_title = 'The Silence of the Lambs'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Red Dragon,18.2
1,Manhunter,16.9
2,Hannibal,16.7
3,Hannibal Rising,16.7
4,Mindhunters,15.9
5,Solace,15.1
6,Copycat,13.9
7,Faces in the Crowd,13.9
8,Double Vision,13.3
9,The Bird with the Crystal Plumage,12.9


In [36]:
movie_title = 'Fargo'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Breakdown,15.0
1,The Disappearance of Alice Creed,14.9
2,Room of Death,14.2
3,Hell or High Water,13.8
4,Charley Varrick,13.6
5,Appaloosa,13.4
6,Mean Dreams,13.3
7,U Turn,13.1
8,Gone in 60 Seconds,12.8
9,Walking Tall,12.5


In [37]:
movie_title = 'Looper'
_temp = get_n_recommendations(movie_title, weight=default_weight)[['title', 'wr']].head(10).\
reset_index(drop=True).rename(columns={'wr': 'similarity rating'})
_temp['similarity rating'] = np.round(_temp['similarity rating'], 1)
save_table(df=_temp, title=movie_title);
_temp

Unnamed: 0,title,similarity rating
0,Terminator Genisys,14.8
1,Minority Report,14.1
2,Predestination,14.1
3,The Time Machine,13.4
4,In Time,12.1
5,Kim Possible: A Sitch in Time,11.3
6,A Scanner Darkly,11.2
7,Dredd,11.2
8,THX 1138,11.1
9,X-Men: Days of Future Past,11.1


### Recommendation seems to be reasonable.