In [190]:
import numpy as np
import pandas as pd
import requests

from similarity import calculate_sentence_embedding
from embeddings import Embeddings

In [191]:
def get_wiki_text(movie, timeout=10):
    """Fetch the plain-text extract of a Wikipedia page and strip punctuation.

    Parameters
    ----------
    movie : str
        Title of the Wikipedia page to look up; sent as the ``titles``
        parameter of the MediaWiki ``query`` action.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so that a
        stalled connection cannot hang the notebook.

    Returns
    -------
    str or None
        The page extract with each listed punctuation character (and each
        newline) replaced by a single space, or ``None`` when the request
        fails or the response carries no extract for the page.
    """
    try:
        # query wikipedia api
        response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'format': 'json',
                'titles': movie,
                'prop': 'extracts',
                'explaintext': True,
            },
            timeout=timeout,
        ).json()
        page = next(iter(response['query']['pages'].values()))
        text = page['extract']
    except (requests.RequestException, ValueError, KeyError, StopIteration):
        # RequestException: network failure or timeout
        # ValueError:       response body was not valid JSON
        # KeyError:         no 'query'/'pages'/'extract' key (e.g. missing page)
        # StopIteration:    'pages' was empty
        return None

    # eliminate punctuation
    for p in ['.', ',', '!', '?', '=', '(', ')', '"', '"', ':', ';', '\n']:
        text = text.replace(p, ' ')

    return text

In [192]:
def apply_to_row(row, fetch=None):
    """Find the Wikipedia text for one movie row, trying several page titles.

    The lookups are attempted in this order, stopping at the first usable one:
    ``Title``, ``Title (film)``, ``Title (<Year> American film)``,
    ``Title (<Year> film)``.

    Parameters
    ----------
    row : object with ``Title`` and ``Year`` attributes
        Typically a row of ``movies_df`` as passed by ``DataFrame.apply``
        with ``axis=1``.
    fetch : callable, optional
        Function mapping a page title to its text (or ``None``/empty on
        failure). Defaults to ``get_wiki_text``; supplying another callable
        lets the fallback chain run without network access.

    Returns
    -------
    str or None
        The text of the first lookup that succeeded, or whatever the last
        lookup returned (``None`` with the default fetcher) if all failed.
    """
    if fetch is None:
        fetch = get_wiki_text

    result = fetch(row.Title)
    # if the lookup failed, there are multiple wiki pages of the same name, or
    # the page found is not a film, try adding 'film' identifier.
    # `not result` must be checked first: a failed lookup yields None, and the
    # substring tests below would raise TypeError on it.
    if (
        not result
        or 'may refer to' in result
        or 'may more specifically refer to' in result
        or 'film' not in result[:500]
    ):
        result = fetch(row.Title + ' (film)')
    if not result:
        result = fetch(row.Title + ' (' + str(row.Year) + ' American film)')
    if not result:
        result = fetch(row.Title + ' (' + str(row.Year) + ' film)')
    return result

In [193]:
# load the IMDB movie table, drop one unwanted title, then keep the first 100 remaining rows
movies_df = (
    pd.read_csv('IMDB-Movie-Data.csv')
    .loc[lambda frame: frame.Title != "Don't Fuck in the Woods"]
    .iloc[:100]
)

In [194]:
# Rename titles so that they match the names of the corresponding Wikipedia pages.
# Only the 'Title' column is rewritten: assigning a string to `movies_df.loc[mask]`
# without a column label overwrites EVERY column of the matching rows
# (including Year, which apply_to_row uses to build fallback page titles).
movies_df = movies_df[movies_df.Title != "Don't Fuck in the Woods"]
movies_df = movies_df.assign(
    Title=movies_df.Title.replace({
        'Bahubali: The Beginning': 'Baahubali: The Beginning',
        '5/25/1977': '5-25-77',
        'Jason Bourne': 'Jason Bourne (film)',
        'Paris pieds nus': 'Lost in Paris',
        'Star Wars: Episode VII - The Force Awakens': 'Star Wars: The Force Awakens',
        'Furious Seven': 'Furious 7',
        'Kimi no na wa': 'Your Name',
    })
)

In [195]:
# look up the wikipedia text of every movie, one row at a time (each row triggers network requests)
movies_df['wiki_text'] = movies_df.apply(apply_to_row, axis='columns')

In [196]:
# preview the fetched text for each movie
movies_df['wiki_text']

0      Guardians of the Galaxy  retroactively referre...
1      Prometheus   prə-MEE-thee-əs  is a 2012 scienc...
2      Split is a 2016 American psychological thrille...
3      Sing is a 2016 American computer-animated juke...
4      Suicide Squad is a 2016 American superhero fil...
                             ...                        
96     Your Name  Japanese  君の名は。  Hepburn  Kimi no N...
97     The Void is a 2016 Canadian Lovecraftian horro...
98     Personal Shopper is a 2016 supernatural psycho...
99     The Departed is a 2006 American crime thriller...
100    Legend is a 2015 biographical crime thriller f...
Name: wiki_text, Length: 100, dtype: object

In [197]:
# add column for length of text (in words).
# Punctuation and newlines were already replaced by spaces, so splitting on
# whitespace yields the words; len() on the raw string would count characters.
# Rows whose lookup failed (None or empty text) get a length of 0.
movies_df['len_text'] = movies_df.wiki_text.apply(
    lambda text: len(text.split()) if text else 0
)

In [199]:
embeddings = Embeddings()

# Memo of text -> embedding vector. Without it every page is re-embedded once
# per pair, i.e. n*n embedding computations for n movies instead of n.
# NOTE(review): assumes calculate_sentence_embedding is deterministic for a
# given text and that cosine_similarity does not modify its arguments - confirm.
_embedding_cache = {}


def _embed(text):
    """Return the weighted sentence embedding of `text`, computing it at most once."""
    if text not in _embedding_cache:
        _embedding_cache[text] = calculate_sentence_embedding(embeddings, text, weighted=True)
    return _embedding_cache[text]


def get_all_similarities(m1):
    """Score one movie text against the text of every movie in movies_df.

    Parameters
    ----------
    m1 : str
        The text to compare (a value of the ``wiki_text`` column, not a title).

    Returns
    -------
    list
        One ``embeddings.cosine_similarity`` score per entry of
        ``movies_df.wiki_text``, in the same order as that column.
    """
    v1 = _embed(m1)
    return [embeddings.cosine_similarity(v1, _embed(m2)) for m2 in movies_df.wiki_text]

In [200]:
# build the pairwise similarity matrix: one row of scores per movie text, stacked in
# movies_df order, so similarities[i, j] compares the ith and jth movies in movies_df
similarities = np.stack([get_all_similarities(text) for text in movies_df.wiki_text])
similarities

array([[1.        , 0.99430632, 0.99122393, ..., 0.97278477, 0.98394104,
        0.9757157 ],
       [0.99430632, 1.        , 0.9879826 , ..., 0.96764464, 0.97219294,
        0.9661313 ],
       [0.99122393, 0.9879826 , 1.        , ..., 0.9818985 , 0.98375723,
        0.9836339 ],
       ...,
       [0.97278477, 0.96764464, 0.9818985 , ..., 1.        , 0.97914038,
        0.97208144],
       [0.98394104, 0.97219294, 0.98375723, ..., 0.97914038, 1.        ,
        0.98574878],
       [0.9757157 , 0.9661313 , 0.9836339 , ..., 0.97208144, 0.98574878,
        1.        ]])

In [203]:
def _movie_position(title):
    """Return the 0-based row position of `title` in movies_df (IndexError if absent)."""
    # Positions, not index labels: `similarities` is a plain numpy matrix built
    # in row order, while movies_df.index keeps the original CSV labels, which
    # no longer match row positions once rows have been filtered out.
    return np.flatnonzero((movies_df.Title == title).values)[0]


def get_similarity(m1, m2):
    """Look up the similarity score between two movies in the similarities matrix.

    Parameters
    ----------
    m1, m2 : str
        Movie titles as they appear in ``movies_df.Title``.

    Returns
    -------
    The entry ``similarities[i, j]`` where i and j are the row positions of
    m1 and m2 in ``movies_df``.

    Raises
    ------
    IndexError
        If either title is not present in ``movies_df``.
    """
    return similarities[_movie_position(m1), _movie_position(m2)]
get_similarity('The Dark Knight', 'Inception')

0.9868995484854834

In [220]:
def get_most_similar(movie, n=10):
    """Return the n movies most similar to the given movie.

    Parameters
    ----------
    movie : str
        Movie title as it appears in ``movies_df.Title``.
    n : int, optional
        Number of similar movies to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``movies`` and ``similarity``, ordered from most
        to least similar and excluding the query movie itself; the string
        ``'Movie not found'`` if the title is not in ``movies_df``.
    """
    matches = np.flatnonzero((movies_df.Title == movie).values)
    if matches.size == 0:
        return 'Movie not found'

    # Use the row POSITION, not the index label: `similarities` is a numpy
    # matrix in row order, whereas movies_df.index labels stop matching
    # positions once rows have been filtered out of the frame.
    movie_pos = matches[0]
    movie_sims = similarities[movie_pos]

    # rank all movies from most to least similar, then drop the query movie
    # explicitly instead of assuming it is always the top-ranked entry
    ranked = movie_sims.argsort()[::-1]
    ind = ranked[ranked != movie_pos][:n]
    return pd.DataFrame({
        'movies': movies_df.Title.values[ind],
        'similarity': movie_sims[ind]
    })

In [237]:
# example: the ten movies closest to 'The Hateful Eight'
get_most_similar('The Hateful Eight', n=10)

Unnamed: 0,movies,similarity
0,Miss Sloane,0.991498
1,Jason Bourne (film),0.991022
2,The Founder,0.990856
3,Live by Night,0.990745
4,Suicide Squad,0.990593
5,Fifty Shades of Grey,0.990471
6,Gold,0.989733
7,Kingsman: The Secret Service,0.98904
8,Star Wars: The Force Awakens,0.988689
9,War Dogs,0.988047
