In [52]:
import numpy as np
import pandas as pd
import requests

from similarity import calculate_sentence_embedding
from embeddings import Embeddings

In [53]:
def get_wiki_text(movie):
    # retrieve a list of all words in a player's wikipedia page
    try:
        # query wikipedia api
        response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'format': 'json',
                'titles': movie,
                'prop': 'extracts',
                'explaintext': True,
            }
        ).json()
        page = next(iter(response['query']['pages'].values()))
        text = page['extract']
        
        # eliminate punctuation
        for p in ['.', ',', '!', '?', '=', '(', ')', '"', '"', ':', ';', '\n']:
            text = text.replace(p, ' ')
        
        # get page vocabulary
        return text
    
    except:
        # return empty list if failed to find player
        return None

In [54]:
def apply_to_row(row):
    try:
        result = get_wiki_text(row.Title)
        # if there are multiple wiki pages of the same name or the page found is not a film,
        # try adding 'film' identifier
        if 'may refer to' in result or 'may more specifically refer to' in result or 'film' not in result[:500]:
            result = get_wiki_text(row.Title + ' (film)')
        if not result:
            result =  get_wiki_text(row.Title + ' (' + str(row.Year) + ' American film)')
        if not result:
            return get_wiki_text(row.Title + ' (' + str(row.Year) + ' film)')
        return result
    except:
        return None

In [55]:
movies_df = pd.read_csv('IMDB-Movie-Data.csv')
movies_df = movies_df[movies_df.Title != "Don't Fuck in the Woods"][:200]

In [56]:
movies_df.loc[movies_df.Title == 'Bahubali: The Beginning', 'Title'] = 'Baahubali: The Beginning'
movies_df.loc[movies_df.Title == '5/25/1977', 'Title'] = '5-25-77'
movies_df = movies_df[movies_df.Title != "Don't Fuck in the Woods"]
movies_df.loc[movies_df.Title == 'Jason Bourne', 'Title'] = 'Jason Bourne (film)'
movies_df.loc[movies_df.Title == 'Paris pieds nus', 'Title'] = 'Lost in Paris'
movies_df.loc[movies_df.Title == 'Star Wars: Episode VII - The Force Awakens'] = 'Star Wars: The Force Awakens'
movies_df.loc[movies_df.Title == 'Furious Seven'] = 'Furious 7'
movies_df.loc[movies_df.Title == 'Kimi no na wa'] = 'Your Name'

In [57]:
# get wiki text for movies
movies_df['wiki_text'] = movies_df.apply(apply_to_row, axis=1)
movies_df.dropna(subset=['wiki_text'], inplace=True)
movies_df.reset_index(inplace=True, drop=True)

In [58]:
# add column for length of text (in words)
movies_df['len_text'] = movies_df.wiki_text.apply(len)

In [59]:
movies_df.to_csv('movies_df.csv', index=False)

In [61]:
embeddings = Embeddings()

def get_all_similarities(m1):
    # calculates similarities between the given movie m1 and all movies in movies_df
    v1 = calculate_sentence_embedding(embeddings, m1, weighted=True)
    sims = []
    for m2 in movies_df.wiki_text:
        v2 = calculate_sentence_embedding(embeddings, m2, weighted=True)
        sim = embeddings.cosine_similarity(v1, v2)
        sims.append(sim)
    return sims

In [62]:
# put the similarities in a 2D matrix,
# where similarity[i, j] is the similarity score between the ith and jth movies in movies_df
similarities = np.stack(movies_df.wiki_text.apply(get_all_similarities).values)
similarities

array([[1.        , 0.99430656, 0.99122393, ..., 0.98470331, 0.97895589,
        0.99101794],
       [0.99430656, 1.        , 0.98798524, ..., 0.98777406, 0.97287752,
        0.99168435],
       [0.99122393, 0.98798524, 1.        , ..., 0.98511845, 0.98860963,
        0.98682302],
       ...,
       [0.98470331, 0.98777406, 0.98511845, ..., 1.        , 0.97268625,
        0.98078777],
       [0.97895589, 0.97287752, 0.98860963, ..., 0.97268625, 1.        ,
        0.97778029],
       [0.99101794, 0.99168435, 0.98682302, ..., 0.98078777, 0.97778029,
        1.        ]])

In [63]:
np.savetxt('similarities.csv', similarities, delimiter=',')

In [64]:
similarities.shape

(190, 190)

In [65]:
def get_similarity(m1, m2):
    # get similarity score between the given movies m1 and m2, using the similarites matrix
    return similarities[movies_df.index[movies_df.Title == m1][0],
                        movies_df.index[movies_df.Title == m2][0]]
get_similarity('The Dark Knight', 'Inception')

0.9928822029975909

In [66]:
def get_most_similar(movie, n=10):
    # return the n most similar movies to the given movie
    if movie not in movies_df.Title.values:
        return 'Movie not found'
    
    movie_index = movies_df.index[movies_df.Title == movie][0]
    movie_sims = similarities[movie_index]
    ind = movie_sims.argsort()[-(n+1):-1][::-1]
    return pd.DataFrame({
        'movies': movies_df.Title.values[ind],
        'similarity': movie_sims[ind]
    })

In [None]:
get_most_similar('Ant-Man')

Unnamed: 0,movies,similarity
0,Captain America: Civil War,0.997619
1,Deadpool,0.997459
2,Tomorrowland,0.997294
3,The Avengers,0.997281
4,Guardians of the Galaxy,0.997123
5,Watchmen,0.996967
6,Captain America: The First Avenger,0.996645
7,Avengers: Age of Ultron,0.996416
8,The Dark Knight Rises,0.995864
9,Divergent,0.995606
