In [1]:
import numpy as np
import pandas as pd
import requests

from similarity import calculate_sentence_embedding
from embeddings import Embeddings

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bencaterine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_wiki_text(movie, timeout=10):
    """Fetch the plain-text Wikipedia extract for a movie, with punctuation removed.

    Queries the English Wikipedia API for the page titled ``movie``. If the
    page found looks like a disambiguation page ('may refer to:') or never
    mentions the word 'film', the lookup is retried with ' (film)' appended
    to the title.

    Parameters
    ----------
    movie : str
        Wikipedia page title to look up.
    timeout : float, optional
        Seconds to wait for the HTTP request, so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    str or list
        The page text with punctuation characters and newlines replaced by
        spaces. If anything goes wrong (network error, missing page,
        unexpected response shape) an empty list ``[]`` is returned, so
        ``len(result) == 0`` signals failure.
    """
    try:
        # query wikipedia api
        response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'format': 'json',
                'titles': movie,
                'prop': 'extracts',
                'explaintext': True,
            },
            timeout=timeout,
        ).json()
        # take the first entry under 'pages'
        page = next(iter(response['query']['pages'].values()))
        text = page['extract']

        # if there are multiple wiki pages of the same name or the page found is not a film,
        # try adding 'film' identifier
        # NOTE(review): 'film' is searched in the whole page, so a non-film page that
        # merely mentions the word somewhere is accepted as-is.
        if 'may refer to:' in text or 'film' not in text:
            return get_wiki_text(movie + ' (film)', timeout=timeout)

        # eliminate punctuation (each mark becomes a single space)
        for mark in '.,!?=()":;\n':
            text = text.replace(mark, ' ')

        return text

    except Exception:
        # Best-effort lookup: report any ordinary failure as an empty result.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted.
        return []

In [3]:
# titles of the movies to compare
movies = [
    'Man of Steel',
    'Pitch Perfect',
    'Pitch Perfect 2',
    'The Dark Knight Rises',
    'Iron Man',
]

In [4]:
# one-column dataframe of movie titles; the wiki text columns are added later
movies_df = pd.DataFrame(data=movies, columns=['movie'])

In [5]:
# fetch the Wikipedia text for every movie title
movies_df['wiki_text'] = movies_df['movie'].map(get_wiki_text)
# record the length of each text (len() of a string counts characters)
movies_df['len_text'] = movies_df['wiki_text'].map(len)

In [6]:
# display the assembled table (one row per movie title)
movies_df

Unnamed: 0,movie,wiki_text,len_text
0,Man of Steel,Man of Steel is a 2013 superhero film based on...,51868
1,Pitch Perfect,Pitch Perfect is a 2012 American musical comed...,16809
2,Pitch Perfect 2,Pitch Perfect 2 is a 2015 American musical com...,18841
3,The Dark Knight Rises,The Dark Knight Rises is a 2012 superhero film...,53345
4,Iron Man,Iron Man is a superhero appearing in American ...,112856


In [7]:
# shared Embeddings instance: passed to calculate_sentence_embedding and used
# for cosine_similarity inside get_all_similarities below
embeddings = Embeddings()

def get_all_similarities(m1):
    """Score the text ``m1`` against the wiki text of every movie in ``movies_df``.

    ``m1`` is embedded once with ``calculate_sentence_embedding`` (weighted),
    then compared via ``embeddings.cosine_similarity`` with the weighted
    embedding of each entry of ``movies_df.wiki_text``.

    Returns a list of similarity scores in the row order of ``movies_df``.
    """
    reference = calculate_sentence_embedding(embeddings, m1, weighted=True)
    return [
        embeddings.cosine_similarity(
            reference,
            calculate_sentence_embedding(embeddings, other_text, weighted=True),
        )
        for other_text in movies_df.wiki_text
    ]

In [8]:
# build the pairwise similarity matrix: similarities[i, j] is the score between
# the ith and jth movies of movies_df
similarities = np.stack([get_all_similarities(text) for text in movies_df.wiki_text])
similarities

array([[1.        , 0.97944157, 0.98191378, 0.99549933, 0.98214878],
       [0.97944157, 1.        , 0.99750574, 0.982988  , 0.95578871],
       [0.98191378, 0.99750574, 1.        , 0.98583924, 0.95606759],
       [0.99549933, 0.982988  , 0.98583924, 1.        , 0.98052437],
       [0.98214878, 0.95578871, 0.95606759, 0.98052437, 1.        ]])

In [9]:
def get_similarity(m1, m2):
    """Return the precomputed similarity score between movies ``m1`` and ``m2``.

    The titles are located in ``movies_df.movie`` and the score is read from
    the ``similarities`` matrix. Rows and columns of that matrix follow the
    row order of ``movies_df``, so the lookup uses row *positions* rather
    than index labels; the two only coincide for a default RangeIndex.

    Raises
    ------
    IndexError
        If either title does not occur in ``movies_df.movie``.
    """
    # position of the first row whose title matches
    row = np.flatnonzero((movies_df.movie == m1).to_numpy())[0]
    col = np.flatnonzero((movies_df.movie == m2).to_numpy())[0]
    return similarities[row, col]
# example lookup: score between the two Pitch Perfect entries
get_similarity('Pitch Perfect', 'Pitch Perfect 2')

0.9975057435747687