In [None]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import nltk
nltk.download('averaged_perceptron_tagger')

In [2]:
dataframe = pd.read_csv("wiki_movie_plots_deduped.csv")

In [3]:
print("Number of records:")
print(len(dataframe))

Number of records:
34886


In [4]:
list(dataframe.columns.values)

['Release Year',
 'Title',
 'Origin/Ethnicity',
 'Director',
 'Cast',
 'Genre',
 'Wiki Page',
 'Plot']

In [5]:
dataframe.columns = ['Release_Year',
 'Title',
 'Origin_Ethnicity',
 'Director',
 'Cast',
 'Genre',
 'Wiki_Page',
 'Plot']

In [6]:
max(dataframe['Release_Year'])

2017

In [7]:
dataframe['Origin_Ethnicity'].unique()

array(['American', 'Australian', 'Bangladeshi', 'British', 'Canadian',
       'Chinese', 'Egyptian', 'Hong Kong', 'Filipino', 'Assamese',
       'Bengali', 'Bollywood', 'Kannada', 'Malayalam', 'Marathi',
       'Punjabi', 'Tamil', 'Telugu', 'Japanese', 'Malaysian', 'Maldivian',
       'Russian', 'South_Korean', 'Turkish'], dtype=object)

In [8]:
# get movies from 1970 and onwards
updated_df = dataframe.query('Release_Year >= 1970')

# get movies from America, Australia, Britain and Canada
updated_df = updated_df.query('Origin_Ethnicity == "American" | Origin_Ethnicity == "Australian"' + 
                             '| Origin_Ethnicity == "British" | Origin_Ethnicity == "Canadian"')

In [9]:
updated_df = updated_df.reset_index(drop=True)

In [10]:
updated_df['Origin_Ethnicity'].unique()

array(['American', 'Australian', 'British', 'Canadian'], dtype=object)

In [11]:
# 1. get list of all plots
list_of_plots = updated_df['Plot'].tolist()

In [12]:
print("Number of movie plots:")
print(len(list_of_plots))

Number of movie plots:
11866


In [14]:
#tagged_sentence = nltk.tag.pos_tag(text1.split())
#edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
#new_sentence = ' '.join(edited_sentence)

In [15]:
# include proper nouns in stopwords, as arguably these are not needed when comparing similarities of
# movie plots. Perhaps cities are useful, but names such as Kevin are not good because their inclusion
# means that e.g. other movies than Home Alone with a character named Kevin will be deemed similar to
# Home Alone, even though the rest of the plot may be completely different.
updated_list_of_plots = []
for movie_plot_index in range(0, len(list_of_plots)):
    
    if(movie_plot_index%500 == 0):
        print(movie_plot_index)
    tagged_text = nltk.tag.pos_tag(list_of_plots[movie_plot_index].split())
    edited_text = [word for word,tag in tagged_text if tag != 'NNP' and tag != 'NNPS']
    
    # update each movie plot to have proper nouns removed
    edited_text = ' '.join(edited_text)
    updated_list_of_plots.append(edited_text)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [16]:
# create list of stopwords from the English language
stop_words = list(ENGLISH_STOP_WORDS)
# create instance of tfidf vectoriser class based on list of stopwords
vectoriser = TfidfVectorizer(stop_words=stop_words)
# call fit function to tokenise (learn idf) and build vocabulary from corpus
vectoriser.fit(updated_list_of_plots)

# print vocabulary
# print(vectorizer.vocabulary_)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['whereas', 'however', 'eleven', 'beyond', 'next', 'via', 'had', 'up', 'whom', 'it', 'their', 'themselves', 'sometime', 'fifteen', 'serious', 'her', 'whereafter', 'became', 'hereby', 'whenever', 'have', 'here', 'its', 'become', 'whatever', 'made', 'two', 'still', 'many', 'eg', 'seem', 'te...on', 'either', 'much', 'under', 'always', 'side', 'nowhere', 'give', 'before', 'nothing', 'hundred'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# vectorise each plot based on vectoriser made on movie plot set

list_of_vectors = []

for plot in updated_list_of_plots:
    # vectorise text based on vectoriser created earlier
    vector = vectoriser.transform([plot])
    list_of_vectors.append(vector)
    # summarize vectorised vector
    # print(vector.shape)
    # print(vector.toarray())

In [18]:
len(list_of_vectors)

11866

In [19]:
# append list of vectors to dataframe
updated_df['vector'] = list_of_vectors

In [20]:
# returns the cosine similarity of two vectorised texts
def compare_texts(vector1, vector2):
    
    cosineSimilarities = cosine_similarity(vector1, vector2)
    return cosineSimilarities

In [21]:
# returns a dataframe with 10 movies most similar to the input movie from the given dataframe
# NB: update function to have number of movies to return as an argument 
def get_top_n_similar_texts(n, vector, dataframe):
    
    list_of_similarities = []
    for index, row in dataframe.iterrows():
        similarity_measure = compare_texts(row['vector'], vector)
        list_of_similarities.append(similarity_measure)
        
    # append list of similarities to dataframe
    dataframe['similarity'] = list_of_similarities
    
    # sort dataframe based on similarity measure in descending order
    dataframe = dataframe.sort_values('similarity', ascending=False)
    
    # get top N rows of dataframe
    dataframe = dataframe.head(n)
    
    return dataframe

In [68]:
# Select movie here
updated_df.query('Title == "Home Alone"')

Unnamed: 0,Release_Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki_Page,Plot,vector
2913,1990,Home Alone,American,Chris Columbus,"Macaulay Culkin, Joe Pesci, Daniel Stern, John...","comedy, family",https://en.wikipedia.org/wiki/Home_Alone,The McCallister family is preparing to spend C...,"(0, 45985)\t0.06866099428996186\n (0, 45892..."


In [69]:
vector_index = 2913
vector_to_compare = updated_df['vector'].iloc[vector_index]
df_copy = updated_df.copy()
similarity_dataframe = get_top_n_similar_texts(20, vector_to_compare, df_copy)

In [70]:
updated_list_of_plots[2913]

'The family is preparing to spend in gathering at and home outside of on the night before their departure. and youngest son, eight-year-old is being ridiculed by his siblings and cousins. fight with his older brother, results in getting sent to the third floor of the house for punishment, where he wishes that his family would disappear. During the night, heavy winds cause damage to power lines, which causes a temporary power outage and resets the alarm clocks, causing the entire family to oversleep. In the confusion and rush to get to the airport, is accidentally left behind. wakes up to find the house empty and, thinking his wish has come true, is overjoyed with his newfound freedom. soon becomes frightened by his next door neighbor, who is rumored to have murdered his family with a snow shovel in 1958; as well as the and a pair of burglars who have been breaking into other vacant houses in the neighborhood and have targeted the house. tricks the pair into thinking his whole family is

In [71]:
similarity_dataframe

Unnamed: 0,Release_Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki_Page,Plot,vector,similarity
2913,1990,Home Alone,American,Chris Columbus,"Macaulay Culkin, Joe Pesci, Daniel Stern, John...","comedy, family",https://en.wikipedia.org/wiki/Home_Alone,The McCallister family is preparing to spend C...,"(0, 45985)\t0.06866099428996186\n (0, 45892...",[[1.0000000000000002]]
3294,1992,Home Alone 2: Lost in New York,American,Chris Columbus,"Macaulay Culkin, Joe Pesci, Daniel Stern",comedy,https://en.wikipedia.org/wiki/Home_Alone_2:_Lo...,"In Chicago, the McCallister family is preparin...","(0, 45985)\t0.0501963397889804\n (0, 45527)...",[[0.20610985168003237]]
5740,2003,The Fighting Temptations,American,Jonathan Lynn,"Cuba Gooding, Jr., Beyoncé Knowles","musical, comedy",https://en.wikipedia.org/wiki/The_Fighting_Tem...,"In the year 1980, a young boy named Darrin Hil...","(0, 45982)\t0.014287647406734827\n (0, 4590...",[[0.1900843644244443]]
2318,1987,Sorority House Massacre,American,Carol Frank,"Angela O'Neill, Wendy Martel",slasher,https://en.wikipedia.org/wiki/Sorority_House_M...,"When Beth (Angela O'Neill) is a little girl, h...","(0, 45983)\t0.0497262938698472\n (0, 45901)...",[[0.18102913784163527]]
10707,2012,Song for Marion,British,Director: Paul Andrew Williams,Director: Paul Andrew Williams\r\nCast: Terenc...,unknown,https://en.wikipedia.org/wiki/Song_for_Marion,"Arthur Harris is the grumpy husband of Marion,...","(0, 45982)\t0.024556378539595948\n (0, 4493...",[[0.1760573768701867]]
3055,1991,The Addams Family,American,Barry Sonnenfeld,"Anjelica Huston, Raúl Juliá, Christopher Lloyd...",comedy,https://en.wikipedia.org/wiki/The_Addams_Famil...,Gomez Addams laments the 25-year absence of hi...,"(0, 45901)\t0.026003663800794104\n (0, 4589...",[[0.17477861666273023]]
7794,2012,Joyful Noise,American,Todd Graff,"Queen Latifah, Dolly Parton, Keke Palmer, Jere...","comedy, musical",https://en.wikipedia.org/wiki/Joyful_Noise_(film),After the untimely death of a small-town churc...,"(0, 45982)\t0.01572768581884033\n (0, 45901...",[[0.1732335085322568]]
8461,2015,Monkey Kingdom,American,Mark Linfield Alastair Fothergill,Tina Fey,documentary,https://en.wikipedia.org/wiki/Monkey_Kingdom,Maya is a toque macaque whose world is changed...,"(0, 45642)\t0.07970125257120139\n (0, 45453...",[[0.1727374855569751]]
6082,2005,The Amityville Horror,American,Andrew Douglas,"Ryan Reynolds, Melissa George, Jesse James, Ji...",horror,https://en.wikipedia.org/wiki/The_Amityville_H...,"In 1974, Ronald DeFeo Jr. murdered his family ...","(0, 45892)\t0.03297264626786979\n (0, 44932...",[[0.17104718691467288]]
6495,2006,Monster House,American,Gil Kenan,"Mitchel Musso, Sam Lerner, Spencer Locke, Stev...",animation,https://en.wikipedia.org/wiki/Monster_House_(f...,The parents of twelve-year-old DJ Walters leav...,"(0, 45982)\t0.02431132463363516\n (0, 45901...",[[0.1710384947830055]]


In [None]:
vector_index = 89
vector_to_test = updated_df['vector'].iloc[vector_index]
max_cosine = 0.0
new_index = 0

for index, row in updated_df.iterrows():
    similarity = compare_texts(row['vector'], vector_to_test)
    if index == vector_index:
        continue
    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
new_index

In [None]:
updated_df.iloc[new_index]

In [None]:
# custom vector
custom_plot = "Child is left home alone during Christmas. Thieves break into his house as his parents are not home and he has to defend himself."
vector_custom = vectoriser.transform([custom_plot])

In [None]:
max_cosine = 0.0
new_index = 0

for index, row in updated_df.iterrows():
    similarity = compare_texts(row['vector'], vector_custom)
    
    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
max_cosine

In [None]:
updated_df.iloc[new_index]

In [None]:
# 2. vectorise each text using tfidf and attach list of vectorised texts to dataframe
# 3. input a given string (own plot) and vectorise using tfidf
# 3. use a for loop to go through each text and get cosine similarity of given string to each text in the dataframe.
# Return the most similar plot and its movie details