In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ENGLISH_STOP_WORDS is exposed by sklearn.feature_extraction.text; the separate
# sklearn.feature_extraction.stop_words module path was deprecated and later removed,
# so importing from it fails on current scikit-learn releases
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk

# download the tagger model used by the part-of-speech tagging step further down
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LONAB78\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# load the table of movies and their plots from the CSV file
# (the path is relative to the current working directory)
dataframe = pd.read_csv(filepath_or_buffer="wiki_movie_plots_deduped.csv")

In [5]:
# summarise the loaded table: the row count first, then the column labels
print("Number of records:", len(dataframe), sep="\n")
print("Dataframe columns:", dataframe.columns.tolist(), sep="\n")

Number of records:
34886
Dataframe columns:
['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot']


In [6]:
# rename the columns whose labels contain a space or a slash so that they can be
# referenced as plain identifiers inside DataFrame.query expressions.
# Renaming by label (instead of assigning a positional list of names) keeps every
# column correctly labelled even if the column order of the CSV changes.
dataframe = dataframe.rename(columns={
    'Release Year': 'Release_Year',
    'Origin/Ethnicity': 'Origin_Ethnicity',
    'Wiki Page': 'Wiki_Page',
})

In [9]:
# condition selecting the four origins of interest: America, Australia, Britain and Canada
origin_filter = ('Origin_Ethnicity == "American" | Origin_Ethnicity == "Australian"'
                 '| Origin_Ethnicity == "British" | Origin_Ethnicity == "Canadian"')

# keep movies released in 1970 or later, restrict them to those origins,
# and renumber the surviving rows from 0
updated_df = (
    dataframe
    .query('Release_Year >= 1970')
    .query(origin_filter)
    .reset_index(drop=True)
)

In [12]:
# pull the 'Plot' column out as a plain Python list, in row order
list_of_plots = updated_df.loc[:, 'Plot'].tolist()

In [13]:
# report how many plots were collected
print("Number of movie plots:", len(list_of_plots), sep="\n")

Number of movie plots:
11866


In [14]:
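# scratch prototype of the proper-noun filter for a single text (`text1` is not defined in the visible cells):
#tagged_sentence = nltk.tag.pos_tag(text1.split())
#edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
#new_sentence = ' '.join(edited_sentence)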

In [15]:
# Strip proper nouns (part-of-speech tags NNP and NNPS) from every plot before the
# plots are compared. Character names such as Kevin would otherwise make any movie
# with a character of that name look similar to Home Alone even when the rest of
# the plot is completely different. City names might carry some signal, but they
# are dropped along with the other proper nouns.
PROPER_NOUN_TAGS = ('NNP', 'NNPS')

updated_list_of_plots = []
for position, plot_text in enumerate(list_of_plots):
    # progress marker every 500 plots
    if position % 500 == 0:
        print(position)

    # tag the whitespace-separated words and keep everything that is not a proper noun
    tagged_words = nltk.tag.pos_tag(plot_text.split())
    kept_words = [word for word, tag in tagged_words if tag not in PROPER_NOUN_TAGS]

    # store the plot rebuilt from the remaining words, joined by single spaces
    updated_list_of_plots.append(' '.join(kept_words))

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [None]:
# English stop words shipped with scikit-learn, materialised as a list
stop_words = [*ENGLISH_STOP_WORDS]

# build the TF-IDF vectoriser with those stop words and fit it on the cleaned plots,
# which learns the vocabulary and the idf weights; fit() hands back the vectoriser itself
vectoriser = TfidfVectorizer(stop_words=stop_words).fit(updated_list_of_plots)

# echo the fitted vectoriser as the cell output
vectoriser

# print vocabulary
# print(vectoriser.vocabulary_)

In [None]:
# vectorise each plot based on the vectoriser fitted on the movie plot set

# transform the whole corpus in one call rather than calling transform() once per
# plot; row i of the resulting matrix is the TF-IDF vector of plot i
plot_matrix = vectoriser.transform(updated_list_of_plots)

# keep one single-row (2-D) slice per plot so the following cells can still store
# and compare the vectors individually
list_of_vectors = [
    plot_matrix[row_number:row_number + 1]
    for row_number in range(plot_matrix.shape[0])
]

# summarise the vectorised corpus
# print(plot_matrix.shape)

In [None]:
# sanity check: number of vectors built from updated_list_of_plots
len(list_of_vectors)

In [None]:
# attach each plot's vector to its movie as a new 'vector' column
updated_df = updated_df.assign(vector=list_of_vectors)

In [None]:
def compare_texts(vector1, vector2):
    """Return the cosine similarity of two vectorised texts.

    Both arguments are handed straight to scikit-learn's ``cosine_similarity``
    and its result is returned unchanged.
    """
    return cosine_similarity(vector1, vector2)

In [None]:
def get_top_n_similar_texts(n, vector, dataframe):
    """Return the ``n`` rows of ``dataframe`` that are most similar to ``vector``.

    Parameters
    ----------
    n : int
        Number of rows to return.
    vector
        Vectorised text to compare against; it is passed to ``compare_texts``
        together with each entry of the 'vector' column.
    dataframe : pandas.DataFrame
        Frame with a 'vector' column holding one vectorised text per row.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'similarity' column of plain floats, sorted
        from most to least similar and cut to the first ``n`` rows. The frame
        passed in is not modified.
    """
    # compare_texts returns a similarity matrix (1x1 when both inputs are single-row
    # vectors); unwrap it to a plain float so that the column sorts and displays as
    # a number instead of as nested arrays
    similarities = [
        float(compare_texts(row_vector, vector)[0, 0])
        for row_vector in dataframe['vector']
    ]

    return (
        dataframe
        .assign(similarity=similarities)
        .sort_values('similarity', ascending=False)
        .head(n)
    )

In [None]:
# Select movie here: show the row(s) whose title matches
updated_df.loc[updated_df['Title'] == "Toy Story"]

In [None]:
# position of the query movie in updated_df
vector_index = 4066

# run the search on a copy of updated_df rather than on updated_df itself
df_copy = updated_df.copy()
vector_to_compare = updated_df['vector'].iat[vector_index]
similarity_dataframe = get_top_n_similar_texts(10, vector_to_compare, df_copy)

In [None]:
# show the proper-noun-stripped plot at position 4066, the same position that the
# preceding cell uses as vector_index
updated_list_of_plots[4066]

In [None]:
# display the rows returned by get_top_n_similar_texts for the query movie
similarity_dataframe

In [None]:
# find the single movie whose plot is most similar to the movie at position 89
vector_index = 89
vector_to_test = updated_df['vector'].iloc[vector_index]
max_cosine = 0.0
new_index = 0

for index, candidate_vector in updated_df['vector'].items():
    # skip the query movie itself before doing any work on it
    if index == vector_index:
        continue
    # compare_texts returns a similarity matrix (1x1 for two single-row vectors);
    # unwrap it so that max_cosine stays a plain float
    similarity = float(compare_texts(candidate_vector, vector_to_test)[0, 0])
    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
# index label of the best match recorded by the preceding loop
new_index

In [None]:
# show the best-matching movie; new_index is an index label taken from the loop, and
# using it positionally with iloc relies on updated_df having been renumbered with
# reset_index(drop=True)
updated_df.iloc[new_index]

In [None]:
# hand-written plot, vectorised with the vectoriser fitted on the movie plots
# (note: unlike the corpus plots, this text is not passed through the proper-noun filter)
custom_plot = "Child is left home alone during Christmas. Thieves break into his house as his parents are not home and he has to defend himself."
vector_custom = vectoriser.transform(raw_documents=[custom_plot])

In [None]:
# find the movie whose plot is most similar to the hand-written custom plot
max_cosine = 0.0
new_index = 0

for index, candidate_vector in updated_df['vector'].items():
    # compare_texts returns a similarity matrix (1x1 for two single-row vectors);
    # unwrap it so that max_cosine stays a plain float
    similarity = float(compare_texts(candidate_vector, vector_custom)[0, 0])

    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
# highest similarity value recorded by the preceding loop
max_cosine

In [None]:
# show the movie that best matches the custom plot; new_index is an index label taken
# from the loop, and using it positionally with iloc relies on updated_df having been
# renumbered with reset_index(drop=True)
updated_df.iloc[new_index]

In [None]:
# 2. vectorise each text using tfidf and attach the list of vectorised texts to the dataframe
# 3. input a given string (own plot) and vectorise it using tfidf
# 4. use a for loop to go through each text and get the cosine similarity of the given string to each text in the dataframe.
#    Return the most similar plot and its movie details