In [None]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import nltk
nltk.download('averaged_perceptron_tagger')

In [2]:
dataframe = pd.read_csv("wiki_movie_plots_deduped.csv")

In [3]:
print("Number of records:")
print(len(dataframe))

Number of records:
34886


In [4]:
list(dataframe.columns.values)

['Release Year',
 'Title',
 'Origin/Ethnicity',
 'Director',
 'Cast',
 'Genre',
 'Wiki Page',
 'Plot']

In [5]:
dataframe.columns = ['Release_Year',
 'Title',
 'Origin_Ethnicity',
 'Director',
 'Cast',
 'Genre',
 'Wiki_Page',
 'Plot']

In [6]:
max(dataframe['Release_Year'])

2017

In [7]:
dataframe['Origin_Ethnicity'].unique()

array(['American', 'Australian', 'Bangladeshi', 'British', 'Canadian',
       'Chinese', 'Egyptian', 'Hong Kong', 'Filipino', 'Assamese',
       'Bengali', 'Bollywood', 'Kannada', 'Malayalam', 'Marathi',
       'Punjabi', 'Tamil', 'Telugu', 'Japanese', 'Malaysian', 'Maldivian',
       'Russian', 'South_Korean', 'Turkish'], dtype=object)

In [8]:
# get movies from 1970 and onwards
updated_df = dataframe.query('Release_Year >= 1970')

# get movies from America, Australia, Britain and Canada
updated_df = updated_df.query('Origin_Ethnicity == "American" | Origin_Ethnicity == "Australian"' + 
                             '| Origin_Ethnicity == "British" | Origin_Ethnicity == "Canadian"')

In [9]:
updated_df = updated_df.reset_index(drop=True)

In [10]:
updated_df['Origin_Ethnicity'].unique()

array(['American', 'Australian', 'British', 'Canadian'], dtype=object)

In [11]:
# 1. get list of all plots
list_of_plots = updated_df['Plot'].tolist()

In [12]:
print("Number of movie plots:")
print(len(list_of_plots))

Number of movie plots:
11866


In [14]:
#tagged_sentence = nltk.tag.pos_tag(text1.split())
#edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
#new_sentence = ' '.join(edited_sentence)

In [15]:
# include proper nouns in stopwords, as arguably these are not needed when comparing similarities of
# movie plots. Perhaps cities are useful, but names such as Kevin are not good because their inclusion
# means that e.g. other movies than Home Alone with a character named Kevin will be deemed similar to
# Home Alone, even though the rest of the plot may be completely different.
updated_list_of_plots = []
for movie_plot_index in range(0, len(list_of_plots)):
    
    if(movie_plot_index%500 == 0):
        print(movie_plot_index)
    tagged_text = nltk.tag.pos_tag(list_of_plots[movie_plot_index].split())
    edited_text = [word for word,tag in tagged_text if tag != 'NNP' and tag != 'NNPS']
    
    # update each movie plot to have proper nouns removed
    edited_text = ' '.join(edited_text)
    updated_list_of_plots.append(edited_text)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [16]:
# create list of stopwords from the English language
stop_words = list(ENGLISH_STOP_WORDS)
# create instance of tfidf vectoriser class based on list of stopwords
vectoriser = TfidfVectorizer(stop_words=stop_words)
# call fit function to tokenise (learn idf) and build vocabulary from corpus
vectoriser.fit(updated_list_of_plots)

# print vocabulary
# print(vectorizer.vocabulary_)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['whereas', 'however', 'eleven', 'beyond', 'next', 'via', 'had', 'up', 'whom', 'it', 'their', 'themselves', 'sometime', 'fifteen', 'serious', 'her', 'whereafter', 'became', 'hereby', 'whenever', 'have', 'here', 'its', 'become', 'whatever', 'made', 'two', 'still', 'many', 'eg', 'seem', 'te...on', 'either', 'much', 'under', 'always', 'side', 'nowhere', 'give', 'before', 'nothing', 'hundred'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# vectorise each plot based on vectoriser made on movie plot set

list_of_vectors = []

for plot in updated_list_of_plots:
    # vectorise text based on vectoriser created earlier
    vector = vectoriser.transform([plot])
    list_of_vectors.append(vector)
    # summarize vectorised vector
    # print(vector.shape)
    # print(vector.toarray())

In [18]:
len(list_of_vectors)

11866

In [19]:
# append list of vectors to dataframe
updated_df['vector'] = list_of_vectors

In [20]:
# returns the cosine similarity of two vectorised texts
def compare_texts(vector1, vector2):
    
    cosineSimilarities = cosine_similarity(vector1, vector2)
    return cosineSimilarities

In [21]:
# returns a dataframe with 10 movies most similar to the input movie from the given dataframe
# NB: update function to have number of movies to return as an argument 
def get_top_n_similar_texts(n, vector, dataframe):
    
    list_of_similarities = []
    for index, row in dataframe.iterrows():
        similarity_measure = compare_texts(row['vector'], vector)
        list_of_similarities.append(similarity_measure)
        
    # append list of similarities to dataframe
    dataframe['similarity'] = list_of_similarities
    
    # sort dataframe based on similarity measure in descending order
    dataframe = dataframe.sort_values('similarity', ascending=False)
    
    # get top N rows of dataframe
    dataframe = dataframe.head(n)
    
    return dataframe

In [50]:
# Select movie here
updated_df.query('Title == "Toy Story"')

Unnamed: 0,Release_Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki_Page,Plot,vector
4066,1995,Toy Story,American,John Lasseter,"Tim Allen, Tom Hanks (voices)",animated film,https://en.wikipedia.org/wiki/Toy_Story,In a world where toys are living things who pr...,"(0, 45892)\t0.02466060175857681\n (0, 45660..."


In [51]:
vector_index = 4066
vector_to_compare = updated_df['vector'].iloc[vector_index]
df_copy = updated_df.copy()
similarity_dataframe = get_top_n_similar_texts(10, vector_to_compare, df_copy)

In [52]:
updated_list_of_plots[4066]

"In a world where toys are living things who pretend to be lifeless when humans are present, a group of toys, owned by six-year-old are caught off-guard when birthday party is moved up a week, as his mother, and infant sister are preparing to move the following week. The toys' leader and favorite toy, a pull-string cowboy doll named organizes the other toys, including the shepherdess, the the and into a scouting mission. army men, led by spy on the party, and report the results to the others via baby monitors. The toys are relieved when the party appears to end with none of them having been replaced, but then receives a surprise gift – an electronic toy space ranger action figure named who thinks he is an actual space ranger. impresses the other toys with his various features, and begins to favor him, making feel left out. As prepares for a family outing at his mother allows him to bring one toy. Fearing will choose attempts to trap behind a desk, but ends up accidentally knocking him 

In [53]:
similarity_dataframe

Unnamed: 0,Release_Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki_Page,Plot,vector,similarity
4066,1995,Toy Story,American,John Lasseter,"Tim Allen, Tom Hanks (voices)",animated film,https://en.wikipedia.org/wiki/Toy_Story,In a world where toys are living things who pr...,"(0, 45892)\t0.02466060175857681\n (0, 45660...",[[1.0000000000000007]]
7435,2010,Toy Story 3,American,Lee Unkrich,"Tom Hanks, Tim Allen, Joan Cusack, Ned Beatty,...",family,https://en.wikipedia.org/wiki/Toy_Story_3,Seventeen-year-old Andy[11] is about to leave ...,"(0, 45901)\t0.014525262535757874\n (0, 4589...",[[0.5836859967703293]]
5061,1999,Toy Story 2,American,"John Lasseter, Lee Unkrich, Ash Brannon","voices of Tom Hanks, Tim Allen, Annie Potts, D...",animation comedy family,https://en.wikipedia.org/wiki/Toy_Story_2,"Andy prepares to go to cowboy camp with Woody,...","(0, 45865)\t0.04570316590367222\n (0, 45654...",[[0.5213547825596632]]
3404,1992,Toys,American,Barry Levinson,"Robin Williams, Joan Cusack, LL Cool J, Robin ...",fantasy,https://en.wikipedia.org/wiki/Toys_(1992_film),"Kenneth Zevo, the owner of the Zevo Toys facto...","(0, 45982)\t0.021335756140723288\n (0, 4561...",[[0.3587091619609773]]
3382,1992,"Silent Night, Deadly Night 5: The Toy Maker",American,Martin Kitrosser,Mickey Rooney,horror,"https://en.wikipedia.org/wiki/Silent_Night,_De...","Late one night in December, a young boy named ...","(0, 45982)\t0.020222505181765518\n (0, 4590...",[[0.3550642088768303]]
877,1977,The Mouse and His Child,American,Chris Swenson,"Voices of Cloris Leachman, Peter Ustinov, Andy...","animated, family",https://en.wikipedia.org/wiki/The_Mouse_and_Hi...,The mouse and his child are two parts of a sin...,"(0, 45685)\t0.08791545673340631\n (0, 45361...",[[0.3463398244454713]]
1795,1984,Where the Toys Come From,American,Theodore Thomas,,animation,https://en.wikipedia.org/wiki/Where_the_Toys_C...,"It follows the journey of two toys, named Zoom...","(0, 42461)\t0.1290986204289452\n (0, 42318)...",[[0.315318089742874]]
4849,1998,Small Soldiers,American,Joe Dante,"Gregory Smith, Kirsten Dunst, Frank Langella, ...","science fiction, fantasy, action",https://en.wikipedia.org/wiki/Small_Soldiers,Top defense contractor GloboTech Industries ac...,"(0, 45793)\t0.09470121509315362\n (0, 45688...",[[0.30474144405920883]]
2577,1988,Tin Toy,American,John Lasseter,,animated,https://en.wikipedia.org/wiki/Tin_Toy,The film takes place in one room and stars the...,"(0, 44691)\t0.09786724906561985\n (0, 44684...",[[0.28649697107258904]]
2187,1987,Dolls,American,Stuart Gordon,,horror,https://en.wikipedia.org/wiki/Dolls_(1987_film),A violent thunderstorm strands six people in t...,"(0, 45872)\t0.05065555939875212\n (0, 45711...",[[0.25469348252853824]]


In [None]:
vector_index = 89
vector_to_test = updated_df['vector'].iloc[vector_index]
max_cosine = 0.0
new_index = 0

for index, row in updated_df.iterrows():
    similarity = compare_texts(row['vector'], vector_to_test)
    if index == vector_index:
        continue
    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
new_index

In [None]:
updated_df.iloc[new_index]

In [None]:
# custom vector
custom_plot = "Child is left home alone during Christmas. Thieves break into his house as his parents are not home and he has to defend himself."
vector_custom = vectoriser.transform([custom_plot])

In [None]:
max_cosine = 0.0
new_index = 0

for index, row in updated_df.iterrows():
    similarity = compare_texts(row['vector'], vector_custom)
    
    if similarity > max_cosine:
        max_cosine = similarity
        new_index = index

In [None]:
max_cosine

In [None]:
updated_df.iloc[new_index]

In [None]:
# 2. vectorise each text using tfidf and attach list of vectorised texts to dataframe
# 3. input a given string (own plot) and vectorise using tfidf
# 3. use a for loop to go through each text and get cosine similarity of given string to each text in the dataframe.
# Return the most similar plot and its movie details