# Import Libraries

In [1]:
# Core data-handling and plotting libraries.
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface; the top-level
# `matplotlib` package does not expose plot()/subplots()/show().
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset

In [2]:
# Read the movie-plot CSV into a DataFrame.
# Change the path below to wherever the dataset is stored.
data = pd.read_csv('archive/wiki_movie_plots_deduped.csv')

# Data Pre-processing

In [3]:
# Import
import spacy
import string
import gensim
import operator
import re


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
def text_cleaner(text):
    """Strip everything except ASCII letters and normalize whitespace.

    Characters other than A-Z, a-z and whitespace are deleted (not replaced
    by a space), every run of whitespace becomes a single space, and leading
    and trailing whitespace is trimmed.
    """
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

In [5]:
# Build the set of English stop words from NLTK's stopwords corpus
stop_words = set(stopwords.words('english'))

# Initialize the WordNet lemmatizer used by text_tokenizer
lemmatizer = WordNetLemmatizer()

In [6]:
def text_tokenizer(text):
    """Turn raw text into a list of lemmatized, lowercase, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw text, e.g. a movie plot or a search query.

    Returns
    -------
    list of str
        Tokens in their original order after cleaning, lowercasing,
        stop-word removal and lemmatization.
    """
    # Reuse text_cleaner instead of repeating its regular expressions, so the
    # cleaning rules live in exactly one place.
    cleaned_text = text_cleaner(text)

    # Split the cleaned text into word tokens
    tokens = word_tokenize(cleaned_text)

    # Lowercase before the stop-word check: tokens are compared against
    # stop_words only in their lowercase form.
    lowercase_tokens = [token.lower() for token in tokens]

    # Drop stop words and lemmatize what remains
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in lowercase_tokens
        if token not in stop_words
    ]

    return lemmatized_tokens
    

In [7]:
print("Cleaning and Tokenizing...")

%time data['plot_tokenized'] = data['Plot'].map(lambda x : text_tokenizer(x))

data.head(5)


Cleaning and Tokenizing...
CPU times: total: 41.5 s
Wall time: 1min 17s


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,plot_tokenized
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","[bartender, working, saloon, serving, drink, c..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","[moon, painted, smiling, face, hang, park, nig..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","[film, minute, long, composed, two, shot, firs..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"[lasting, second, consisting, two, shot, first..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"[earliest, known, adaptation, classic, fairyta..."


In [8]:
# Keep a separate handle on the tokenized plots; the dictionary and the
# bag-of-words corpus are built from it.
movie_tokenized = data['plot_tokenized']

In [9]:
# Save the dataframe (now including 'plot_tokenized') to CSV without the index column.
# NOTE(review): 'plot_tokenized' holds Python lists; presumably the CSV stores them
# as plain strings, so they would need parsing after a reload — confirm.
data.to_csv('processed_movie_plot_data.csv', index=False)

# Building Word Dictionary

In [10]:
from gensim import corpora

# Build a gensim dictionary (token <-> integer id mapping) from the tokenized plots
%time dictionary = corpora.Dictionary(movie_tokenized)

CPU times: total: 4.58 s
Wall time: 9.16 s


In [11]:
# Persist the dictionary to the file 'movie_dictionary'
dictionary.save('movie_dictionary')

In [12]:
# Collect [token, id] pairs for dictionary ids 0 through 50, wrapped in one outer list
dict_tokens = [[
    [token, token_id]
    for token_id, token in dictionary.items()
    if token_id <= 50
]]
# Show the collected pairs
print(dict_tokens)


[[['appear', 0], ['assault', 1], ['bar', 2], ['bartender', 3], ['beer', 4], ['begin', 5], ['breaking', 6], ['bucket', 7], ['burst', 8], ['carrie', 9], ['cash', 10], ['customer', 11], ['drink', 12], ['dumping', 13], ['everybody', 14], ['eye', 15], ['face', 16], ['fill', 17], ['fixture', 18], ['follower', 19], ['group', 20], ['hat', 21], ['head', 22], ['inside', 23], ['irish', 24], ['leave', 25], ['man', 26], ['mirror', 27], ['nation', 28], ['order', 29], ['policeman', 30], ['pulling', 31], ['register', 32], ['saloon', 33], ['seltzer', 34], ['serving', 35], ['smashing', 36], ['spray', 37], ['stereotypically', 38], ['water', 39], ['working', 40], ['wrecking', 41], ['bench', 42], ['better', 43], ['bigger', 44], ['blocked', 45], ['causing', 46], ['couple', 47], ['embrace', 48], ['everything', 49], ['fan', 50]]]


# Bag of Words

In [13]:
%time corpus = [dictionary.doc2bow(desc) for desc in movie_tokenized] # Build bag of words for the tokens

CPU times: total: 3.72 s
Wall time: 7.2 s


In [14]:
# Map the first three bag-of-words vectors back to (token, count) pairs for inspection
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:3]
]

print(word_frequencies)

[[('appear', 1), ('assault', 1), ('bar', 1), ('bartender', 2), ('beer', 2), ('begin', 1), ('breaking', 1), ('bucket', 1), ('burst', 1), ('carrie', 1), ('cash', 1), ('customer', 1), ('drink', 1), ('dumping', 1), ('everybody', 1), ('eye', 1), ('face', 1), ('fill', 1), ('fixture', 1), ('follower', 1), ('group', 2), ('hat', 1), ('head', 1), ('inside', 1), ('irish', 2), ('leave', 1), ('man', 2), ('mirror', 1), ('nation', 2), ('order', 1), ('policeman', 1), ('pulling', 1), ('register', 1), ('saloon', 1), ('seltzer', 1), ('serving', 1), ('smashing', 1), ('spray', 1), ('stereotypically', 1), ('water', 1), ('working', 1), ('wrecking', 1)], [('face', 1), ('hat', 1), ('man', 1), ('bench', 1), ('better', 1), ('bigger', 1), ('blocked', 1), ('causing', 1), ('couple', 1), ('embrace', 1), ('everything', 1), ('fan', 1), ('fence', 1), ('frown', 1), ('get', 1), ('hang', 1), ('last', 1), ('learn', 1), ('left', 1), ('look', 1), ('moon', 5), ('night', 1), ('painted', 1), ('park', 1), ('past', 1), ('perched'

# Tf-Idf and LSI model

In [15]:
# Fit a TF-IDF model on the bag-of-words corpus
movie_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)

# Fit a 400-topic LSI model on the TF-IDF-weighted corpus
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[corpus], id2word=dictionary, num_topics=400)



In [16]:
# Save both fitted models (TF-IDF and LSI) to disk under the given file names
movie_tfidf_model.save('movie_tfidf_model')
movie_lsi_model.save('movie_lsi_model')

In [17]:
# Serialize the transformed corpora to disk:
# first the TF-IDF-weighted corpus, then its projection through the LSI model.

gensim.corpora.MmCorpus.serialize('movie_tfidf_model_mm', movie_tfidf_model[corpus])

gensim.corpora.MmCorpus.serialize('movie_lsi_model_mm',movie_lsi_model[movie_tfidf_model[corpus]])

In [18]:
# Load the previously serialized corpora (not the models) back from disk.
# This allows reusing the transformed vectors without having to recompute them.


movie_tfidf_corpus = gensim.corpora.MmCorpus('movie_tfidf_model_mm')
movie_lsi_corpus = gensim.corpora.MmCorpus('movie_lsi_model_mm')


In [19]:
from gensim.similarities import MatrixSimilarity

# MatrixSimilarity creates a similarity index that allows efficient computation
# of cosine similarity between a query vector and the documents in the LSI space.
# Each row of the index matrix corresponds to one document (a vector in the LSI space).
# The rows are the documents' LSI vectors scaled to unit length (not their norms),
# so cosine similarity with a query reduces to a dot product.
# num_features is the width of those vectors, taken from the loaded LSI corpus.
movie_index = MatrixSimilarity(movie_lsi_corpus, num_features=movie_lsi_corpus.num_terms)


In [20]:
# Persist the similarity index to the file 'movie_index'
movie_index.save('movie_index')

# Search 

In [21]:
from operator import itemgetter


def search(input_query, num_best=10):
    """Return the movies whose plots are most similar to a free-text query.

    The query goes through the same pipeline as the plots: it is tokenized
    with ``text_tokenizer``, converted to bag-of-words with ``dictionary``,
    weighted by ``movie_tfidf_model`` and projected by ``movie_lsi_model``.
    The resulting vector is then compared against ``movie_index``.

    Parameters
    ----------
    input_query : str
        Free-text search query.
    num_best : int, default 10
        Maximum number of movies to return (must be a positive integer).
        As a side effect, ``movie_index.num_best`` is set to this value.

    Returns
    -------
    pandas.DataFrame
        One row per match, most similar first, with the columns
        'Relevance' (similarity * 100, rounded to two decimals),
        'Movie Title', 'Movie Plot' and 'Wikipedia Link'.
    """
    tokenized_input = text_tokenizer(input_query)
    bow_input = dictionary.doc2bow(tokenized_input)

    query_tfidf = movie_tfidf_model[bow_input]
    query_lsi = movie_lsi_model[query_tfidf]

    # With num_best set, the index yields at most that many
    # (document position, similarity) pairs instead of a full score array.
    movie_index.num_best = num_best
    matches = movie_index[query_lsi]

    # Order by similarity, best first; the slice makes the size limit explicit
    # so no manual loop counter / break is needed.
    ranked = sorted(matches, key=itemgetter(1), reverse=True)[:num_best]

    movie_names = []
    for position, similarity in ranked:
        # The index reports row positions, so look rows up positionally
        # (.iloc) rather than by index label.
        movie = data.iloc[position]
        movie_names.append(
            {
                'Relevance': round((similarity * 100), 2),
                'Movie Title': movie['Title'],
                'Movie Plot': movie['Plot'],
                'Wikipedia Link': movie['Wiki Page'],
            }
        )

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot', 'Wikipedia Link'])

In [22]:
# Example query: time a search for 'basketball' and display the returned DataFrame
%time search('basketball')

CPU times: total: 125 ms
Wall time: 194 ms


Unnamed: 0,Relevance,Movie Title,Movie Plot,Wikipedia Link
0,74.4,Sunset Park,Phyllis Saroka (Perlman) is a P.E. teacher at ...,https://en.wikipedia.org/wiki/Sunset_Park_(film)
1,73.32,Glory Road,Newly appointed men's basketball coach Don Has...,https://en.wikipedia.org/wiki/Glory_Road_(film)
2,71.81,The Comebacks,Coach Lambeau Fields (David Koechner) is pathe...,https://en.wikipedia.org/wiki/The_Comebacks
3,71.54,Slam Dunk: Shohoku's Greatest Challenge!,Hanamichi Sakuragi is a delinquent and the lea...,https://en.wikipedia.org/wiki/Slam_Dunk_(manga)
4,71.54,Slam Dunk: Howling Basketman Spirit!!,Hanamichi Sakuragi is a delinquent and the lea...,https://en.wikipedia.org/wiki/Slam_Dunk_(manga)
5,70.22,Rackety Rax,"Always looking for an angle, ""Knucks"" McGloin ...",https://en.wikipedia.org/wiki/Rackety_Rax
6,70.1,Inazuma Eleven: Saikyō Gundan Ōgre Shūrai,Endou Mamoru is a cheerful goalkeeper in Raimo...,https://en.wikipedia.org/wiki/Inazuma_Eleven_(...
7,69.88,The Smart Set,A self-centered polo player (Haines) has to re...,https://en.wikipedia.org/wiki/The_Smart_Set_(1...
8,69.58,Going Vertical,1970 year. The Soviet national basketball team...,https://en.wikipedia.org/wiki/Going_Vertical
9,69.19,Split,A man once considered a bowling legend teams u...,https://en.wikipedia.org/wiki/Split_(2016_Sout...
