# Import Libraries

In [None]:
import pandas as pd
import numpy as np
# The plotting cells call plt.figure / plt.title / plt.show, which live in
# matplotlib.pyplot, not in the top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset

In [None]:
# The path is relative to the notebook; point it at your local copy of the CSV.
data = pd.read_csv(filepath_or_buffer='archive/wiki_movie_plots_deduped.csv')
# Preview the first five rows (head's default).
data.head()

In [None]:
# Preview the last five rows (tail's default).
data.tail()

In [None]:
# (number of rows, number of columns) of the loaded dataset
data.shape

# Exploratory Data Analysis

In [None]:
# Missing-value count for every column.
data.isna().sum()

In [None]:
# Number of distinct values in the Genre column
data['Genre'].nunique()

In [None]:
# Number of distinct values in the Origin/Ethnicity column
data['Origin/Ethnicity'].nunique()

In [None]:
# Number of distinct values in the Release Year column
data['Release Year'].nunique()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# (The bare boolean Series is truncated on display and tells nothing at a glance.)
data.duplicated().sum()

## Distribution of movies over the years

In [None]:
import matplotlib.pyplot as plt

# Count the films released in each year, ordered chronologically, as a
# two-column frame ('Release Year', 'Count').
movies_per_year = (
    data['Release Year']
    .value_counts()
    .sort_index()
    .rename_axis('Release Year')
    .reset_index(name='Count')
)

# Line chart of the yearly counts on a white grid.
sns.set_style('whitegrid')
plt.figure(figsize=(13, 5))
sns.lineplot(data=movies_per_year, x='Release Year', y='Count')

# Title, axis labels and tick sizes.
plt.title('Distribution of the movies over the years', fontsize=25)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Number of movies', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()

## Movies origin

In [None]:
# Count the films per origin/ethnicity, ordered alphabetically by origin.
movies_per_origin = data['Origin/Ethnicity'].value_counts().sort_index().reset_index()
movies_per_origin.columns = ['Origin', 'Count']


plt.figure(figsize=(16, 9))


# One bar per origin; bar height is the number of films.
sns.barplot(x='Origin', y='Count', data=movies_per_origin)

plt.title('Movies Origin', fontsize=25)
plt.xlabel('Origin', fontsize=20)
# The y axis shows counts, so label it like the yearly plot does.
plt.ylabel('Number of movies', fontsize=20)
# Rotate the origin names so long labels do not overlap.
plt.xticks(fontsize=15, rotation=45, ha='right')
plt.yticks(fontsize=15)

plt.show()


# Data Pre-processing

In [None]:
# Import
import spacy
import string
import gensim
import operator
import re


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
def text_cleaner(text):
    """Keep only ASCII letters and whitespace, then normalise the spacing.

    Every character that is not A-Z, a-z or whitespace is deleted (not
    replaced by a space), runs of whitespace are collapsed to one space,
    and leading/trailing whitespace is stripped.
    """
    letters_and_spaces = re.sub(r'[^A-Za-z\s]', '', text)
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

In [None]:
# Build a set of English stop words from NLTK's stopwords corpus
stop_words = set(stopwords.words('english'))

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
def nltk_tokenizer(text):
    """Turn raw text into a list of lemmatized, lowercase, non-stop-word tokens.

    Steps, in order:
      1. clean the text with ``text_cleaner`` (letters and single spaces only),
      2. split it into tokens with ``word_tokenize``,
      3. lowercase every token,
      4. drop tokens found in ``stop_words``,
      5. lemmatize what remains with ``lemmatizer``.

    Parameters
    ----------
    text : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Reuse the shared cleaning helper instead of repeating its regexes here.
    cleaned_text = text_cleaner(text)

    # Lowercase before the stop-word test so capitalised stop words are caught.
    lowercase_tokens = (token.lower() for token in word_tokenize(cleaned_text))

    return [
        lemmatizer.lemmatize(token)
        for token in lowercase_tokens
        if token not in stop_words
    ]
    

In [None]:
print("Cleaning and Tokenizing...")

%time data['plot_tokenized'] = data['Plot'].map(lambda x : nltk_tokenizer(x))

data.head(5)


In [None]:
# Keep the tokenized plots in their own variable
movie_tokenized = data['plot_tokenized']

# Building Word Dictionary

In [None]:
from gensim import corpora

# Build a gensim dictionary from the tokenized plots (timed with %time)
%time dictionary = corpora.Dictionary(movie_tokenized)

In [None]:
# Sample of the dictionary: [token, id] pairs for every id up to 50,
# collected in one inner list that is itself wrapped in an outer list.
dict_tokens = [
    [
        [token, token_id]
        for token_id, token in dictionary.items()
        if token_id <= 50
    ]
]
# Show the sample
print(dict_tokens)


# Bag of Words

In [None]:
%time corpus = [dictionary.doc2bow(desc) for desc in movie_tokenized] # Build bag of words for the tokens

In [None]:
# Translate the ids in the first three bag-of-words vectors back into tokens
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:3]
]

print(word_frequencies)

# Tf-Idf and LSI model

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus
movie_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)

# Train a 400-topic LSI model on the TF-IDF-transformed corpus
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[corpus], id2word=dictionary, num_topics=400)



In [None]:
# Write the transformed corpora to disk with MmCorpus

# The corpus after the TF-IDF transformation
gensim.corpora.MmCorpus.serialize('movie_tfidf_model_mm', movie_tfidf_model[corpus])

# The TF-IDF corpus after the additional LSI transformation
gensim.corpora.MmCorpus.serialize('movie_lsi_model_mm',movie_lsi_model[movie_tfidf_model[corpus]])

In [None]:
# Load the previously serialized TF-IDF and LSI corpora back from disk.
# This lets later cells use the transformed data without recomputing it.


movie_tfidf_corpus = gensim.corpora.MmCorpus('movie_tfidf_model_mm')
movie_lsi_corpus = gensim.corpora.MmCorpus('movie_lsi_model_mm')


In [None]:
from gensim.similarities import MatrixSimilarity


# Build the similarity index over the LSI corpus; the feature count is read from the corpus itself
movie_index = MatrixSimilarity(movie_lsi_corpus, num_features=movie_lsi_corpus.num_terms)


# Search 

In [None]:
from operator import itemgetter


def search(input_query, num_best=10):
    """Return the movies whose plots are most similar to a free-text query.

    The query goes through the same pipeline as the plots: it is tokenized
    with ``nltk_tokenizer``, converted to a bag of words with ``dictionary``,
    transformed by ``movie_tfidf_model`` and ``movie_lsi_model``, and then
    looked up in ``movie_index``.

    Parameters
    ----------
    input_query : str
        Free-text description of what to look for.
    num_best : int, default 10
        Maximum number of movies to return. The value is stored on
        ``movie_index.num_best``, exactly as the fixed value 10 was before.

    Returns
    -------
    pandas.DataFrame
        One row per match, best match first, with the columns
        'Relevance' (similarity times 100, rounded to 2 decimals),
        'Movie Title', 'Movie Plot' and 'Wikipedia Link'.
    """
    tokenized_input = nltk_tokenizer(input_query)
    bow_input = dictionary.doc2bow(tokenized_input)

    query_tfidf = movie_tfidf_model[bow_input]
    query_lsi = movie_lsi_model[query_tfidf]

    movie_index.num_best = num_best

    # Order the (document position, similarity) pairs by similarity, highest first.
    ranked = sorted(movie_index[query_lsi], key=itemgetter(1), reverse=True)

    # The index was built from the plots in row order, so the document
    # position is a row position: use .iloc rather than a label lookup.
    movie_names = [
        {
            'Relevance': round((similarity * 100), 2),
            'Movie Title': data['Title'].iloc[position],
            'Movie Plot': data['Plot'].iloc[position],
            'Wikipedia Link': data['Wiki Page'].iloc[position],
        }
        for position, similarity in ranked[:num_best]
    ]

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot', 'Wikipedia Link'])

In [None]:
# Example query: show the movies returned for the query 'basketball'
search('basketball')