## Generate models for the search function

Dataset from 

https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/data

Code based on

https://www.kaggle.com/code/ajitrajput/semantic-search-engine-using-nlp/notebook

In [1]:
import numpy as np
import pandas as pd
import spacy
import string
import gensim
import operator
import re

In [2]:
df_movies = pd.read_csv('../raw_data/wiki_movie_plots_deduped.csv')
df_movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [3]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
am_movies = df_movies[df_movies['Origin/Ethnicity'] == 'American']
am_movies = am_movies.reset_index(drop=True)

In [5]:
#NaNs in am_movies
am_movies.isnull().sum()

Release Year          0
Title                 0
Origin/Ethnicity      0
Director              0
Cast                317
Genre                 0
Wiki Page             0
Plot                  0
dtype: int64

In [6]:
print(f"Number of American movies: {len(am_movies)}")
print(f"Number of all movies: {len(df_movies)}")

Number of American movies: 17377
Number of all movies: 34886


In [7]:
#How many genres in American movies?
am_movies['Genre'].nunique()

771

In [8]:
American_genres = am_movies['Genre'].unique()
American_genres

array(['unknown', 'western', 'comedy', 'short',
       'short action/crime western', 'short film', 'biographical',
       'drama', 'adventure', 'short fantasy', 'silent sports', 'horror',
       'crime', 'drama, horror', 'historical drama', 'fantasy drama',
       'biographical drama', 'documentary drama', 'fantasy',
       'adventure serial', 'epic', 'historical', 'comedy short',
       'comedy, western', 'biography', 'action adventure',
       'western drama', 'short comedy', 'comedy–drama', 'romantic drama',
       'mystery', 'crime drama', 'romance',
       'sexual hygiene/exploitation film', 'comedy drama', 'war drama',
       'spy', 'romantic comedy', 'propaganda', 'ww1 propaganda', 'biopic',
       'animated series', 'drama romance', 'melodrama', 'period drama',
       'swashbuckler', 'romance drama', 'drama, adventure',
       'crime comedy', 'documentary', 'comedy western', 'fantasy, family',
       'war', 'comedy, adventure', 'fantasy, adventure', 'thriller',
       'dramatic

### Tokenize

In [None]:
# Install the spacy model
!python3 -m spacy download en_core_web_sm

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def tokenizer(sentence):
    sentence = re.sub('\'','',sentence)  # Removes single quotes from the sentence.
    sentence = re.sub('\w*\d\w*','',sentence)  # Removes words containing digits.
    sentence = re.sub(' +',' ',sentence)  # Replaces multiple spaces with a single space.
    sentence = re.sub(r'\n: \'\'.*','',sentence)  # Removes lines starting with ": ''".
    sentence = re.sub(r'\n!.*','',sentence)  # Removes lines starting with "!".
    sentence = re.sub(r'^:\'\'.*','',sentence)  # Removes lines starting with ":'".
    sentence = re.sub(r'\n',' ',sentence)  # Replaces newline characters with spaces.
    sentence = re.sub(r'[^\w\s]',' ',sentence)  # Removes all non-alphanumeric characters except spaces.
    tokens = spacy_nlp(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    return tokens

In [11]:
#Tokenizing (17 minutes)

am_movies['plot_tokenized'] = am_movies['Plot'].map(lambda x: tokenizer(x))

am_movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,plot_tokenized
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","[bartender, work, saloon, serve, drink, custom..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","[moon, paint, smile, face, hang, park, night, ..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","[film, minute, long, compose, shot, girl, sit,..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"[second, consist, shot, shot, set, wood, winte..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"[early, know, adaptation, classic, fairytale, ..."


### Save the tokenized data

In [13]:
# Save the DataFrame as a CSV file in the raw_data folder
am_movies.to_csv('../raw_data/am_movies_with_tokenized_plots.csv', index=False)

In [14]:
# Load the tokenized DataFrame
df_movies = pd.read_csv('../raw_data/am_movies_with_tokenized_plots.csv')
df_movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,plot_tokenized
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","['bartender', 'work', 'saloon', 'serve', 'drin..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","['moon', 'paint', 'smile', 'face', 'hang', 'pa..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","['film', 'minute', 'long', 'compose', 'shot', ..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"['second', 'consist', 'shot', 'shot', 'set', '..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"['early', 'know', 'adaptation', 'classic', 'fa..."


### Save a small dataframe to use the titles for the search function

In [None]:
am_movies['Release Year'] = am_movies['Release Year'].astype('int16')  # Example for integers
am_titles = am_movies.drop(columns=['Plot', 'plot_tokenized'])
# Save the DataFrame as a Feather file
am_titles.to_feather('../model/am_titles.feather') # 1.4 MB

In [None]:
# Load the DataFrame from the Feather file
am_titles = pd.read_feather('../model/am_titles.feather')
am_titles.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...


### Save the model

In [15]:
am_movie_plot = am_movies['plot_tokenized']
am_movie_plot[0:5]

0    [bartender, work, saloon, serve, drink, custom...
1    [moon, paint, smile, face, hang, park, night, ...
2    [film, minute, long, compose, shot, girl, sit,...
3    [second, consist, shot, shot, set, wood, winte...
4    [early, know, adaptation, classic, fairytale, ...
Name: plot_tokenized, dtype: object

In [16]:
from gensim import corpora

am_dictionary = corpora.Dictionary(am_movie_plot)

#note: "movie", "film"
stoplist = set('hello and if this can would should could tell ask stop come go movie film'.split())
stop_ids = [am_dictionary.token2id[stopword] for stopword in stoplist if stopword in am_dictionary.token2id]
am_dictionary.filter_tokens(stop_ids)

In [18]:
from pathlib import Path

# Create ../model folder if it doesn't exist
Path('../model/').mkdir(parents=True, exist_ok=True)

In [19]:
# Save the dictionary to the ../model folder
am_dictionary.save('../model/am_dictionary.dict')

In [None]:
# File paths
am_dictionary_path = '../model/am_dictionary.dict'
# Calculate file sizes
am_dictionary_size = os.path.getsize(am_dictionary_path) / (1024 * 1024)      # Convert bytes to MB
# Print file sizes
print(f"Size of am_dictionary: {am_dictionary_size:.2f} MB")

Size of am_dictionary: 2.02 MB


In [21]:
from gensim import corpora

# Load the dictionary from the ../model folder
am_dictionary = corpora.Dictionary.load('../model/am_dictionary.dict')

In [22]:
am_corpus = [am_dictionary.doc2bow(desc) for desc in am_movie_plot]

In [23]:
import gensim
from gensim.models.coherencemodel import CoherenceModel

#Create a TF-IDF model

movie_tfidf_am_model = gensim.models.TfidfModel(am_corpus, id2word=am_dictionary)

In [24]:
# Save the TF-IDF model to the ../model folder
movie_tfidf_am_model.save('../model/movie_tfidf_am_model')

In [25]:
import os

# File paths
tfidf_am_model_path = '../model/movie_tfidf_am_model'
# Calculate file sizes
tfidf_am_model_size = os.path.getsize(tfidf_am_model_path) / (1024 * 1024)      # Convert bytes to MB
# Print file sizes
print(f"Size of movie_tfidf_am_model: {tfidf_am_model_size:.2f} MB")

Size of movie_tfidf_am_model: 4.78 MB


In [26]:
from gensim import models

# Load the TF-IDF model from the ../model folder
movie_tfidf_am_model = models.TfidfModel.load('../model/movie_tfidf_am_model')

In [27]:
# Create the LSI model with 2000 topics (takes 2 min)
movie_lsi_am_model = gensim.models.LsiModel(movie_tfidf_am_model[am_corpus], id2word=am_dictionary, num_topics=2000)

When you save an `LsiModel` using the `save()` method, Gensim splits the model's data into multiple files for efficiency and modularity. The `projection.u.npy` file is one of these files, and it specifically stores the U matrix.

Gensim creates several files in the `../models` directory:

`movie_lsi_am_model` (main metadata file).
`movie_lsi_am_model.projection.u.npy` (left singular vectors).
`movie_lsi_am_model.projection.s.npy` (singular values).
`movie_lsi_am_model.state` (model state information).

When you load the `LsiModel` using the `load()` method Gensim automatically reads the `projection.u.npy` file (along with other associated files) to reconstruct the model. You typically don’t need to interact with this file directly unless you are debugging or analyzing the internal workings of the LSI model.



In [28]:
# Save the LSI model
movie_lsi_am_model.save('../model/movie_lsi_am_model')

In [29]:
# File paths
lsi_am_model_path = '../models/movie_lsi_am_model'
lsi_am_model_projection_u_path = '../models/movie_lsi_am_model.projection.u.npy'
#lsi_am_model_projection_s_path = '../models/projection.s.npy'
#lsi_am_model_state_path = '../models/movie_lsi_am_model.state'

# Calculate file sizes
lsi_am_model_size = os.path.getsize(lsi_am_model_path) / (1024 * 1024)      # Convert bytes to MB
lsi_am_model_projection_u_size = os.path.getsize(lsi_am_model_projection_u_path) / (1024 * 1024)      # Convert bytes to MB
#lsi_am_model_projection_s_size = os.path.getsize(lsi_am_model_projection_s_path) / (1024 * 1024)      # Convert bytes to MB
#lsi_am_model_state_size = os.path.getsize(lsi_am_model_state_path) / (1024 * 1024)      # Convert bytes to MB

# Print file sizes
print(f"Size of movie_lsi_am_model: {lsi_am_model_size:.2f} MB")
print(f"Size of movie_lsi_am_model.projection.u: {lsi_am_model_projection_u_size:.2f} MB")
#print(f"Size of movie_lsi_am_model.projection.s: {lsi_am_model_projection_s_size:.2f} MB")
#print(f"Size of movie_lsi_am_model.state: {lsi_am_model_state_size:.2f} MB")

Size of movie_lsi_am_model: 2.70 MB
Size of movie_lsi_am_model.projection.u: 1302.49 MB


In [30]:
# Load the LSI model from the ../model folder
from gensim import models

movie_lsi_am_model = models.LsiModel.load('../model/movie_lsi_am_model')

In [31]:
# Transform the TF-IDF corpus into the LSI space
movie_lsi_am_corpus = movie_lsi_am_model[movie_tfidf_am_model[am_corpus]]

# Transform the original BoW corpus into the TF-IDF space
movie_tfidf_am_corpus = movie_tfidf_am_model[am_corpus]

In [None]:
# Serialize and Store the corpus locally
# (note the .index files created alongside the mm corpus files)
gensim.corpora.MmCorpus.serialize('../model/movie_tfidf_am_model_mm', movie_tfidf_am_model[am_corpus])
gensim.corpora.MmCorpus.serialize('../model/movie_lsi_am_model_mm',movie_lsi_am_model[movie_tfidf_am_model[am_corpus]])

In [33]:
# File paths
tfidf_model_path = '../model/movie_tfidf_am_model_mm'
lsi_model_path = '../model/movie_lsi_am_model_mm'

# Calculate file sizes
tfidf_model_size = os.path.getsize(tfidf_model_path) / (1024 * 1024)  # Convert bytes to MB
lsi_model_size = os.path.getsize(lsi_model_path) / (1024 * 1024)      # Convert bytes to MB

# Print file sizes
print(f"Size of movie_tfidf_am_model_mm: {tfidf_model_size:.2f} MB")
print(f"Size of movie_lsi_am_model_mm: {lsi_model_size:.2f} MB")

Size of movie_tfidf_am_model_mm: 70.96 MB
Size of movie_lsi_am_model_mm: 1039.94 MB
