<a href="https://colab.research.google.com/github/anthonybrown0528/csc-442-course-project/blob/main/notebook/vectorize_film_descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [222]:
# Import pandas to access the dataset
import pandas as pd

# Import a string vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Dataset

In [223]:
dataset_path = 'https://raw.githubusercontent.com/anthonybrown0528/csc-442-course-project/refs/heads/main/dataset/clean/netflix_film_imdb_data.csv'
netflix_film_imdb_scores_df = pd.read_csv(dataset_path)

# Address text encoding errors

There are some meaningless characters in the film descriptions due to errors when encoding and decoding text from bytes. The most common errors can be identfied and corrected by mapping the meaningless errors to what they are expected to encode.

Source: https://www.i18nqa.com/debug/utf8-debug.html

In [224]:
encoding_mapping = {
    u'â€“': "—", # Use prefix to store unicode string. Source: https://docs.python.org/2/tutorial/introduction.html#unicode-strings
    u'â€œ': '"',
    u'â€ 	': '"',
    u'ãƒ™ã‚¤ãƒ–ãƒ¬ãƒ¼ãƒ‰ãƒãƒ¼ã‚¹ãƒˆGT(ã‚¬ãƒ': '', # non-latin characters (removed)
    u'à¤†à¤µà¤¾à¤°à¤¾ à¤ªà¤¾à¤—à¤² à¤¦à¥€à¤µà¤¾à¤¨à¤¾': '', # non-latin characters (removed)
    u'Ã©': 'é',
    u'Ã³': 'ó',
    u'â€™': "'"
}

def map_encoding(description):
  for key in encoding_mapping:
    prev_description = description
    description = description.replace(key, encoding_mapping[key])

  return description

netflix_film_imdb_scores_df['description_x'] = netflix_film_imdb_scores_df['description_x'].apply(map_encoding)
netflix_film_imdb_scores_df['description_y'] = netflix_film_imdb_scores_df['description_y'].apply(map_encoding)

# Perform Lemmatization

In [225]:
# Source: https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet
# Source: https://www.nltk.org/api/nltk.tokenize.sent_tokenize.html
# Source: https://www.nltk.org/book/ch05.html
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [226]:
# Source: https://cs.nyu.edu/~grishman/jet/guide/PennPOS.htm

lemmatizer = WordNetLemmatizer()

pos_mapping = {
    'NOUN': 'n',
    'ADV': 'a',
    'NUM': 'n',
    'PRON': 'n',
    'ADJ': 'a',
    'VERB': 'v',
    'PRT': 'n',
    'X': 'n',
    'ADP': 'n',
    'CONJ': 'n'
}

pos_ignore = {
  'DET',
  '.'
}

def lemmatize_description(description):
  tokens = word_tokenize(description)
  tokens = nltk.pos_tag(tokens, tagset='universal')

  lemma_sequence = []
  for token, pos in tokens:
    try:
      lemma = lemmatizer.lemmatize(token, pos=pos_mapping[pos])
    except:
      # Handle cases when the part of speech is not recognized
      if pos in pos_ignore or pos == token:
        # skip tokens that may not contribute to the meaning of the text
        continue

      # otherwise add the token without transformation
      lemma = token
    lemma_sequence.append(lemma)
  return ' '.join(lemma_sequence)

netflix_film_imdb_scores_df['description_x'] = netflix_film_imdb_scores_df['description_x'].apply(lemmatize_description)
netflix_film_imdb_scores_df['description_y'] = netflix_film_imdb_scores_df['description_y'].apply(lemmatize_description)

# Transform both descriptions with Tfidf Vectorizer

In [227]:
def vectorize_description(df, description_column, VectorizerType):
    imdb_id_df = df[['imdb_id', 'title', 'release_year']]
    vectorizer = VectorizerType(stop_words='english')

    description = df[description_column]
    term_document_matrix = vectorizer.fit_transform(description)

    term_document_df = pd.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    term_document_df = pd.merge(imdb_id_df, term_document_df, left_index=True, right_index=True)
    return term_document_df, (term_document_matrix, vectorizer.get_feature_names_out())

In [228]:
descriptions_df = netflix_film_imdb_scores_df[['imdb_id', 'title', 'release_year', 'description_x', 'description_y']]
descriptions_df.head()

Unnamed: 0,imdb_id,title,release_year,description_x,description_y
0,tt0071853,Monty Python and the Holy Grail,1975,King Arthur accompany by his squire recruit hi...,Monty Python comedy clan skewer King Arthur an...
1,tt0058385,My Fair Lady,1964,snobbish phonetics professor agree to wager th...,When Cockney flower girl take elocution lesson...
2,tt0080453,The Blue Lagoon,1980,Two small child and ship 's cook survive shipw...,Two shipwrecked child strand for year on deser...
3,tt0061418,Bonnie and Clyde,1967,In 1930s bore waitress Bonnie Parker fall in l...,Bonnie Parker and Clyde Barrow be young in lov...
4,tt0054953,The Guns of Navarone,1961,team of allied saboteur be assign impossible m...,During World War II British force launch attac...


In [229]:
term_document_df_tfidf_x, vectorization_tfidf_x = vectorize_description(descriptions_df, 'description_x', TfidfVectorizer)
term_document_df_tfidf_x

Unnamed: 0,imdb_id,title_x,release_year,00,000,007,05,10,100,1000,...,zor,zorro,zoya,zukijou,zuo,ã¼ã,åÿan,åÿmaya,åžehnaz,ón
0,tt0071853,Monty Python and the Holy Grail,1975,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0058385,My Fair Lady,1964,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0080453,The Blue Lagoon,1980,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0061418,Bonnie and Clyde,1967,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0054953,The Guns of Navarone,1961,0.0,0.18453,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2473,tt13657102,The Tambour of Retribution,2021,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2474,tt13879000,Pitta Kathalu,2021,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
term_document_df_tfidf_y, _ = vectorize_description(descriptions_df, 'description_y', TfidfVectorizer)
term_document_df_tfidf_y

Unnamed: 0,imdb_id,title_x,release_year,000,007,10,100,1000,11,12,...,zone,zoo,zoom,zorro,zozo,zuckerberg,zurich,álex,ángel,über
0,tt0071853,Monty Python and the Holy Grail,1975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0058385,My Fair Lady,1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0080453,The Blue Lagoon,1980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0061418,Bonnie and Clyde,1967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0054953,The Guns of Navarone,1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2473,tt13657102,The Tambour of Retribution,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2474,tt13879000,Pitta Kathalu,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [231]:
term_document_df_count_x, _ = vectorize_description(descriptions_df, 'description_x', CountVectorizer)
term_document_df_count_x

Unnamed: 0,imdb_id,title_x,release_year,00,000,007,05,10,100,1000,...,zor,zorro,zoya,zukijou,zuo,ã¼ã,åÿan,åÿmaya,åžehnaz,ón
0,tt0071853,Monty Python and the Holy Grail,1975,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0058385,My Fair Lady,1964,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0080453,The Blue Lagoon,1980,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0061418,Bonnie and Clyde,1967,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0054953,The Guns of Navarone,1961,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2473,tt13657102,The Tambour of Retribution,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2474,tt13879000,Pitta Kathalu,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
term_document_df_count_y, _ = vectorize_description(descriptions_df, 'description_y', CountVectorizer)
term_document_df_count_y

Unnamed: 0,imdb_id,title_x,release_year,000,007,10,100,1000,11,12,...,zone,zoo,zoom,zorro,zozo,zuckerberg,zurich,álex,ángel,über
0,tt0071853,Monty Python and the Holy Grail,1975,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0058385,My Fair Lady,1964,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0080453,The Blue Lagoon,1980,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0061418,Bonnie and Clyde,1967,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0054953,The Guns of Navarone,1961,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2473,tt13657102,The Tambour of Retribution,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2474,tt13879000,Pitta Kathalu,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apply Latent Dirichlet Allocation

In [233]:
from sklearn.decomposition import LatentDirichletAllocation

In [234]:
term_document_matrix_tfidf_x, feature_names_matrix_tfidf_x = vectorization_tfidf_x

model = LatentDirichletAllocation()
model.fit(term_document_matrix_tfidf_x)

In [235]:
n_words = 20

components = model.components_

topic_words_df = pd.DataFrame(components, columns=feature_names_matrix_tfidf_x)
topic_words_df = topic_words_df.T

topic_words_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
00,0.100000,0.990882,0.400267,0.100000,0.100000,0.100035,0.100025,0.100000,0.100000,0.100005
000,0.100000,0.500966,0.653562,0.813745,0.358991,0.327593,0.439907,0.214633,0.539857,0.908792
007,0.100000,0.100000,0.100000,0.305048,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
05,0.100000,0.100173,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.276721
10,0.100019,0.762223,0.566746,0.243149,0.396577,0.667480,0.100000,0.545961,0.232715,0.813164
...,...,...,...,...,...,...,...,...,...,...
ã¼ã,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.251171,0.100000
åÿan,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.175586,0.100000
åÿmaya,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.175586,0.100000
åžehnaz,0.366600,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000


In [236]:
sorted_df_first_topic = topic_words_df.sort_values(by=0, ascending=False)
sorted_df_first_topic.head(n_words)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
life,5.127904,5.003997,7.05991,3.961704,3.049912,5.789966,5.11873,4.215195,5.797246,4.280504
young,3.920551,4.084861,4.602609,3.235103,1.939933,4.268101,0.841201,2.666861,5.996399,3.51098
story,3.632642,2.682078,5.162476,2.19211,1.972613,2.61069,2.314734,3.449631,2.806245,1.847659
family,3.591785,4.331924,5.473326,2.311684,2.898498,2.576031,2.305512,3.379402,6.114221,3.628203
daughter,3.331688,1.237827,2.083498,0.607377,1.573979,1.447412,1.222906,1.602051,2.795111,1.603282
man,3.072456,2.464649,5.034958,3.283421,2.095631,3.148704,2.367849,3.364029,4.02562,1.706828
woman,2.970251,3.118777,4.999812,2.460567,1.424791,3.002156,1.165461,4.31113,4.199332,2.073024
love,2.927886,4.781896,6.713095,2.649137,2.880177,4.24576,2.470979,3.737003,4.42435,2.627965
year,2.719191,3.947897,4.164749,3.510392,2.031369,2.876078,1.755232,3.468103,4.0247,3.286016
time,2.517678,2.054443,2.179383,1.297113,1.235447,2.33846,2.198238,1.472157,2.657641,1.587668


This output suggests that a common theme among the films in the dataset is family, life changes, and interpersonal relationships.