<a href="https://colab.research.google.com/github/anthonybrown0528/csc-442-course-project/blob/main/notebook/vectorize_film_descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import pandas to access the dataset
import pandas as pd

# Import a string vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Scratch check: confirm that str.replace matches a garbled (mis-encoded)
# sequence embedded in a string. The result is only displayed, not stored.
something = 'targetâ€”'
something.replace('â€”', 'osmet')

'targetosmet'

# Dataset

In [None]:
# Read the film table from the dataset/clean folder of the project's GitHub repository.
dataset_path = (
    'https://raw.githubusercontent.com/anthonybrown0528/csc-442-course-project'
    '/refs/heads/main/dataset/clean/netflix_film_imdb_data.csv'
)
netflix_film_imdb_scores_df = pd.read_csv(filepath_or_buffer=dataset_path)

# Address text encoding errors

In [None]:
encoding_mapping = {
    u'â€”': "—", # Use prefix to store unicode string. Source: https://docs.python.org/2/tutorial/introduction.html#unicode-strings
    u'â€œ': '“',
    u'ãƒ™ã‚¤ãƒ–ãƒ¬ãƒ¼ãƒ‰ãƒãƒ¼ã‚¹ãƒˆGT(ã‚¬ãƒ': '',
    u'à¤†à¤µà¤¾à¤°à¤¾ à¤ªà¤¾à¤—à¤² à¤¦à¥€à¤µà¤¾à¤¨à¤¾': '',
    u'Ã©': 'é',
    u'â€™': "'"
}

def map_encoding(description):
  """Replace known mis-encoded character sequences in a description.

  Every key of the module-level ``encoding_mapping`` dict that occurs in
  ``description`` is replaced by its mapped value, in the dict's iteration
  order.

  Args:
    description: Text to repair. Values that are not ``str`` (for example the
      float NaN pandas typically uses for a missing cell) are returned
      unchanged instead of raising ``AttributeError``.

  Returns:
    The repaired string, or the original object when it is not a string.
  """
  # Non-string cells have no ``replace`` method; pass them through untouched.
  if not isinstance(description, str):
    return description

  for garbled, repaired in encoding_mapping.items():
    description = description.replace(garbled, repaired)

  return description


# Run every description_x value through map_encoding and store the repaired
# column back on the frame.
repaired_description_x = netflix_film_imdb_scores_df['description_x'].apply(map_encoding)
netflix_film_imdb_scores_df['description_x'] = repaired_description_x

# Transform both descriptions with TF-IDF and count vectorizers

In [None]:
def vectorize_description(df, description_column, VectorizerType):
    """Build a term-document table for one description column of ``df``.

    Args:
        df: DataFrame holding 'imdb_id', 'title' and 'release_year' columns
            plus the column named by ``description_column``.
        description_column: Name of the text column to vectorize.
        VectorizerType: Class, instantiated with no arguments, whose instances
            provide ``fit_transform`` and ``get_feature_names_out`` (for
            example ``TfidfVectorizer`` or ``CountVectorizer``).

    Returns:
        DataFrame with the three identifying columns followed by one column
        per vocabulary term, indexed like ``df``.

    Note:
        When a vocabulary term equals one of the identifying column names
        (for example 'title'), ``pd.merge`` disambiguates the pair with its
        default '_x'/'_y' suffixes, so the identifying column comes back as
        'title_x' and the term column as 'title_y'.
    """
    imdb_id_df = df[['imdb_id', 'title', 'release_year']]
    vectorizer = VectorizerType()

    term_document_matrix = vectorizer.fit_transform(df[description_column])
    feature_names = vectorizer.get_feature_names_out()
    print(feature_names)

    # Give the term table the same index as df. With a fresh 0..n-1 index the
    # index-based merge below would drop or misalign rows whenever df has been
    # filtered, sorted, or otherwise carries a non-default index.
    term_document_df = pd.DataFrame(
        term_document_matrix.toarray(),
        index=df.index,
        columns=feature_names,
    )

    return pd.merge(imdb_id_df, term_document_df, left_index=True, right_index=True)

In [None]:
# Keep only the identifying columns and the two description columns, then
# preview the first rows.
description_columns = ['imdb_id', 'title', 'release_year', 'description_x', 'description_y']
descriptions_df = netflix_film_imdb_scores_df[description_columns]
descriptions_df.head()

Unnamed: 0,imdb_id,title,release_year,description_x,description_y
0,tt0071853,Monty Python and the Holy Grail,1975,"King Arthur, accompanied by his squire, recrui...",The Monty Python comedy clan skewers King Arth...
1,tt0058385,My Fair Lady,1964,A snobbish phonetics professor agrees to a wag...,When a Cockney flower girl takes elocution les...
2,tt0080453,The Blue Lagoon,1980,Two small children and a ship's cook survive a...,"Two shipwrecked children, stranded for years o..."
3,tt0061418,Bonnie and Clyde,1967,"In the 1930s, bored waitress Bonnie Parker fal...","Bonnie Parker and Clyde Barrow are young, in l..."
4,tt0054953,The Guns of Navarone,1961,A team of allied saboteurs are assigned an imp...,"During World War II, British forces launch an ..."


In [None]:
# Term-document table for description_x built with TfidfVectorizer.
term_document_df_tfidf_x = vectorize_description(
    df=descriptions_df,
    description_column='description_x',
    VectorizerType=TfidfVectorizer,
)
term_document_df_tfidf_x

['00' '000' '007' ... 'åÿan' 'åÿmaya' 'åžehnaz']


Unnamed: 0,imdb_id,title_x,release_year,00,000,007,05,10,100,1000,...,zor,zorro,zoya,zukijou,zuo,ã³n,ã¼ã,åÿan,åÿmaya,åžehnaz
0,tt0071853,Monty Python and the Holy Grail,1975,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0058385,My Fair Lady,1964,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0080453,The Blue Lagoon,1980,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0061418,Bonnie and Clyde,1967,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0054953,The Guns of Navarone,1961,0.0,0.177354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2473,tt13657102,The Tambour of Retribution,2021,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2474,tt13879000,Pitta Kathalu,2021,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Term-document table for description_y built with TfidfVectorizer.
term_document_df_tfidf_y = vectorize_description(
    df=descriptions_df,
    description_column='description_y',
    VectorizerType=TfidfVectorizer,
)
term_document_df_tfidf_y

['000' '007' '10' ... 'álex' 'ángel' 'über']


Unnamed: 0,imdb_id,title_x,release_year,000,007,10,100,1000,11,12,...,zone,zoo,zoom,zorro,zozo,zuckerberg,zurich,álex,ángel,über
0,tt0071853,Monty Python and the Holy Grail,1975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0058385,My Fair Lady,1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0080453,The Blue Lagoon,1980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0061418,Bonnie and Clyde,1967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0054953,The Guns of Navarone,1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2473,tt13657102,The Tambour of Retribution,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2474,tt13879000,Pitta Kathalu,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Term-document table for description_x built with CountVectorizer.
term_document_df_count_x = vectorize_description(
    df=descriptions_df,
    description_column='description_x',
    VectorizerType=CountVectorizer,
)
term_document_df_count_x

['00' '000' '007' ... 'åÿan' 'åÿmaya' 'åžehnaz']


Unnamed: 0,imdb_id,title_x,release_year,00,000,007,05,10,100,1000,...,zor,zorro,zoya,zukijou,zuo,ã³n,ã¼ã,åÿan,åÿmaya,åžehnaz
0,tt0071853,Monty Python and the Holy Grail,1975,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0058385,My Fair Lady,1964,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0080453,The Blue Lagoon,1980,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0061418,Bonnie and Clyde,1967,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0054953,The Guns of Navarone,1961,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2473,tt13657102,The Tambour of Retribution,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2474,tt13879000,Pitta Kathalu,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Term-document table for description_y built with CountVectorizer.
term_document_df_count_y = vectorize_description(
    df=descriptions_df,
    description_column='description_y',
    VectorizerType=CountVectorizer,
)
term_document_df_count_y

['000' '007' '10' ... 'álex' 'ángel' 'über']


Unnamed: 0,imdb_id,title_x,release_year,000,007,10,100,1000,11,12,...,zone,zoo,zoom,zorro,zozo,zuckerberg,zurich,álex,ángel,über
0,tt0071853,Monty Python and the Holy Grail,1975,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0058385,My Fair Lady,1964,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0080453,The Blue Lagoon,1980,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0061418,Bonnie and Clyde,1967,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0054953,The Guns of Navarone,1961,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,tt14773250,Myriam Fares: The Journey,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2473,tt13657102,The Tambour of Retribution,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2474,tt13879000,Pitta Kathalu,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2475,tt14111708,Loyiso Gola: Unlearning,2021,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apply Latent Dirichlet Allocation

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Print the title and description_x text of every film whose description_x
# contains the garbled sequence tested below.
for title, description in zip(descriptions_df['title'], descriptions_df['description_x']):
  if 'â€“' not in description:
    continue
  print(title)
  print(description)

Final Destination 3
A student's premonition of a deadly rollercoaster ride saves her life and a lucky few, but not from death itself â€“ which seeks out those who escaped their fate.
Bleach the Movie: Hell Verse
Hell â€“ A place where beings that have committed mortal sins during their lifetime are sent. It is a realm where even Soul Reapers are forbidden to interfere. When a group of vicious Sinners plot to escape from this eternal prison, they discover that Substitute Soul Reaper Ichigo Kurosaki is the key to their freedom.
Luv Shuv Tey Chicken Khurana
On-the-run from the London mafia, Omi returns to his ancestral village â€“ a place he'd flown from with his grandad's money and grander â€˜London Dreams'. Will Omi be able to hide his failures while chasing love, and the lost recipe of the infamous dish, â€˜Chicken Khurana'?
Sabotage
John "Breacher" Wharton leads an elite DEA task force that takes on the world's deadliest drug cartels. When the team successfully executes a high-stakes 