<a href="https://colab.research.google.com/github/anthonybrown0528/csc-442-course-project/blob/main/notebook/hw6/hw6_vectorize_film_descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Import pandas to access the dataset
import pandas as pd

# Import a string vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Dataset

In [13]:
# Location of the cleaned Netflix/IMDb film table (CSV) in the project's GitHub repository.
dataset_path = 'https://raw.githubusercontent.com/anthonybrown0528/csc-442-course-project/refs/heads/main/dataset/clean/netflix_film_imdb_data.csv'

# Load the CSV into the DataFrame that the rest of the notebook works on.
netflix_film_imdb_scores_df = pd.read_csv(filepath_or_buffer=dataset_path)

# Address text encoding errors

Some film descriptions contain meaningless character sequences caused by errors when encoding and decoding text from bytes. The most common errors can be identified and corrected by mapping each garbled sequence to the character it was expected to encode.

Source: https://www.i18nqa.com/debug/utf8-debug.html

In [14]:
# Lookup table used by map_encoding: each key is a garbled (mis-decoded) character
# sequence and each value is the text substituted for it. Entries are applied in
# the order they are listed here.
# In Python 3 every str literal is already unicode, so the `u` prefix has no effect.
# NOTE(review): the first key appears to be the mis-decoded form of an en dash (–),
# while its replacement is an em dash (—) — confirm which one is intended.
encoding_mapping = {
    u'â€“': "—", # Use prefix to store unicode string. Source: https://docs.python.org/2/tutorial/introduction.html#unicode-strings
    u'â€œ': '"',
    u'â€ 	': '"',
    u'ãƒ™ã‚¤ãƒ–ãƒ¬ãƒ¼ãƒ‰ãƒãƒ¼ã‚¹ãƒˆGT(ã‚¬ãƒ': '', # non-latin characters (removed)
    u'à¤†à¤µà¤¾à¤°à¤¾ à¤ªà¤¾à¤—à¤² à¤¦à¥€à¤µà¤¾à¤¨à¤¾': '', # non-latin characters (removed)
    u'Ã©': 'é',
    u'Ã³': 'ó',
    u'â€™': "'"
}

def map_encoding(description, mapping=None):
  """Replace known mis-encoded character sequences in a film description.

  Args:
    description: Text to repair. Values that are not strings (for example the
      NaN pandas uses for a missing description) are returned unchanged.
    mapping: Optional dict of garbled sequence -> replacement text. When
      omitted, the module-level ``encoding_mapping`` is used.

  Returns:
    The description with every key of the mapping replaced by its value.
    Replacements are applied one after another in the mapping's iteration order.
  """
  # Missing values reach .apply() as non-strings; pass them through instead of
  # failing on .replace().
  if not isinstance(description, str):
    return description

  if mapping is None:
    mapping = encoding_mapping

  for garbled, replacement in mapping.items():
    description = description.replace(garbled, replacement)

  return description

# Repair the mis-encoded sequences in every film description and store the result
# back in the same column.
garbled_descriptions = netflix_film_imdb_scores_df['description_x']
netflix_film_imdb_scores_df['description_x'] = garbled_descriptions.apply(map_encoding)

# Perform Lemmatization

In [15]:
# Source: https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet
# Source: https://www.nltk.org/api/nltk.tokenize.sent_tokenize.html
# Source: https://www.nltk.org/book/ch05.html
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on for tokenizing, tagging and
# lemmatizing the descriptions.
# nltk.download() signals a failed download through a False return value rather
# than an exception, so check it here instead of failing later with a LookupError.
for nltk_resource in ('punkt_tab', 'wordnet', 'averaged_perceptron_tagger_eng', 'universal_tagset'):
  if not nltk.download(nltk_resource):
    raise RuntimeError(f"Could not download NLTK resource '{nltk_resource}'")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [16]:
# Source: https://cs.nyu.edu/~grishman/jet/guide/PennPOS.htm

# Single WordNet lemmatizer instance, created once and used by lemmatize_description.
lemmatizer = WordNetLemmatizer()

# Translation from NLTK universal part-of-speech tags to the WordNet POS codes
# passed to WordNetLemmatizer.lemmatize: 'n' noun, 'v' verb, 'a' adjective, 'r' adverb.
# Tags that have no WordNet counterpart are lemmatized as nouns ('n').
pos_mapping = {
    'NOUN': 'n',
    'ADV': 'r',  # WordNet's adverb code is 'r'; 'a' would lemmatize adverbs as adjectives
    'NUM': 'n',
    'PRON': 'n',
    'ADJ': 'a',
    'VERB': 'v',
    'PRT': 'n',
    'X': 'n',
    'ADP': 'n',
    'CONJ': 'n'
}

# Universal POS tags whose tokens are left out of the lemmatized text:
# determiners ('DET') and punctuation ('.').
pos_ignore = {'DET', '.'}

def lemmatize_description(description):
  """Lemmatize a film description token by token.

  The text is tokenized with ``word_tokenize`` and tagged with universal
  part-of-speech tags. Each token whose tag appears in ``pos_mapping`` is
  lemmatized with the mapped WordNet POS code. Tokens whose tag is in
  ``pos_ignore`` (or whose tag equals the token itself) are dropped, and any
  other token is kept unchanged.

  Args:
    description: Text to lemmatize. Values that are not strings (for example
      the NaN pandas uses for a missing description) are returned unchanged.

  Returns:
    The surviving lemmas joined by single spaces.
  """
  # Missing values reach .apply() as non-strings; pass them through instead of
  # failing inside the tokenizer.
  if not isinstance(description, str):
    return description

  tagged_tokens = nltk.pos_tag(word_tokenize(description), tagset='universal')

  lemma_sequence = []
  for token, pos in tagged_tokens:
    # Explicit lookup instead of a bare `except:` so that genuine errors
    # (lemmatizer failures, KeyboardInterrupt) are no longer swallowed.
    wordnet_pos = pos_mapping.get(pos)
    if wordnet_pos is not None:
      lemma_sequence.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    elif pos in pos_ignore or pos == token:
      # skip tokens that may not contribute to the meaning of the text
      continue
    else:
      # unrecognized part of speech: keep the token without transformation
      lemma_sequence.append(token)
  return ' '.join(lemma_sequence)

# Replace every film description with its lemmatized form, in the same column.
unlemmatized_descriptions = netflix_film_imdb_scores_df['description_x']
netflix_film_imdb_scores_df['description_x'] = unlemmatized_descriptions.apply(lemmatize_description)

In [17]:
# Write the frame with the processed descriptions to a CSV file, without the index column.
netflix_film_imdb_scores_df.to_csv(
    path_or_buf='netflix_film_imdb_scores_cleaned_descriptions_df.csv',
    index=False,
)