# Part 0 & 1

# Load dataset

In [1]:
from datasets import load_dataset
# Fetch the Rotten Tomatoes dataset and bind each of its three splits to a name.
dataset = load_dataset('rotten_tomatoes')
train_dataset, validation_dataset, test_dataset = (
    dataset['train'],
    dataset['validation'],
    dataset['test'],
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Convert the training split to a pandas DataFrame and preview its first rows.
train_df = train_dataset.to_pandas()
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


# Create vocab from training dataset

In [17]:
import string

def build_vocab(train_dataset):
    """Collect the distinct whitespace-separated tokens of the training text.

    Args:
        train_dataset: Object whose ``'text'`` entry is an iterable of
            sentence strings.

    Returns:
        set[str]: Every unique token produced by calling ``str.split()``
        (no arguments) on each sentence. Tokens are kept exactly as they
        appear: no case folding and no punctuation stripping.
    """
    unique_tokens = {
        token
        for text in train_dataset['text']
        for token in text.split()
    }

    # Argument-less split() never yields '', so this is purely defensive.
    unique_tokens.discard('')
    return unique_tokens

In [18]:
# Build the vocabulary from the training split loaded earlier.
vocab = build_vocab(train_dataset)

## (a) What is the size of the vocabulary formed from your training data?

In [20]:
# Answer to (a): number of distinct whitespace-separated tokens in the training text.
print(f'Vocab size/Unique words: {len(vocab)}')

Vocab size/Unique words: 18951


# Part 1: Preparing Word Embeddings

## Download GloVe embeddings: https://nlp.stanford.edu/projects/glove/
- Uncased means all words are lowercase

In [21]:
# Load GloVe embeddings
import numpy as np

def load_glove_embeddings(path):
    """Load GloVe vectors from a text file into a word -> vector dictionary.

    Each non-blank line is expected to hold a token followed by its
    space-separated coefficients. Blank lines are skipped.

    Args:
        path: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict: Maps each token (str) to a 1-D ``float64`` ``np.ndarray``.
        If a token occurs on several lines, the last occurrence wins.

    Raises:
        ValueError: If a coefficient cannot be parsed as a float.
    """
    glove_embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line carries no token; indexing into it would fail.
            if not line.strip():
                continue

            # Trim only ASCII spaces and line endings so a token that is
            # itself made of Unicode whitespace is not stripped away.
            entry = line.strip(' \r\n')

            # Cut the token off at the first ASCII space. A bare split()
            # would also break on Unicode whitespace inside the token
            # (e.g. U+00A0) and push token fragments into the float parsing.
            word, _, coefficients = entry.partition(' ')
            vector = np.asarray(coefficients.split(), dtype='float64')
            glove_embeddings[word] = vector

    return glove_embeddings

In [22]:
# Load the GloVe vectors; the relative path means the file must sit in the working directory.
glove_embeddings = load_glove_embeddings('glove.6B.50d.txt')

In [23]:
# Sanity check: show the vector stored for the word "the".
print(f'Glove embedding matrix for "the":\n {glove_embeddings["the"]}')

Glove embedding matrix for "the":
 [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


In [24]:
# Report how many words have an embedding.
print(f'Number of unique words in embedding matrix: {len(glove_embeddings)}')

# The dimensionality is read off a single entry ("the"); this assumes every vector has the same length.
print(f'Number of Dimension/Features of Glove embedding matrix: {glove_embeddings["the"].shape[0]}')

Number of unique words in embedding matrix: 400000
Number of Dimension/Features of Glove embedding matrix: 50


## (b) We use OOV (out-of-vocabulary) to refer to words that appear in the training data but not in the Word2vec (or GloVe) dictionary. How many OOV words exist in your training data?

## Method 1: Check difference directly

In [52]:
def get_oov_words(vocab, glove_embeddings):
    """Return the words of ``vocab`` that have no entry in ``glove_embeddings``.

    Args:
        vocab: Iterable of words to check.
        glove_embeddings: Container supporting ``in`` (e.g. a dict keyed by word).

    Returns:
        list: The out-of-vocabulary words, in the iteration order of ``vocab``.
    """
    return [token for token in vocab if token not in glove_embeddings]

In [53]:
# Method 1: OOV words are the vocabulary tokens without a GloVe entry.
oov_words = get_oov_words(vocab, glove_embeddings)

In [54]:
# Sort in place (vocab is a set, so the list order is otherwise arbitrary), then preview the first 100.
oov_words.sort()
oov_words[:100]

['#3',
 '#9',
 '$1',
 '$100',
 '$20',
 '$40',
 '$50-million',
 '$7',
 '$9',
 '$99',
 "''independent",
 "'50's",
 "'60s-homage",
 "'70's",
 "'[hopkins]doesn't",
 "'[the",
 "'a",
 "'a'",
 "'abandon",
 "'ace",
 "'action",
 "'ah",
 "'alabama'",
 "'all",
 "'alternate",
 "'amateur'",
 "'analyze",
 "'angels",
 "'anyone",
 "'are",
 "'artistically'",
 "'artístico'",
 "'assassin'",
 "'aunque",
 "'b'",
 "'bad'",
 "'baran",
 "'barbershop",
 "'barbershop'",
 "'bartleby'",
 "'been",
 "'belgium's",
 "'best",
 "'blade",
 "'blood'",
 "'blue",
 "'blundering'",
 "'bold'",
 "'bowling",
 "'brazil",
 "'butterfingered'",
 "'carente",
 "'challenging'",
 "'chan",
 "'charly'",
 "'chick",
 "'children's'",
 "'chops'",
 "'christian",
 "'classic",
 "'classic'",
 "'co-stars",
 "'comedian'",
 "'comedy",
 "'compleja",
 "'cq",
 "'credit'",
 "'cultural",
 "'date",
 "'de",
 "'deadly",
 "'difficult'",
 "'divertida",
 "'do",
 "'dog'",
 "'dragonfly'",
 "'drama",
 "'drumline'",
 "'dumb",
 "'e'",
 "'easier'",
 "'easily",
 "'e

In [55]:
# Answer to (b), Method 1: number of OOV tokens.
len(oov_words)

3036

## Method 2: Train tokenizer and check OOV words

In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer


# Fit a Keras Tokenizer with its default settings on the raw training text.
# NOTE(review): the defaults presumably lowercase the text and filter most punctuation,
# which would explain the smaller vocabulary than build_vocab — confirm against the Keras docs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_dataset['text'])

In [57]:
# Size of the fitted tokenizer's word_index, i.e. its vocabulary.
print(f"Number of unique words in dictionary= {len(tokenizer.word_index)}")

Number of unique words in dictionary= 17451


In [58]:
# Method 2: reuse get_oov_words on the tokenizer's vocabulary instead of repeating its loop.
# Iterating word_index yields its keys (the words), so the result matches the manual loop.
# Note: this rebinds oov_words, replacing the Method 1 result.
oov_words = get_oov_words(tokenizer.word_index, glove_embeddings)

In [59]:
# Sort the Method 2 OOV words in place and preview the first 100.
oov_words.sort()
oov_words[:100]

["''independent",
 "'50's",
 "'70's",
 "'a",
 "'a'",
 "'abandon",
 "'ace",
 "'action",
 "'ah",
 "'alabama'",
 "'all",
 "'alternate",
 "'amateur'",
 "'analyze",
 "'angels",
 "'anyone",
 "'are",
 "'artistically'",
 "'artístico'",
 "'assassin'",
 "'aunque",
 "'b'",
 "'bad'",
 "'baran",
 "'barbershop",
 "'barbershop'",
 "'bartleby'",
 "'been",
 "'belgium's",
 "'best",
 "'blade",
 "'blood'",
 "'blue",
 "'blundering'",
 "'bold'",
 "'bowling",
 "'brazil",
 "'butterfingered'",
 "'carente",
 "'challenging'",
 "'chan",
 "'charly'",
 "'chick",
 "'children's'",
 "'chops'",
 "'christian",
 "'classic",
 "'classic'",
 "'co",
 "'comedian'",
 "'comedy",
 "'compleja",
 "'cq",
 "'credit'",
 "'cultural",
 "'date",
 "'de",
 "'deadly",
 "'difficult'",
 "'divertida",
 "'do",
 "'dog'",
 "'dragonfly'",
 "'drama",
 "'drumline'",
 "'dumb",
 "'e'",
 "'easier'",
 "'easily",
 "'edgy",
 "'ejemplo",
 "'empowerment",
 "'enigma'",
 "'epic",
 "'estupendamente",
 "'evelyn",
 "'face",
 "'fatal",
 "'film",
 "'fish",
 "'fra

In [60]:
# Answer to (b), Method 2: OOV count after Keras tokenization.
print(f"Number of OOV words= {len(oov_words)}")

Number of OOV words= 1760


## (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

Most of the OOV words are tokens that contain punctuation, numbers, or special characters (for example `'classic'`, `$100`, or `#3` in the lists above).