# Part 0 & 1

# Load dataset

In [1]:
from IPython import embed
from datasets import load_dataset
dataset = load_dataset('rotten_tomatoes')
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = train_dataset.to_pandas()
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


# Create vocab from training dataset

In [3]:
import nltk


def build_vocab(train_dataset):
    # Create set, unique words only
    vocab = set()
    train_dataset_pos = []
    
    # Loop thru each sentence in training dataset
    for sentence in train_dataset['text']:
        # Basic text processing
        
        # Case folding
        sentence = sentence.lower()
        
        # NLTK tokenizer does a good job at separating meaningful words + punctuations
        # Better than defining regex ourselves
        word_list = nltk.tokenize.word_tokenize(sentence)
        
        # # Further split words into separate words
        # # e.g., 'well-being' -> 'well', 'being'
        # # e.g., 'music/song' -> 'music', 'song'
        # split_word_list = []
        # for word in sentence_list:
        #     split_word_list.extend(word.replace('-', ' ').replace('/', ' ').split())
        
        # Dont remove all special characters, some are meaningful
        # Some words are surrounded by single/double quotes
        word_list = [word.strip("'\"") for word in word_list]
        
        # Add into set
        vocab.update(word_list)
        
        # Get pos tags
        # Also build POS tags
        pos_tags = nltk.pos_tag(word_list)
        train_dataset_pos.append(pos_tags)
        
    vocab.discard('')
    return vocab, train_dataset_pos

In [4]:
vocab, train_dataset_pos = build_vocab(train_dataset)

print(vocab)



In [5]:
print(train_dataset_pos[0])

[('the', 'DT'), ('rock', 'NN'), ('is', 'VBZ'), ('destined', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('s', 'VBD'), ('new', 'JJ'), ('``', '``'), ('conan', 'JJ'), ('``', '``'), ('and', 'CC'), ('that', 'IN'), ('he', 'PRP'), ('s', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('splash', 'NN'), ('even', 'RB'), ('greater', 'JJR'), ('than', 'IN'), ('arnold', 'RB'), ('schwarzenegger', 'NN'), (',', ','), ('jean-claud', 'JJ'), ('van', 'NN'), ('damme', 'NN'), ('or', 'CC'), ('steven', 'JJ'), ('segal', 'NN'), ('.', '.')]


## (a) What is the size of the vocabulary formed from your training data?

In [6]:
print(f'Vocab size/Unique words: {len(vocab)}')

Vocab size/Unique words: 17841


# Part 1: Preparing Word Embeddings

## Download GloVe embeddings: https://nlp.stanford.edu/projects/glove/
- Uncased means all words are lowercase

In [7]:
# Load GloVe embeddings
import numpy as np

def load_glove_embeddings(path):
    glove_embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float64')
            glove_embeddings[word] = vector
            
    return glove_embeddings

In [8]:
glove_embeddings = load_glove_embeddings('glove.6B.50d.txt')

In [9]:
print(f'Glove embedding matrix for "the":\n {glove_embeddings["the"]}')

Glove embedding matrix for "the":
 [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


In [10]:
# Print size of the matrix
print(f'Number of unique words in embedding matrix: {len(glove_embeddings)}')

print(f'Number of Dimension/Features of Glove embedding matrix: {glove_embeddings["the"].shape[0]}')

Number of unique words in embedding matrix: 400000
Number of Dimension/Features of Glove embedding matrix: 50


## Create embedding matrix

In [11]:
# Finalize vocab
vocab_word_to_index = {word: idx for idx, word in enumerate(vocab)}

In [12]:
vocab_word_to_index

{'seams': 0,
 'not-so-bright': 1,
 'sees': 2,
 'toe-to-toe': 3,
 'options': 4,
 'subplots': 5,
 'gosto': 6,
 'whistles': 7,
 'campaign-trail': 8,
 'vinegar': 9,
 'antlers': 10,
 'zingers': 11,
 'floyd': 12,
 'laramie': 13,
 'muttering': 14,
 'pausing': 15,
 'transgression': 16,
 'advanced': 17,
 'pogue': 18,
 'insensitive': 19,
 'sniping': 20,
 'ruse': 21,
 'bros': 22,
 'parallel': 23,
 'ramshackle': 24,
 'wills': 25,
 'forwards': 26,
 'bit': 27,
 'dutifully': 28,
 'follows': 29,
 'tunisian': 30,
 'rah-rah': 31,
 'strangling': 32,
 'babbitt': 33,
 'nothing': 34,
 'fun': 35,
 'comprehension': 36,
 'fetid': 37,
 'shy': 38,
 'overblown': 39,
 'author': 40,
 'passe': 41,
 'breakthrough': 42,
 'loosely-connected': 43,
 'bone-crushing': 44,
 'tso': 45,
 'peaked': 46,
 'extent': 47,
 'tout': 48,
 'levy': 49,
 'cliché-riddled': 50,
 'smoke': 51,
 'bergmanesque': 52,
 'pretends': 53,
 'usually': 54,
 'majority-oriented': 55,
 'violence': 56,
 'undiscovered': 57,
 'warren': 58,
 'music': 59,
 'd

In [13]:
def create_embedding_matrix(word_to_index, glove_embeddings):
    # Initialize embedding matrix with zeros
    # 50d
    embedding_matrix = np.zeros((len(vocab), 50), dtype='float64')
    
    # Loop thru each word in vocab
    for word, idx in word_to_index.items():
        # Check if word exists in glove embeddings
        if word in glove_embeddings:
            # Copy glove embedding to embedding matrix
            embedding_matrix[idx] = glove_embeddings[word]
            # If OOV, assign None first
            
    return embedding_matrix

In [14]:
embedding_matrix = create_embedding_matrix(vocab_word_to_index, glove_embeddings)

In [15]:
embedding_matrix

array([[-0.49014 , -0.20967 ,  0.12486 , ..., -0.076962, -0.23988 ,
        -1.447   ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.24819 ,  0.51129 ,  0.18866 , ...,  0.08011 ,  0.29623 ,
         0.4851  ],
       ...,
       [ 0.21615 , -0.24888 , -0.17207 , ..., -1.0103  , -0.24482 ,
         1.3024  ],
       [-0.12263 ,  0.34502 , -0.81199 , ...,  0.63956 , -0.36351 ,
        -0.48047 ],
       [ 0.10389 , -1.2005  , -0.30101 , ..., -0.41072 ,  0.74173 ,
         0.59841 ]])

## (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

In [16]:
def get_oov_words(embedding_matrix, vocab_word_to_index):
    oov_words = []
    
    for word, idx in vocab_word_to_index.items():
        # Check if zero matrix
        if np.sum(embedding_matrix[idx]) == 0:
            oov_words.append(word)

    return oov_words

In [17]:
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

In [18]:
oov_words.sort()
oov_words[:200]

['-a',
 '-after',
 '-doing-it-for',
 '-greaseballs',
 '-hollywood',
 '-inevitable',
 '-of-the-week',
 '-stunning',
 '-the-cash',
 '-the-night',
 '-west',
 '-white',
 '1/2-hour',
 '102-minute',
 '10th-grade',
 '112-minute',
 '129-minute',
 '170-minute',
 '179-minute',
 '28k',
 '3-year-olds',
 '3/4th',
 '4/5ths',
 '60s-homage',
 '79-minute',
 '8217',
 '99-minute',
 'a-bornin',
 'a-knocking',
 'abandone',
 'aborbing',
 'absolutamente',
 'aburrido',
 'acabamos',
 'accomodates',
 'accordion/harmonica/banjo',
 'aceitou',
 'achival',
 'achronological',
 'acontecimentos',
 'acting-workshop',
 'action-fantasy',
 'action-filled',
 'action-movie',
 'action-thriller/dark',
 'action/comedy',
 'action/effects',
 'actorish',
 'actory',
 'actress-producer',
 'actuación',
 'actuada',
 'adapted-',
 'addessi',
 'adventues',
 'affectation-free',
 'affirmational',
 'again-courage',
 'age-inspired',
 'age-wise',
 'ages-old',
 "ain't-",
 'alientation',
 'all-in-all',
 'all-over-the-map',
 'all-too',
 'all-to

In [19]:
print(f"Number of OOV words: {len(oov_words)}")

Number of OOV words: 1668


In [20]:
train_dataset_pos

[[('the', 'DT'),
  ('rock', 'NN'),
  ('is', 'VBZ'),
  ('destined', 'VBN'),
  ('to', 'TO'),
  ('be', 'VB'),
  ('the', 'DT'),
  ('21st', 'JJ'),
  ('century', 'NN'),
  ('s', 'VBD'),
  ('new', 'JJ'),
  ('``', '``'),
  ('conan', 'JJ'),
  ('``', '``'),
  ('and', 'CC'),
  ('that', 'IN'),
  ('he', 'PRP'),
  ('s', 'VBZ'),
  ('going', 'VBG'),
  ('to', 'TO'),
  ('make', 'VB'),
  ('a', 'DT'),
  ('splash', 'NN'),
  ('even', 'RB'),
  ('greater', 'JJR'),
  ('than', 'IN'),
  ('arnold', 'RB'),
  ('schwarzenegger', 'NN'),
  (',', ','),
  ('jean-claud', 'JJ'),
  ('van', 'NN'),
  ('damme', 'NN'),
  ('or', 'CC'),
  ('steven', 'JJ'),
  ('segal', 'NN'),
  ('.', '.')],
 [('the', 'DT'),
  ('gorgeously', 'RB'),
  ('elaborate', 'JJ'),
  ('continuation', 'NN'),
  ('of', 'IN'),
  ('``', '``'),
  ('the', 'DT'),
  ('lord', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('rings', 'NNS'),
  ('``', '``'),
  ('trilogy', 'NN'),
  ('is', 'VBZ'),
  ('so', 'RB'),
  ('huge', 'JJ'),
  ('that', 'IN'),
  ('a', 'DT'),
  ('column', 'N

In [21]:
# Find POS tags for OOV words
oov_words_pos = {}
for oov_word in oov_words:
    # Find all possible POS for OOV word
    pos_tags = []
    for sentence_pos in train_dataset_pos:
        for word, pos in sentence_pos:
            if word == oov_word:
                pos_tags.append(pos)
    # # Assign
    oov_words_pos[oov_word] = pos_tags

In [22]:
oov_words_pos

{'-a': ['NNP'],
 '-after': ['NNP'],
 '-doing-it-for': ['JJ'],
 '-greaseballs': ['NNS'],
 '-hollywood': ['NN'],
 '-inevitable': ['JJ'],
 '-of-the-week': ['JJ'],
 '-stunning': ['NN'],
 '-the-cash': ['NNP'],
 '-the-night': ['NN'],
 '-west': ['JJS'],
 '-white': ['NN'],
 '1/2-hour': ['JJ'],
 '102-minute': ['JJ'],
 '10th-grade': ['JJ'],
 '112-minute': ['JJ'],
 '129-minute': ['JJ'],
 '170-minute': ['JJ'],
 '179-minute': ['JJ'],
 '28k': ['CD'],
 '3-year-olds': ['NNS'],
 '3/4th': ['CD'],
 '4/5ths': ['CD'],
 '60s-homage': ['JJ'],
 '79-minute': ['JJ'],
 '8217': ['CD', 'CD'],
 '99-minute': ['JJ'],
 'a-bornin': ['JJ'],
 'a-knocking': ['NN'],
 'abandone': ['NN'],
 'aborbing': ['VBG'],
 'absolutamente': ['NN'],
 'aburrido': ['NN'],
 'acabamos': ['NN'],
 'accomodates': ['VBZ'],
 'accordion/harmonica/banjo': ['NN'],
 'aceitou': ['NN'],
 'achival': ['JJ'],
 'achronological': ['NN', 'JJ'],
 'acontecimentos': ['NNS'],
 'acting-workshop': ['JJ'],
 'action-fantasy': ['JJ'],
 'action-filled': ['JJ'],
 'actio

## (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

## Pass 1: Use Stemming to match OOV words of different forms to the same word in GloVe

In [23]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Stem vocab
stemmed_glove_vocab = {ps.stem(word): word for word in glove_embeddings.keys()}

# Stem vocab and OOV words, find same word
def find_substitute_word_stem(oov_word, stemmed_glove_vocab):
    stemmed_oov_word = ps.stem(oov_word)
    if stemmed_oov_word in stemmed_glove_vocab.keys():
        return stemmed_glove_vocab[stemmed_oov_word]
    
    return None

In [24]:
# Copy substitute word embedding to OOV word
for oov_word in oov_words:
    best_substitute_word = find_substitute_word_stem(oov_word, stemmed_glove_vocab)
    if best_substitute_word:
        print(f'OOV word: {oov_word}, substitute word: {best_substitute_word}')
        embedding_matrix[vocab_word_to_index[oov_word]] = glove_embeddings[best_substitute_word]

OOV word: 3-year-olds, substitute word: 3-year-old
OOV word: abandone, substitute word: abandonned
OOV word: accomodates, substitute word: accomodative
OOV word: aceitou, substitute word: aceite
OOV word: affirmational, substitute word: affirmance
OOV word: allodi, substitute word: allodial
OOV word: anteing, substitute word: anting
OOV word: anti-date, substitute word: anti-de
OOV word: aqueles, substitute word: aquel
OOV word: birot, substitute word: birote
OOV word: bjorkness, substitute word: bjorke
OOV word: bug-eye, substitute word: bug-eyed
OOV word: butterfingered, substitute word: butterfingers
OOV word: by-the-books, substitute word: by-the-book
OOV word: cadness, substitute word: cads
OOV word: capturou, substitute word: captures
OOV word: cat-and-mouser, substitute word: cat-and-mouse
OOV word: cineasts, substitute word: cineaste
OOV word: co-dependence, substitute word: co-dependent
OOV word: colonics, substitute word: colonizations
OOV word: complejos, substitute word: co

In [25]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 1527


## Pass 2: Use Wordnet synonyms bank based on sentence POS tagging

In [26]:
oov_words_pos

{'-a': ['NNP'],
 '-after': ['NNP'],
 '-doing-it-for': ['JJ'],
 '-greaseballs': ['NNS'],
 '-hollywood': ['NN'],
 '-inevitable': ['JJ'],
 '-of-the-week': ['JJ'],
 '-stunning': ['NN'],
 '-the-cash': ['NNP'],
 '-the-night': ['NN'],
 '-west': ['JJS'],
 '-white': ['NN'],
 '1/2-hour': ['JJ'],
 '102-minute': ['JJ'],
 '10th-grade': ['JJ'],
 '112-minute': ['JJ'],
 '129-minute': ['JJ'],
 '170-minute': ['JJ'],
 '179-minute': ['JJ'],
 '28k': ['CD'],
 '3-year-olds': ['NNS'],
 '3/4th': ['CD'],
 '4/5ths': ['CD'],
 '60s-homage': ['JJ'],
 '79-minute': ['JJ'],
 '8217': ['CD', 'CD'],
 '99-minute': ['JJ'],
 'a-bornin': ['JJ'],
 'a-knocking': ['NN'],
 'abandone': ['NN'],
 'aborbing': ['VBG'],
 'absolutamente': ['NN'],
 'aburrido': ['NN'],
 'acabamos': ['NN'],
 'accomodates': ['VBZ'],
 'accordion/harmonica/banjo': ['NN'],
 'aceitou': ['NN'],
 'achival': ['JJ'],
 'achronological': ['NN', 'JJ'],
 'acontecimentos': ['NNS'],
 'acting-workshop': ['JJ'],
 'action-fantasy': ['JJ'],
 'action-filled': ['JJ'],
 'actio

In [27]:
from nltk.corpus import wordnet as wn

pos_mapping_dict = {'NN':wn.NOUN,
              'JJ':wn.ADJ,
              'VB':wn.VERB,
              'RB':wn.ADV,
              # NLTK does not have wn.ADV_SAT
              }

# Convert oov_words_pos to wordnet POS
def map_pos_list(pos_list, mapping_dict):
    mapped_pos = []
    for pos in pos_list:
        # Find the first matching POS tag in mapping_dict by first letter comparison
        matched_pos = next((mapping_dict[key] for key in mapping_dict if key[0] == pos[0]), None)
        mapped_pos.append(matched_pos)
    return mapped_pos

# Creating the pos list
oov_words_pos_wordnet = {}
for key, pos_list in oov_words_pos.items():
    mapped_pos_list = map_pos_list(pos_list, pos_mapping_dict)
    
    # Remove None from list
    cleaned_list = [pos for pos in mapped_pos_list if pos is not None]
    
    # If all nones, remove whole entry
    if cleaned_list:
        oov_words_pos_wordnet[key] = cleaned_list
    

# Print the new dictionary
for key, value in oov_words_pos_wordnet.items():
    print(f"{key}: {value}")

-a: ['n']
-after: ['n']
-doing-it-for: ['a']
-greaseballs: ['n']
-hollywood: ['n']
-inevitable: ['a']
-of-the-week: ['a']
-stunning: ['n']
-the-cash: ['n']
-the-night: ['n']
-west: ['a']
-white: ['n']
1/2-hour: ['a']
102-minute: ['a']
10th-grade: ['a']
112-minute: ['a']
129-minute: ['a']
170-minute: ['a']
179-minute: ['a']
3-year-olds: ['n']
60s-homage: ['a']
79-minute: ['a']
99-minute: ['a']
a-bornin: ['a']
a-knocking: ['n']
abandone: ['n']
aborbing: ['v']
absolutamente: ['n']
aburrido: ['n']
acabamos: ['n']
accomodates: ['v']
accordion/harmonica/banjo: ['n']
aceitou: ['n']
achival: ['a']
achronological: ['n', 'a']
acontecimentos: ['n']
acting-workshop: ['a']
action-fantasy: ['a']
action-filled: ['a']
action-movie: ['a', 'a']
action-thriller/dark: ['a']
action/comedy: ['n']
action/effects: ['n']
actorish: ['a']
actory: ['a']
actress-producer: ['n']
actuada: ['n', 'a']
adapted-: ['a']
addessi: ['n']
adventues: ['n']
affectation-free: ['a']
affirmational: ['a']
again-courage: ['a']
age-

In [28]:
def find_substitute_wordnet_synonym(oov_word, oov_words_pos_wordnet):
    # Find synonyms for OOV words
    # Retrieve POS tags list
    try:
        pos_list = oov_words_pos_wordnet[oov_word]
    except KeyError:
        return None
    
    # Find number of unique pos, except None
    unique_pos = list(set(pos_list))
    # sort by count
    unique_pos.sort(key=lambda x: pos_list.count(x), reverse=True)
    
    # Loop thru each unique pos, try to find synonyms
    for possible_pos_tag in unique_pos:
        # Get synonyms
        for synset in wn.synsets(oov_word, pos=possible_pos_tag):
            for lemma in synset.lemmas():
                if lemma.name() in glove_embeddings:
                    print(f'OOV word: {oov_word}, synonym: {lemma.name()}')
                    return glove_embeddings[lemma.name()]

In [29]:
for oov_word in oov_words:
    synonym_glove_embedding = find_substitute_wordnet_synonym(oov_word, oov_words_pos_wordnet)
    if synonym_glove_embedding is not None:
        embedding_matrix[vocab_word_to_index[oov_word]] = synonym_glove_embedding

OOV word: punch-drunk, synonym: silly
OOV word: hopped-up, synonym: stoned
OOV word: corniest, synonym: corny
OOV word: tardier, synonym: belated
OOV word: teary-eyed, synonym: teary
OOV word: divertingly, synonym: amusingly
OOV word: razzle-dazzle, synonym: razzle
OOV word: old-hat, synonym: banal
OOV word: off-the-rack, synonym: off-the-shelf
OOV word: perfervid, synonym: ardent
OOV word: self-involved, synonym: self-absorbed
OOV word: excrescence, synonym: bulge
OOV word: cannier, synonym: cagey
OOV word: stuffiest, synonym: airless
OOV word: beseechingly, synonym: pleadingly
OOV word: greasiest, synonym: greasy
OOV word: rip-roaring, synonym: uproarious
OOV word: fine-looking, synonym: good-looking
OOV word: tongue-tied, synonym: incoherent
OOV word: ho-hum, synonym: boring
OOV word: well-meant, synonym: well-intentioned
OOV word: fuddled, synonym: befuddle
OOV word: ham-fisted, synonym: bumbling
OOV word: dewy-eyed, synonym: childlike
OOV word: wishy-washy, synonym: gutless
OOV wo

In [30]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 1489


## Pass 3: Use Edit Distance to solve misspelled OOV words

In [31]:
from Levenshtein import distance as lev

# Find most similar word for OOV word
def find_substitute_word_edit_dist(oov_word, glove_embeddings):
    # Set to infinity first
    min_dist = float('inf')
    closest_word = None
    
    # Loop thru all words in glove embeddings
    for word in glove_embeddings:
        # Calculate edit distance
        dist = lev(oov_word, word)
        
        # Update if new minimum distance found
        if dist < min_dist:
            min_dist = dist
            closest_word = word
            
    return closest_word, min_dist

In [32]:
min_dist_thresh = 1

for oov_word in oov_words:
    best_substitute_word, min_dist = find_substitute_word_edit_dist(oov_word, glove_embeddings)
    if min_dist <= min_dist_thresh:
        print(f'OOV word: {oov_word}, substitute word: {best_substitute_word}, Distance: {min_dist}')
        # Copy substitute word embedding to OOV word
        embedding_matrix[vocab_word_to_index[oov_word]] = glove_embeddings[best_substitute_word]

OOV word: gosto, substitute word: gusto, Distance: 1
OOV word: toolbags, substitute word: toolbars, Distance: 1
OOV word: -stunning, substitute word: stunning, Distance: 1
OOV word: coma-like, substitute word: comb-like, Distance: 1
OOV word: espectáculo, substitute word: espectaculo, Distance: 1
OOV word: two-actor, substitute word: two-factor, Distance: 1
OOV word: four-, substitute word: four, Distance: 1
OOV word: marcken, substitute word: macken, Distance: 1
OOV word: post-colonialist, substitute word: post-colonialism, Distance: 1
OOV word: re-fried, substitute word: refried, Distance: 1
OOV word: thesps, substitute word: thesis, Distance: 1
OOV word: shakesperean, substitute word: shakespearean, Distance: 1
OOV word: government/, substitute word: government, Distance: 1
OOV word: class-, substitute word: class, Distance: 1
OOV word: próprio, substitute word: proprio, Distance: 1
OOV word: -hollywood, substitute word: hollywood, Distance: 1
OOV word: então, substitute word: antão

# Remaining OOV words

In [33]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 1161


## Last pass: Replace OOV words with <UNK> token

In [34]:
# Get min max range of glove embeddings
all_embeddings = np.stack(list(glove_embeddings.values()))
min_val = np.min(all_embeddings)
max_val = np.max(all_embeddings)

print(f"min: {min_val}")
print(f"max: {max_val}")

mean_embedding = np.mean(all_embeddings)
std_embedding = np.std(all_embeddings)

print(f"Mean: {mean_embedding}, Standard Deviation: {std_embedding}")

min: -5.4593
max: 5.3101
Mean: 0.020940489508694315, Standard Deviation: 0.6441042976813115


In [35]:
unk_token = '<UNK>'
# Random embedding for <UNK> token
embedding_dim = 50
unk_embedding = np.random.uniform(-0.5, 0.5, embedding_dim)

# Assign <UNK> token embedding to OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)
for oov_word in oov_words:
    embedding_matrix[vocab_word_to_index[oov_word]] = unk_embedding
    
# Add <UNK> token to vocab and embedding matrix
vocab_word_to_index[unk_token] = len(vocab)
embedding_matrix = np.vstack([embedding_matrix, unk_embedding])

In [36]:
embedding_matrix[17841]

array([ 0.01513297,  0.2400952 , -0.13676383,  0.13166569, -0.28283166,
        0.10421129,  0.39747017,  0.07944959,  0.29670785,  0.05400998,
        0.48425894,  0.26516231, -0.48021244, -0.25129253, -0.24367068,
       -0.24188322,  0.47579495, -0.2097357 , -0.02568224, -0.31143999,
       -0.3196337 ,  0.44878632, -0.07379564,  0.32765833, -0.49052161,
       -0.33455611, -0.34772199, -0.05043562, -0.0898296 ,  0.04898804,
        0.4993778 ,  0.04359836,  0.40077601, -0.31343237,  0.24126281,
       -0.4907152 , -0.20372591, -0.32123346, -0.39554707,  0.37386547,
        0.44720326,  0.45492689, -0.16420979,  0.42844699,  0.15748723,
       -0.23547929, -0.33962153,  0.04243802, -0.03647524, -0.0042893 ])

## Alternative methods to test
- FastText, break words into subwords and use subword embeddings
- semantic similarity with cosine similarity
- use model to predict oov embeddings Mimick https://github.com/yuvalpinter/Mimick


# Finally, save embedding matrix and vocab_to_index mapping

In [37]:
import pickle


with open('embedding_matrix.pkl', 'wb') as f:
    pickle.dump(embedding_matrix, f)
    
with open('vocab_word_to_index.pkl', 'wb') as f:
    pickle.dump(vocab_word_to_index, f)