# Part 0 & 1

# Load dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset('rotten_tomatoes')
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [2]:
train_df = train_dataset.to_pandas()

In [3]:
# # Load augmented dataset
# import pandas as pd
# 
# train_df = pd.read_csv('augmented_combined_train_dataset.csv')

In [4]:
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


# Create vocab from training dataset

In [5]:
import nltk


def build_vocab(train_dataset):
    # Create set, unique words only
    vocab = set()
    train_dataset_pos = []
    
    # Loop thru each sentence in training dataset
    for sentence in train_dataset['text']:
        # Basic text processing
        
        # Case folding
        sentence = sentence.lower()
        
        # NLTK tokenizer does a good job at separating meaningful words + punctuations
        # Better than defining regex ourselves
        word_list = nltk.tokenize.word_tokenize(sentence)
        
        # Further split words into separate words
        # e.g., 'well-being' -> 'well', 'being'
        # e.g., 'music/song' -> 'music', 'song'
        split_word_list = []
        for word in word_list:
            # If the word contains a hyphen or a slash, split it further
            if '-' in word or '/' in word:
                split_word_list.extend(word.replace('-', ' ').replace('/', ' ').split())
            else:
                split_word_list.append(word)
        
        # Dont remove all special characters, some are meaningful
        # Some words are surrounded by single/double quotes
        final_word_list = [word.strip("'\"") for word in split_word_list]
        
        # Add into set
        vocab.update(final_word_list)
        
        # Get pos tags
        # Also build POS tags
        pos_tags = nltk.pos_tag(final_word_list)
        train_dataset_pos.append(pos_tags)
        
    vocab.discard('')
    return vocab, train_dataset_pos

In [6]:
vocab, train_dataset_pos = build_vocab(train_df)

print(vocab)



In [7]:
print(train_dataset_pos[0])

[('the', 'DT'), ('rock', 'NN'), ('is', 'VBZ'), ('destined', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('s', 'VBD'), ('new', 'JJ'), ('``', '``'), ('conan', 'JJ'), ('``', '``'), ('and', 'CC'), ('that', 'IN'), ('he', 'PRP'), ('s', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('splash', 'NN'), ('even', 'RB'), ('greater', 'JJR'), ('than', 'IN'), ('arnold', 'RB'), ('schwarzenegger', 'NN'), (',', ','), ('jean', 'JJ'), ('claud', 'NN'), ('van', 'NN'), ('damme', 'NN'), ('or', 'CC'), ('steven', 'JJ'), ('segal', 'NN'), ('.', '.')]


## (a) What is the size of the vocabulary formed from your training data?

In [8]:
print(f'Vocab size/Unique words: {len(vocab)}')

Vocab size/Unique words: 17884


# Part 1: Preparing Word Embeddings

## Download GloVe embeddings: https://nlp.stanford.edu/projects/glove/
- Uncased means all words are lowercase

In [9]:
# Load GloVe embeddings
import numpy as np

def load_glove_embeddings(path):
    glove_embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float64')
            glove_embeddings[word] = vector
            
    return glove_embeddings

In [10]:
glove_embeddings = load_glove_embeddings('glove.6B.200d.txt')

In [11]:
print(f'Glove embedding matrix for "the":\n {glove_embeddings["the"]}')

Glove embedding matrix for "the":
 [-7.1549e-02  9.3459e-02  2.3738e-02 -9.0339e-02  5.6123e-02  3.2547e-01
 -3.9796e-01 -9.2139e-02  6.1181e-02 -1.8950e-01  1.3061e-01  1.4349e-01
  1.1479e-02  3.8158e-01  5.4030e-01 -1.4088e-01  2.4315e-01  2.3036e-01
 -5.5339e-01  4.8154e-02  4.5662e-01  3.2338e+00  2.0199e-02  4.9019e-02
 -1.4132e-02  7.6017e-02 -1.1527e-01  2.0060e-01 -7.7657e-02  2.4328e-01
  1.6368e-01 -3.4118e-01 -6.6070e-02  1.0152e-01  3.8232e-02 -1.7668e-01
 -8.8153e-01 -3.3895e-01 -3.5481e-02 -5.5095e-01 -1.6899e-02 -4.3982e-01
  3.9004e-02  4.0447e-01 -2.5880e-01  6.4594e-01  2.6641e-01  2.8009e-01
 -2.4625e-02  6.3302e-01 -3.1700e-01  1.0271e-01  3.0886e-01  9.7792e-02
 -3.8227e-01  8.6552e-02  4.7075e-02  2.3511e-01 -3.2127e-01 -2.8538e-01
  1.6670e-01 -4.9707e-03 -6.2714e-01 -2.4904e-01  2.9713e-01  1.4379e-01
 -1.2325e-01 -5.8178e-02 -1.0290e-03 -8.2126e-02  3.6935e-01 -5.8442e-04
  3.4286e-01  2.8426e-01 -6.8599e-02  6.5747e-01 -2.9087e-02  1.6184e-01
  7.3672e-02 -3.

In [12]:
# Print size of the matrix
print(f'Number of unique words in embedding matrix: {len(glove_embeddings)}')

print(f'Number of Dimension/Features of Glove embedding matrix: {glove_embeddings["the"].shape[0]}')

Number of unique words in embedding matrix: 400000
Number of Dimension/Features of Glove embedding matrix: 200


## Create embedding matrix

In [13]:
# Finalize vocab
vocab_word_to_index = {word: idx for idx, word in enumerate(vocab)}

In [14]:
vocab_word_to_index

{'determination': 0,
 'simple': 1,
 'taking': 2,
 'sooner': 3,
 'timeless': 4,
 'subtlety': 5,
 'sleepwalk': 6,
 'ostentatious': 7,
 'shtick': 8,
 'snow': 9,
 'with': 10,
 'sadistic': 11,
 'koury': 12,
 'haneke': 13,
 'rachel': 14,
 'loathe': 15,
 'harvests': 16,
 'whine': 17,
 'nap': 18,
 'simply': 19,
 'button': 20,
 'que': 21,
 'büttner': 22,
 'progression': 23,
 'enriches': 24,
 'bleakly': 25,
 'brosnan': 26,
 'sending': 27,
 'dodgy': 28,
 'alarms': 29,
 'conniving': 30,
 'deficiency': 31,
 'appears': 32,
 'image': 33,
 'breeziness': 34,
 'preoccupied': 35,
 'directed': 36,
 'underlay': 37,
 'midlife': 38,
 'indomitable': 39,
 'psychologizing': 40,
 'glitter': 41,
 'points': 42,
 'duck': 43,
 'induced': 44,
 'realistically': 45,
 'holiday': 46,
 'struggling': 47,
 'contain': 48,
 'tenderly': 49,
 'star': 50,
 'jeopardy': 51,
 'idiots': 52,
 'schaeffer': 53,
 'meyer': 54,
 'claw': 55,
 'goers': 56,
 'playful': 57,
 'triumph': 58,
 'accumulated': 59,
 'judgment': 60,
 'questions': 61

In [15]:
def create_embedding_matrix(word_to_index, glove_embeddings):
    # Initialize embedding matrix with zeros
    # 50d
    embedding_matrix = np.zeros((len(vocab), 200), dtype='float64')
    
    # Loop thru each word in vocab
    for word, idx in word_to_index.items():
        # Check if word exists in glove embeddings
        if word in glove_embeddings:
            # Copy glove embedding to embedding matrix
            embedding_matrix[idx] = glove_embeddings[word]
            # If OOV, assign None first
            
    return embedding_matrix

In [16]:
embedding_matrix = create_embedding_matrix(vocab_word_to_index, glove_embeddings)

In [17]:
embedding_matrix

array([[ 0.25093  ,  0.83451  ,  0.25677  , ...,  0.31425  , -0.24449  ,
        -0.0023992],
       [ 0.57959  ,  0.14576  ,  0.32607  , ...,  0.050995 , -0.24176  ,
        -0.1596   ],
       [-0.049447 ,  0.14972  , -0.2371   , ..., -0.11361  ,  0.048788 ,
        -0.19525  ],
       ...,
       [ 0.51337  , -0.081174 ,  0.044873 , ...,  0.25791  ,  0.15017  ,
         0.27543  ],
       [ 0.92436  , -0.15478  , -0.49867  , ..., -0.067343 ,  0.44588  ,
         0.060262 ],
       [-0.38574  ,  0.77289  , -0.22022  , ..., -0.5598   ,  0.23399  ,
         0.16738  ]])

## (b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

In [18]:
def get_oov_words(embedding_matrix, vocab_word_to_index):
    oov_words = []
    
    for word, idx in vocab_word_to_index.items():
        # Check if zero matrix
        if np.sum(embedding_matrix[idx]) == 0:
            oov_words.append(word)

    return oov_words

In [19]:
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

In [20]:
oov_words.sort()
oov_words[:200]

['1899.',
 '1952.',
 '1975.',
 '1995.',
 '2002.',
 '28k',
 '5ths',
 '8217',
 'abandone',
 'aborbing',
 'absolutamente',
 'aburrido',
 'acabamos',
 'accomodates',
 'aceitou',
 'achival',
 'achronological',
 'acidically',
 'acontecimentos',
 'actorious',
 'actorish',
 'actory',
 'actuación',
 'actuada',
 'addessi',
 'adorability',
 'adventues',
 'affirmational',
 "ain't",
 'alientation',
 'allodi',
 'amoses',
 'amusedly',
 'andamento',
 'animé',
 'anteing',
 'apallingly',
 'apesar',
 'aproveitar',
 'aqueles',
 "aren't",
 'arriesgado',
 'artnering',
 'artsploitation',
 'artístico',
 'assistir',
 'atacar',
 'atacarse',
 'atreve',
 'auteil',
 'autocritique',
 'awakener',
 'b+',
 'bazadona',
 'bergmanesque',
 'beseechingly',
 'bibbidy',
 'bierbichler',
 'birot',
 'bizzarre',
 'bjorkness',
 'blighter',
 'blutarsky',
 'bobbidi',
 'bondish',
 'bornin',
 'bottomlessly',
 'bruckheimeresque',
 'brûlée',
 "bull's",
 'burningly',
 'bustingly',
 'butterfingered',
 'cadness',
 "cam'ron",
 'camareras',

In [21]:
print(f"Number of OOV words: {len(oov_words)}")

Number of OOV words: 678


In [22]:
train_dataset_pos

[[('the', 'DT'),
  ('rock', 'NN'),
  ('is', 'VBZ'),
  ('destined', 'VBN'),
  ('to', 'TO'),
  ('be', 'VB'),
  ('the', 'DT'),
  ('21st', 'JJ'),
  ('century', 'NN'),
  ('s', 'VBD'),
  ('new', 'JJ'),
  ('``', '``'),
  ('conan', 'JJ'),
  ('``', '``'),
  ('and', 'CC'),
  ('that', 'IN'),
  ('he', 'PRP'),
  ('s', 'VBZ'),
  ('going', 'VBG'),
  ('to', 'TO'),
  ('make', 'VB'),
  ('a', 'DT'),
  ('splash', 'NN'),
  ('even', 'RB'),
  ('greater', 'JJR'),
  ('than', 'IN'),
  ('arnold', 'RB'),
  ('schwarzenegger', 'NN'),
  (',', ','),
  ('jean', 'JJ'),
  ('claud', 'NN'),
  ('van', 'NN'),
  ('damme', 'NN'),
  ('or', 'CC'),
  ('steven', 'JJ'),
  ('segal', 'NN'),
  ('.', '.')],
 [('the', 'DT'),
  ('gorgeously', 'RB'),
  ('elaborate', 'JJ'),
  ('continuation', 'NN'),
  ('of', 'IN'),
  ('``', '``'),
  ('the', 'DT'),
  ('lord', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('rings', 'NNS'),
  ('``', '``'),
  ('trilogy', 'NN'),
  ('is', 'VBZ'),
  ('so', 'RB'),
  ('huge', 'JJ'),
  ('that', 'IN'),
  ('a', 'DT'),
  

In [23]:
# Find POS tags for OOV words
oov_words_pos = {}
for oov_word in oov_words:
    # Find all possible POS for OOV word
    pos_tags = []
    for sentence_pos in train_dataset_pos:
        for word, pos in sentence_pos:
            if word == oov_word:
                pos_tags.append(pos)
    # # Assign
    oov_words_pos[oov_word] = pos_tags

In [24]:
oov_words_pos

{'1899.': ['CD'],
 '1952.': ['CD'],
 '1975.': ['CD'],
 '1995.': ['CD'],
 '2002.': ['CD', 'CD'],
 '28k': ['CD', 'CD'],
 '5ths': ['CD', 'CD'],
 '8217': ['CD', 'CD'],
 'abandone': ['NN'],
 'aborbing': ['VBG'],
 'absolutamente': ['NN'],
 'aburrido': ['NN'],
 'acabamos': ['NN'],
 'accomodates': ['VBZ'],
 'aceitou': ['NN'],
 'achival': ['JJ'],
 'achronological': ['NN', 'JJ'],
 'acidically': ['RB'],
 'acontecimentos': ['NNS'],
 'actorious': ['JJ'],
 'actorish': ['JJ'],
 'actory': ['JJ'],
 'actuación': ['FW'],
 'actuada': ['NN', 'JJ'],
 'addessi': ['NN', 'RB'],
 'adorability': ['NN', 'NN'],
 'adventues': ['NNS'],
 'affirmational': ['JJ'],
 "ain't": ['IN'],
 'alientation': ['NN', 'NN'],
 'allodi': ['JJ', 'RB'],
 'amoses': ['NNS', 'NNS'],
 'amusedly': ['RB'],
 'andamento': ['NN'],
 'animé': ['JJ', 'NN'],
 'anteing': ['VBG'],
 'apallingly': ['RB'],
 'apesar': ['NN'],
 'aproveitar': ['NN'],
 'aqueles': ['NNS'],
 "aren't": ['JJ'],
 'arriesgado': ['FW'],
 'artnering': ['VBG'],
 'artsploitation': ['N

## (c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.

## Pass 1: Use Stemming to match OOV words of different forms to the same word in GloVe

- Use snowball stemmer, as it is more aggressive than porter stemmer. Handles -ly words better

In [25]:
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()

# from nltk.stem import SnowballStemmer
# ss = SnowballStemmer('english')

from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

# Stem vocab
stemmed_glove_vocab = {ls.stem(word): word for word in glove_embeddings.keys()}

# Stem vocab and OOV words, find same word
def find_substitute_word_stem(oov_word, stemmed_glove_vocab):
    stemmed_oov_word = ls.stem(oov_word)
    if stemmed_oov_word in stemmed_glove_vocab.keys():
        return stemmed_glove_vocab[stemmed_oov_word]
    
    return None

In [26]:
# Copy substitute word embedding to OOV word
for oov_word in oov_words:
    best_substitute_word = find_substitute_word_stem(oov_word, stemmed_glove_vocab)
    if best_substitute_word:
        print(f'OOV word: {oov_word}, substitute word: {best_substitute_word}')
        embedding_matrix[vocab_word_to_index[oov_word]] = glove_embeddings[best_substitute_word]

OOV word: abandone, substitute word: abandonar
OOV word: accomodates, substitute word: accomodative
OOV word: achival, substitute word: acher
OOV word: acidically, substitute word: acidulated
OOV word: actorious, substitute word: actionism
OOV word: actorish, substitute word: actionism
OOV word: actory, substitute word: actionism
OOV word: adorability, substitute word: adag
OOV word: affirmational, substitute word: affirmance
OOV word: alientation, substitute word: aliis
OOV word: allodi, substitute word: allodial
OOV word: amusedly, substitute word: amusa
OOV word: anteing, substitute word: antiseizure
OOV word: apallingly, substitute word: apala
OOV word: aqueles, substitute word: aquel
OOV word: atacar, substitute word: atac
OOV word: awakener, substitute word: awaking
OOV word: beseechingly, substitute word: beseeches
OOV word: birot, substitute word: birote
OOV word: bjorkness, substitute word: bjorke
OOV word: blighter, substitute word: blighting
OOV word: bondish, substitute wor

In [27]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 420


## Pass 2: Use Wordnet synonyms bank based on sentence POS tagging

In [28]:
oov_words_pos

{'1899.': ['CD'],
 '1952.': ['CD'],
 '1975.': ['CD'],
 '1995.': ['CD'],
 '2002.': ['CD', 'CD'],
 '28k': ['CD', 'CD'],
 '5ths': ['CD', 'CD'],
 '8217': ['CD', 'CD'],
 'abandone': ['NN'],
 'aborbing': ['VBG'],
 'absolutamente': ['NN'],
 'aburrido': ['NN'],
 'acabamos': ['NN'],
 'accomodates': ['VBZ'],
 'aceitou': ['NN'],
 'achival': ['JJ'],
 'achronological': ['NN', 'JJ'],
 'acidically': ['RB'],
 'acontecimentos': ['NNS'],
 'actorious': ['JJ'],
 'actorish': ['JJ'],
 'actory': ['JJ'],
 'actuación': ['FW'],
 'actuada': ['NN', 'JJ'],
 'addessi': ['NN', 'RB'],
 'adorability': ['NN', 'NN'],
 'adventues': ['NNS'],
 'affirmational': ['JJ'],
 "ain't": ['IN'],
 'alientation': ['NN', 'NN'],
 'allodi': ['JJ', 'RB'],
 'amoses': ['NNS', 'NNS'],
 'amusedly': ['RB'],
 'andamento': ['NN'],
 'animé': ['JJ', 'NN'],
 'anteing': ['VBG'],
 'apallingly': ['RB'],
 'apesar': ['NN'],
 'aproveitar': ['NN'],
 'aqueles': ['NNS'],
 "aren't": ['JJ'],
 'arriesgado': ['FW'],
 'artnering': ['VBG'],
 'artsploitation': ['N

In [29]:
from nltk.corpus import wordnet as wn

pos_mapping_dict = {'NN':wn.NOUN,
              'JJ':wn.ADJ,
              'VB':wn.VERB,
              'RB':wn.ADV,
              # NLTK does not have wn.ADV_SAT
              }

# Convert oov_words_pos to wordnet POS
def map_pos_list(pos_list, mapping_dict):
    mapped_pos = []
    for pos in pos_list:
        # Find the first matching POS tag in mapping_dict by first letter comparison
        matched_pos = next((mapping_dict[key] for key in mapping_dict if key[0] == pos[0]), None)
        mapped_pos.append(matched_pos)
    return mapped_pos

# Creating the pos list
oov_words_pos_wordnet = {}
for key, pos_list in oov_words_pos.items():
    mapped_pos_list = map_pos_list(pos_list, pos_mapping_dict)
    
    # Remove None from list
    cleaned_list = [pos for pos in mapped_pos_list if pos is not None]
    
    # If all nones, remove whole entry
    if cleaned_list:
        oov_words_pos_wordnet[key] = cleaned_list
    

# Print the new dictionary
for key, value in oov_words_pos_wordnet.items():
    print(f"{key}: {value}")

abandone: ['n']
aborbing: ['v']
absolutamente: ['n']
aburrido: ['n']
acabamos: ['n']
accomodates: ['v']
aceitou: ['n']
achival: ['a']
achronological: ['n', 'a']
acidically: ['r']
acontecimentos: ['n']
actorious: ['a']
actorish: ['a']
actory: ['a']
actuada: ['n', 'a']
addessi: ['n', 'r']
adorability: ['n', 'n']
adventues: ['n']
affirmational: ['a']
alientation: ['n', 'n']
allodi: ['a', 'r']
amoses: ['n', 'n']
amusedly: ['r']
andamento: ['n']
animé: ['a', 'n']
anteing: ['v']
apallingly: ['r']
apesar: ['n']
aproveitar: ['n']
aqueles: ['n']
aren't: ['a']
artnering: ['v']
artsploitation: ['n']
artístico: ['a']
atacar: ['n']
atacarse: ['a']
atreve: ['v']
auteil: ['n', 'a', 'n', 'n']
autocritique: ['n']
awakener: ['n']
b+: ['n']
bazadona: ['n', 'n']
bergmanesque: ['a']
beseechingly: ['r']
bibbidy: ['n']
bierbichler: ['n']
birot: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
bizzarre: ['a']
bjorkness: ['n']
blighter: ['n']
blutarsky: ['n']
bobbidi: ['n']
bondish: ['a']
bornin: [

In [30]:
def find_substitute_wordnet_synonym(oov_word, oov_words_pos_wordnet):
    # Find synonyms for OOV words
    # Retrieve POS tags list
    try:
        pos_list = oov_words_pos_wordnet[oov_word]
    except KeyError:
        return None
    
    # Find number of unique pos, except None
    unique_pos = list(set(pos_list))
    # sort by count
    unique_pos.sort(key=lambda x: pos_list.count(x), reverse=True)
    
    # Loop thru each unique pos, try to find synonyms
    for possible_pos_tag in unique_pos:
        # Get synonyms
        for synset in wn.synsets(oov_word, pos=possible_pos_tag):
            for lemma in synset.lemmas():
                if lemma.name() in glove_embeddings:
                    print(f'OOV word: {oov_word}, synonym: {lemma.name()}')
                    return glove_embeddings[lemma.name()]

In [31]:
for oov_word in oov_words:
    synonym_glove_embedding = find_substitute_wordnet_synonym(oov_word, oov_words_pos_wordnet)
    if synonym_glove_embedding is not None:
        embedding_matrix[vocab_word_to_index[oov_word]] = synonym_glove_embedding

OOV word: juiceless, synonym: dry
OOV word: shmear, synonym: schmear
OOV word: perfervid, synonym: ardent
OOV word: excrescence, synonym: bulge
OOV word: stuffiest, synonym: airless
OOV word: greasiest, synonym: greasy
OOV word: fuddled, synonym: befuddle
OOV word: corniest, synonym: corny


In [32]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 412


## Pass 3: Use Edit Distance to solve misspelled OOV words

In [33]:
from Levenshtein import distance as lev

# Find most similar word for OOV word
def find_substitute_word_edit_dist(oov_word, glove_embeddings):
    # Set to infinity first
    min_dist = float('inf')
    closest_word = None
    
    # Loop thru all words in glove embeddings
    for word in glove_embeddings:
        # Calculate edit distance
        dist = lev(oov_word, word)
        
        # Update if new minimum distance found
        if dist < min_dist:
            min_dist = dist
            closest_word = word
            
    return closest_word, min_dist

In [34]:
min_dist_thresh = 1

for oov_word in oov_words:
    best_substitute_word, min_dist = find_substitute_word_edit_dist(oov_word, glove_embeddings)
    if min_dist <= min_dist_thresh:
        print(f'OOV word: {oov_word}, substitute word: {best_substitute_word}, Distance: {min_dist}')
        # Copy substitute word embedding to OOV word
        embedding_matrix[vocab_word_to_index[oov_word]] = glove_embeddings[best_substitute_word]

OOV word: shrieky, substitute word: shrieks, Distance: 1
OOV word: fun's, substitute word: funds, Distance: 1
OOV word: zillionth, substitute word: millionth, Distance: 1
OOV word: mcklusky, substitute word: mcclusky, Distance: 1
OOV word: bizzarre, substitute word: bizarre, Distance: 1
OOV word: what, substitute word: what, Distance: 1
OOV word: enviará, substitute word: enviar, Distance: 1
OOV word: manipulador, substitute word: manipulator, Distance: 1
OOV word: possui, substitute word: possum, Distance: 1
OOV word: csoka, substitute word: cska, Distance: 1
OOV word: pérdida, substitute word: perdida, Distance: 1
OOV word: slowtime, substitute word: showtime, Distance: 1
OOV word: adventues, substitute word: adventures, Distance: 1
OOV word: retrata, substitute word: retrato, Distance: 1
OOV word: delibrately, substitute word: deliberately, Distance: 1
OOV word: time., substitute word: time, Distance: 1
OOV word: espectáculo, substitute word: espectaculo, Distance: 1
OOV word: skee

# Remaining OOV words

In [35]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 229


## Pass 4: Split words into subwords and average their subword embeddings

In [36]:
# Move split point from left to right of the OOV word
def find_subword(oov_word, glove_embeddings):
    # Usually words dont go beyond 3 splits, but we only try 2 splits, 3 splits might lose too much meaning
    # Different meaning: slappingly -> slap ping ly
    for i in range(1, len(oov_word)):
        left = oov_word[:i]
        right = oov_word[i:]
        
        if left in glove_embeddings and right in glove_embeddings.keys() and wn.synsets(left) and wn.synsets(right):
            return left, right
        
    return None

In [37]:
for oov_word in oov_words:
    subwords_tup = find_subword(oov_word, glove_embeddings)
    if subwords_tup is not None:
        print(f'OOV word: {oov_word}, subwords: {subwords_tup}')
        embeddings = [glove_embeddings[subword] for subword in subwords_tup]
        avg_embeddings = np.mean(embeddings, axis=0)
        embedding_matrix[vocab_word_to_index[oov_word]] = avg_embeddings

OOV word: landbound, subwords: ('land', 'bound')
OOV word: komediant, subwords: ('ko', 'mediant')
OOV word: interspliced, subwords: ('inter', 'spliced')
OOV word: wisegirls, subwords: ('wise', 'girls')
OOV word: unencouraging, subwords: ('un', 'encouraging')
OOV word: dogwalker, subwords: ('dog', 'walker')
OOV word: overmanipulative, subwords: ('over', 'manipulative')
OOV word: nutjob, subwords: ('nut', 'job')
OOV word: waydowntown, subwords: ('way', 'downtown')
OOV word: superada, subwords: ('super', 'ada')
OOV word: decasia, subwords: ('dec', 'asia')
OOV word: monkeyfun, subwords: ('monkey', 'fun')
OOV word: incomplacency, subwords: ('in', 'complacency')
OOV word: surehanded, subwords: ('sure', 'handed')
OOV word: unconned, subwords: ('un', 'conned')
OOV word: plushies, subwords: ('plush', 'ies')
OOV word: swashbuck, subwords: ('swash', 'buck')
OOV word: hobnail, subwords: ('hob', 'nail')
OOV word: timewaster, subwords: ('time', 'waster')
OOV word: pokepie, subwords: ('poke', 'pie')


# Remaining OOV words

In [38]:
# Print remaining OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)

print(f"Number of OOV words left: {len(oov_words)}")

Number of OOV words left: 177


In [39]:
oov_words

['aburrido',
 'evolução',
 'heremakono',
 'sophomoronic',
 'bazadona',
 'começamos',
 'enfrentados',
 'movilizador',
 'addessi',
 'contrária',
 'janklowicz',
 'direção',
 'higuchinsky',
 'dudsville',
 'diferença',
 'wewannour',
 'bierbichler',
 'pretenciosas',
 'inquestionável',
 'esteticamente',
 'thekids',
 'splatterfests',
 'necessidade',
 'preciosista',
 'prechewed',
 'orquídeas',
 'strainingly',
 'mediocridade',
 'projeção',
 'estafeta',
 'schneidermeister',
 'outing—the',
 'koshashvili',
 'artsploitation',
 'splittingly',
 'unrecommendable',
 'entretenida',
 'feardotcom',
 'revigorates',
 'desaponta',
 'nuttgens',
 'lástima',
 'esforço',
 'ryanovich',
 'emocionante',
 'silléified',
 "stalk'n'slash",
 'sorprenderá',
 'blutarsky',
 'moinoise',
 'responsável',
 'prejuicios',
 'sychowski',
 'unfakable',
 'retadora',
 'conmovedora',
 'kaputschnik',
 'talancón',
 'profundamente',
 'runteldat',
 'prewarned',
 'choquart',
 'gabbiest',
 'espetáculo',
 'entreter',
 'provocatuers',
 'try—bu

## Last pass: Replace OOV words with <UNK> token

In [40]:
# Get min max range of glove embeddings
all_embeddings = np.stack(list(glove_embeddings.values()))
min_val = np.min(all_embeddings)
max_val = np.max(all_embeddings)

print(f"min: {min_val}")
print(f"max: {max_val}")

mean_embedding = np.mean(all_embeddings)
std_embedding = np.std(all_embeddings)

print(f"Mean: {mean_embedding}, Standard Deviation: {std_embedding}")

min: -4.1831
max: 4.2165
Mean: -0.008671859820026773, Standard Deviation: 0.3818620572721229


In [41]:
unk_token = '<UNK>'
# Random embedding for <UNK> token
embedding_dim = 200
unk_embedding = np.random.uniform(-0.25, 0.25, embedding_dim)

# Assign <UNK> token embedding to OOV words
oov_words = get_oov_words(embedding_matrix, vocab_word_to_index)
for oov_word in oov_words:
    ## Use a single random embedding for all OOV words
    embedding_matrix[vocab_word_to_index[oov_word]] = unk_embedding
    ## Zero embedding
    # embedding_matrix[vocab_word_to_index[oov_word]] = np.zeros(embedding_dim)
    
# Add <UNK> token to vocab and embedding matrix
vocab_word_to_index[unk_token] = len(vocab)
embedding_matrix = np.vstack([embedding_matrix, unk_embedding])

In [42]:
# embedding_matrix[17841]

## Alternative methods to test
- semantic similarity with cosine similarity
- use model to predict oov embeddings Mimick https://github.com/yuvalpinter/Mimick


# Finally, save embedding matrix and vocab_to_index mapping

In [43]:
import pickle


with open('augmented_embedding_matrix_200d.pkl', 'wb') as f:
    pickle.dump(embedding_matrix, f)
    
with open('augmented_vocab_word_to_index_200d.pkl', 'wb') as f:
    pickle.dump(vocab_word_to_index, f)