In [1]:
from google.colab import drive
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [2]:
import sys
sys.path.append("/content/gdrive/My Drive/nlu-project")

In [3]:
import nltk
from nltk.corpus import movie_reviews

In [47]:
nltk.download("punkt")
nltk.download("movie_reviews")
nltk.download("subjectivity")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

# Subjectivity data exploration

In [23]:
from nltk.corpus import subjectivity


subj = [sent for sent in subjectivity.sents(categories = 'subj')]
obj = [sent for sent in subjectivity.sents(categories = 'obj')]
corpus = subj + obj

In [24]:
print(subj[0])
print(obj[0])

['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.']
['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.']


In [25]:
print(len(obj))
print(len(subj))

5000
5000


In [26]:
def compute_average_sentence_length(corpus):
    lengths = []
    for sent in corpus:
        lengths.append(len(sent))
    return sum(lengths)/len(lengths)

compute_average_sentence_length(corpus)

24.0576

In [27]:
def create_vocab(corpus):
  corpus_words = get_corpus_words(corpus)
  vocab = dict()
  for word in corpus_words:
    try:
      vocab[word] += 1
    except:
      vocab[word] = 1
  return vocab

def get_corpus_words(corpus):
    return [w for sent in corpus for w in sent]

In [28]:
import operator
from tqdm import tqdm
from torchtext.vocab import GloVe
from torchtext.vocab import FastText
import torch

# function inspired by https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook
def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    null_embedding = torch.tensor([0.0]*300)
    for word in tqdm(vocab):
        try:
          if torch.equal(embeddings_index.get_vecs_by_tokens(word), null_embedding):
            raise KeyError
          a[word] = embeddings_index.get_vecs_by_tokens(word)
          k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print()
    print(f'Found embeddings for {len(a) / len(vocab):.2%} of vocab')
    print(f'Found embeddings for  {k / (k + i):.2%} of all text')
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [None]:
# Change with your path if you have already downloaded embeddings
fast_text = FastText('en', cache = "/content/gdrive/My Drive/nlu-project/Embeddings/.vector_cache")
global_vectors = GloVe(name='840B', dim=300, cache = "/content/gdrive/My Drive/nlu-project/Embeddings/.vector_cache")

In [29]:
vocab = create_vocab(corpus)
oov = check_coverage(vocab, global_vectors)
print(oov)

100%|██████████| 23906/23906 [00:03<00:00, 6621.54it/s]


Found embeddings for 85.93% of vocab
Found embeddings for  97.65% of all text
[("there's", 128), ("he's", 127), ("isn't", 124), ("film's", 71), ("she's", 61), ("won't", 41), ("they're", 37), ("what's", 34), ('&#38', 32), ("who's", 32), ("you've", 30), ("movie's", 30), ("man's", 28), ("world's", 28), ("i'm", 26), ("'the", 25), ("you'd", 23), ("father's", 22), ("family's", 21), ("hasn't", 20), ("couldn't", 19), ("year's", 17), ("mother's", 16), ("aren't", 15), ("women's", 14), ("city's", 13), ("they've", 13), ('Â\x96', 13), ("wouldn't", 12), ("woman's", 12), ("we've", 12), ("children's", 11), ("life's", 11), ("haven't", 11), ("director's", 11), ("one's", 11), ('fianc&#233', 10), ("everyone's", 10), ("america's", 10), ("wasn't", 10), ("i've", 10), ('[the', 10), ("shouldn't", 10), ('rupi', 9), ("he'll", 9), ("filmmaker's", 9), ("we're", 9), ("york's", 8), ("disney's", 8), ('clichÃ©s', 8), ("today's", 8), ('windtalkers', 8), ('t&#252', 7), ('ghisu', 7), ("writer's", 7), ("i'll", 7), ("char




In [30]:
oov = check_coverage(vocab, fast_text)
print(oov)

100%|██████████| 23906/23906 [00:01<00:00, 19167.46it/s]


Found embeddings for 81.73% of vocab
Found embeddings for  95.94% of all text
[("it's", 636), ('--', 345), (';', 344), (':', 253), ("doesn't", 176), ("there's", 128), ("he's", 127), ("isn't", 124), ("can't", 111), ("that's", 104), ("don't", 85), ("film's", 71), ("you're", 67), ("she's", 61), ("you'll", 46), ("won't", 41), ("they're", 37), ("what's", 34), ('&#38', 32), ("who's", 32), ("didn't", 32), ("you've", 30), ("movie's", 30), ('2', 29), ("man's", 28), ("world's", 28), ("i'm", 26), ("'the", 25), ('10', 23), ("you'd", 23), ('writer-director', 23), ("father's", 22), ("family's", 21), ('2002', 21), ("hasn't", 20), ('000', 19), ('coming-of-age', 19), ("couldn't", 19), ("year's", 17), ('20', 17), ("mother's", 16), ('*', 15), ("aren't", 15), ('30', 14), ('11', 14), ('thought-provoking', 14), ("women's", 14), ("city's", 13), ("they've", 13), ('5', 13), ('90', 13), ('old-fashioned', 13), ('Â\x96', 13), ('12', 12), ('2001', 12), ("wouldn't", 12), ("woman's", 12), ("we've", 12), ("children'




It can be seen that a lot of out-of-vocabulary words are the ones having contractions. So I am going to remove contractions using a pre-determined contraction map correspondance:

In [43]:
CONTRACTION_MAP =  {"ain't": "is not",
                        "aren't": "are not",
                        "can't": "cannot",
                        "can't've": "cannot have",
                        "'cause": "because",
                        "could've": "could have",
                        "couldn't": "could not",
                        "couldn't've": "could not have",
                        "didn't": "did not",
                        "doesn't": "does not",
                        "don't": "do not",
                        "hadn't": "had not",
                        "hadn't've": "had not have",
                        "hasn't": "has not",
                        "haven't": "have not",
                        "he'd": "he would",
                        "he'd've": "he would have",
                        "he'll": "he will",
                        "he'll've": "he he will have",
                        "he's": "he is",
                        "how'd": "how did",
                        "how'd'y": "how do you",
                        "how'll": "how will",
                        "how's": "how is",
                        "i'd": "i would",
                        "i'd've": "i would have",
                        "i'll": "i will",
                        "i'll've": "i will have",
                        "i'm": "i am",
                        "i've": "i have",
                        "isn't": "is not",
                        "it'd": "it would",
                        "it'd've": "it would have",
                        "it'll": "it will",
                        "it'll've": "it will have",
                        "it's": "it is",
                        "let's": "let us",
                        "ma'am": "madam",
                        "mayn't": "may not",
                        "might've": "might have",
                        "mightn't": "might not",
                        "mightn't've": "might not have",
                        "must've": "must have",
                        "mustn't": "must not",
                        "mustn't've": "must not have",
                        "needn't": "need not",
                        "needn't've": "need not have",
                        "o'clock": "of the clock",
                        "oughtn't": "ought not",
                        "oughtn't've": "ought not have",
                        "shan't": "shall not",
                        "sha'n't": "shall not",
                        "shan't've": "shall not have",
                        "she'd": "she would",
                        "she'd've": "she would have",
                        "she'll": "she will",
                        "she'll've": "she will have",
                        "she's": "she is",
                        "should've": "should have",
                        "shouldn't": "should not",
                        "shouldn't've": "should not have",
                        "so've": "so have",
                        "so's": "so as",
                        "that'd": "that would",
                        "that'd've": "that would have",
                        "that's": "that is",
                        "there'd": "there would",
                        "there'd've": "there would have",
                        "there's": "there is",
                        "they'd": "they would",
                        "they'd've": "they would have",
                        "they'll": "they will",
                        "they'll've": "they will have",
                        "they're": "they are",
                        "they've": "they have",
                        "to've": "to have",
                        "wasn't": "was not",
                        "we'd": "we would",
                        "we'd've": "we would have",
                        "we'll": "we will",
                        "we'll've": "we will have",
                        "we're": "we are",
                        "we've": "we have",
                        "weren't": "were not",
                        "what'll": "what will",
                        "what'll've": "what will have",
                        "what're": "what are",
                        "what's": "what is",
                        "what've": "what have",
                        "when's": "when is",
                        "when've": "when have",
                        "where'd": "where did",
                        "where's": "where is",
                        "where've": "where have",
                        "who'll": "who will",
                        "who'll've": "who will have",
                        "who's": "who is",
                        "who've": "who have",
                        "why's": "why is",
                        "why've": "why have",
                        "will've": "will have",
                        "won't": "will not",
                        "won't've": "will not have",
                        "would've": "would have",
                        "wouldn't": "would not",
                        "wouldn't've": "would not have",
                        "y'all": "you all",
                        "y'all'd": "you all would",
                        "y'all'd've": "you all would have",
                        "y'all're": "you all are",
                        "y'all've": "you all have",
                        "you'd": "you would",
                        "you'd've": "you would have",
                        "you'll": "you will",
                        "you'll've": "you will have",
                        "you're": "you are",
                        "you've": "you have",
                    }

def clean_contractions(corpus):
  """
  Parameters
  ----------
  corpus : list of list of list

  Returns
  -------
  list of list
    Formatted text where contractions are merged into one single word

  """
  new_corpus = []
  for doc in corpus:
    new_doc = []
    for word in doc:
      try:
          correct = CONTRACTION_MAP[word]
          correct = correct.split()
          new_doc += correct
      except:
          new_doc.append(word)
    new_corpus.append(new_doc)
  return new_corpus


In [44]:
corpus = clean_contractions(corpus)
vocab = create_vocab(corpus)
oov = check_coverage(vocab, global_vectors)
print(oov)

100%|██████████| 23852/23852 [00:01<00:00, 20416.92it/s]



Found embeddings for 86.09% of vocab
Found embeddings for  98.03% of all text
[("film's", 71), ('&#38', 32), ("movie's", 30), ("man's", 28), ("world's", 28), ("'the", 25), ("father's", 22), ("family's", 21), ("year's", 17), ("mother's", 16), ("women's", 14), ("city's", 13), ('Â\x96', 13), ("woman's", 12), ("children's", 11), ("life's", 11), ("director's", 11), ("one's", 11), ('fianc&#233', 10), ("everyone's", 10), ("america's", 10), ('[the', 10), ('rupi', 9), ("filmmaker's", 9), ("york's", 8), ("disney's", 8), ('clichÃ©s', 8), ("today's", 8), ('windtalkers', 8), ('t&#252', 7), ('ghisu', 7), ("writer's", 7), ("characters'", 7), ('clichÃ©', 7), ("nete's", 6), ("jack's", 6), ('egoyan', 6), ("here's", 6), ("girl's", 6), ('Â\x97', 6), ('nÃ£o', 6), ('chabrol', 6), ("character's", 6), ("soderbergh's", 6), ('is]', 6), ("son's", 6), ("hoffman's", 6), ('nickleby', 6), ('waydowntown', 6), ("moore's", 6), ("sarah's", 5), ("tiz's", 5), ("1970's", 5), ('barrillo', 5), ("wife's", 5), ('t-mon', 5), (

In [45]:
# Fast text
oov = check_coverage(vocab, fast_text)
print(oov)

100%|██████████| 23852/23852 [00:01<00:00, 17438.05it/s]



Found embeddings for 81.91% of vocab
Found embeddings for  96.85% of all text
[('--', 345), (';', 344), (':', 253), ("film's", 71), ('&#38', 32), ("movie's", 30), ('2', 29), ("man's", 28), ("world's", 28), ("'the", 25), ('10', 23), ('writer-director', 23), ("father's", 22), ("family's", 21), ('2002', 21), ('000', 19), ('coming-of-age', 19), ("year's", 17), ('20', 17), ("mother's", 16), ('*', 15), ('30', 14), ('11', 14), ('thought-provoking', 14), ("women's", 14), ("city's", 13), ('5', 13), ('90', 13), ('old-fashioned', 13), ('Â\x96', 13), ('12', 12), ('2001', 12), ("woman's", 12), ("children's", 11), ("life's", 11), ("director's", 11), ("'70s", 11), ("one's", 11), ('fianc&#233', 10), ("everyone's", 10), ("america's", 10), ('15', 10), ('well-acted', 10), ('real-life', 10), ('[the', 10), ('sci-fi', 10), ('hip-hop', 10), ("filmmaker's", 9), ('low-budget', 9), ('middle-aged', 9), ('spider-man', 9), ("york's", 8), ('40', 8), ("'s", 8), ('80', 8), ('1', 8), ("disney's", 8), ('clichÃ©s', 8),

A slight improvement can be appreciated, but a lot of words cannot be separated by the contraction because they are not inside the contraction map dictionary and are followed by "'s". This cannot be easly separated because "'s" in english can have different meaning (e.g. is or has?).

# Polarity data exploration

In [48]:
mr = movie_reviews
neg = mr.paras(categories = "neg")
pos = mr.paras(categories = "pos")
print(f"length of each part of the dataset:\n - pos: {len(pos)} \n - neg: {len(neg)}\n")
print(pos[0])
corpus = pos + neg

length of each part of the dataset:
 - pos: 1000 
 - neg: 1000

[['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(', 'casper', ')', 'or', 'the', 'arthouse', 'crowd', '(', 'ghost', 'world', ')', ',', 'but', 'there', "'", 's', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '.'], ['for', 'starters', ',', 'it', 'was', 'created', 'by', 'alan', 'moore', '(', 'and', 'eddie', 'campbell', ')', ',', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', "'", '80s', 'with', 'a', '12', '-', 'part', 'series', 'called', 'the', 'watchmen', '.'], ['to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'l

In [49]:
def compute_average_document_length(corpus):
    lengths = []
    for doc in corpus:
        lengths.append(len(doc))
    return sum(lengths)/len(lengths)

compute_average_document_length(corpus)

32.629

In [50]:
def compute_average_sentence_length(corpus):
    lengths = []
    for doc in corpus:
        for sent in doc:
            lengths.append(len(sent))
    return sum(lengths)/len(lengths)

compute_average_sentence_length(corpus)

24.270127800422937

In [51]:
# redefinition to account for documents
def get_corpus_words(corpus):
    return [w for doc in corpus for sent in doc for w in sent]

### Checking coverage of the word embedding

In [52]:
vocab = create_vocab(corpus)
oov = check_coverage(vocab, global_vectors)
print(oov)
print([w[0] for w in oov if "_" in w[0]])

100%|██████████| 39768/39768 [00:01<00:00, 20035.63it/s]



Found embeddings for 91.93% of vocab
Found embeddings for  99.58% of all text
['_the', 'valentine_', '_54_', '_shaft_', '_is_', '_pollock_', '_scream_', '_really_', '_election_', '_vampires_', 'ryan_', '_saving', '_knock_off_', 'floor_', '_beloved_', '_not_', '_rushmore_', '_daylight_', '_real_', '_soldier_', '_dirty_work_', '_polish_wedding_', '_double_team_', '_last_', '_and_', 'shell_', '_ghost', '_roxbury_', 'samurai_', '_saturday_night_live_', '_more_', '_babe_', '_armageddon_', '_cliffhanger_', '_does_', '_do_', 'runner_', '_gattaca_', '_that_', '_in', 'movie_', '_patlabor', '_onegin_', 'sky_', '_october', '_jerry_maguire_', '_a_night_at_the_roxbury_', '_dead_man_', '_dead_man_on_campus_', '_blade_', 'vampires_', '_john', '_the_', 'floats_', '_hope', '_i_know', '_urban_legend_', '_breakfast_of_champions_', 't_', '_don', 'come_', '_what', '_must_', 'legend_', '_urban', '_blade', 'matrix_', '_unbreakable_', 'central_', '_film', '_seven_nights_', '_six_days', '_brazil_', 'beautiful

In [61]:
oov = check_coverage(vocab, fast_text)
print(oov)
print([w[0] for w in oov if "_" in w[0]])

100%|██████████| 39768/39768 [00:01<00:00, 27768.84it/s]



Found embeddings for 95.11% of vocab
Found embeddings for  98.94% of all text
['valentine_', '_54_', '_shaft_', '_pollock_', '_scream_', '_election_', '_vampires_', 'ryan_', '_saving', '_knock_off_', 'floor_', '_beloved_', '_rushmore_', '_daylight_', '_soldier_', '_dirty_work_', '_polish_wedding_', '_double_team_', '_last_', 'shell_', '_ghost', '_roxbury_', 'samurai_', '_saturday_night_live_', '_babe_', '_armageddon_', '_cliffhanger_', 'runner_', '_gattaca_', 'movie_', '_patlabor', '_onegin_', 'sky_', '_jerry_maguire_', '_a_night_at_the_roxbury_', '_dead_man_', '_dead_man_on_campus_', '_blade_', 'vampires_', 'floats_', '_hope', '_i_know', '_urban_legend_', '_breakfast_of_champions_', 'come_', 'legend_', '_urban', '_blade', '_unbreakable_', 'central_', '_seven_nights_', '_six_days', '_brazil_', 'beautiful_', '_21_jump_street_', '_clueless_', '_snl_', '_saved_by_the_bell_', '_pecker_', '_four_', 'york_', '_escape', '_they', 'darkness_', 'thing_', '_halloween_', '_there_', 'kombat_', '_m

### Removing underscores to improve coverage

Since there are a lot of common words that for some reson are surrunded by underscores, they are not recognized by the "word-embedding lookup table". One example is the word "\_scream\_" which is seen as out-of-vocabulary but indeed it is a know word and underscores don't add any information to it, so I can write a function to get rid of underscores in these words.

In [54]:
def remove_underscores(corpus):
  for doc in corpus:
    for sent in doc:
      for idx, word in enumerate(sent):
        if "_" in word:
          cleaned_word = _clean_word(word)
          if not type(cleaned_word) == list:
            sent[idx] = cleaned_word
  return corpus


def _clean_word(word: str):
  word = word.replace("_", " ")
  # In the case there are multiple words into a single sentence
  word = word.split()
  return word

In [59]:
clean_corpus = remove_underscores(corpus)
new_vocab = create_vocab(get_corpus_words(clean_corpus))
oov = check_coverage(new_vocab, global_vectors)
print(oov)

100%|██████████| 72/72 [00:00<00:00, 20544.93it/s]


Found embeddings for 93.06% of vocab
Found embeddings for  100.00% of all text
[('\x12', 44), ('\x16', 12), ('\x14', 7), ('\x13', 7), ('\x05', 6)]





In [60]:
oov = check_coverage(new_vocab, fast_text)
print(oov)

100%|██████████| 72/72 [00:00<00:00, 29127.11it/s]


Found embeddings for 69.44% of vocab
Found embeddings for  99.65% of all text
[('1', 3288), (':', 3042), ('0', 2928), ('9', 2754), (';', 1850), ('2', 1308), ('*', 1054), ('8', 935), ('7', 815), ('3', 805), ('5', 798), ('4', 649), ('6', 586), ('=', 559), (']', 90), ('[', 90), ('\x12', 44), ('\x16', 12), ('\x14', 7), ('\x13', 7), ('\x05', 6), ('|', 5)]





An improvemente of text coverage can be appreciated in both cases.