In [1]:
import nltk
from nltk.corpus import movie_reviews

In [None]:
# Fetch every NLTK resource this notebook requests, in a fixed order.
for resource_name in (
    "punkt",
    "movie_reviews",
    "subjectivity",
    "stopwords",
    "sentiwordnet",
    "wordnet",
    "omw-1.4",
    'averaged_perceptron_tagger',
    'universal_tagset',
):
    nltk.download(resource_name)

# Subjectivity data exploration

In [17]:
from nltk.corpus import subjectivity


# Materialise both categories of the subjectivity corpus as plain lists of
# sentences, then concatenate them (subjective first) into one corpus.
subj = list(subjectivity.sents(categories='subj'))
obj = list(subjectivity.sents(categories='obj'))
corpus = subj + obj

In [18]:
# Show the first sentence of each category as a quick sanity check.
print(subj[0])
print(obj[0])

['smart',
 'and',
 'alert',
 ',',
 'thirteen',
 'conversations',
 'about',
 'one',
 'thing',
 'is',
 'a',
 'small',
 'gem',
 '.']

In [19]:
# Number of sentences per category (objective first, then subjective).
print(len(obj))
print(len(subj))

5000
5000


In [20]:
def compute_average_sentence_length(corpus):
    """Return the mean `len()` of the items (sentences) in `corpus`.

    `corpus` is iterated exactly once, so any iterable of sized items works.
    Raises ZeroDivisionError when `corpus` yields no items.
    """
    total_length = 0
    n_sentences = 0
    for sentence in corpus:
        total_length += len(sentence)
        n_sentences += 1
    return total_length / n_sentences

# Mean sentence length over the combined subjective + objective corpus.
compute_average_sentence_length(corpus)

24.0576

In [None]:
def create_vocab(corpus):
    """Count how often each word occurs in `corpus`.

    The corpus is flattened with `get_corpus_words`, so the expected nesting
    of `corpus` is whatever that helper accepts. The notebook defines the
    helper twice (sentence-level, later document-level); the definition that
    was executed last is the one used here.

    Returns:
        dict mapping each word to its occurrence count, in first-seen order.
    """
    vocab = dict()
    for word in get_corpus_words(corpus):
        # dict.get avoids the bare `except:` that previously doubled as the
        # "first occurrence" branch and also swallowed unrelated errors.
        vocab[word] = vocab.get(word, 0) + 1
    return vocab

def get_corpus_words(corpus):
    """Flatten a corpus of sentences into a single list of words.

    Every item of `corpus` is treated as an iterable of words; the words are
    returned in corpus order.
    """
    words = []
    for sentence in corpus:
        words.extend(sentence)
    return words

In [None]:
import operator
from tqdm import tqdm
from torchtext.vocab import GloVe
from torchtext.vocab import FastText
import torch

# function inspired by https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook
def check_coverage(vocab, embeddings_index):
    """Report how much of `vocab` has a non-zero vector in `embeddings_index`.

    Args:
        vocab: mapping word -> occurrence count (as built by `create_vocab`).
        embeddings_index: object exposing `get_vecs_by_tokens(word)` that
            returns a tensor for the word. A word whose vector is all zeros
            is treated as out-of-vocabulary. (NOTE(review): this relies on
            the embedding object returning a zero vector for unknown tokens,
            which is torchtext's default `unk_init` -- confirm if a custom
            `unk_init` is ever used.)

    Prints the share of distinct words and the share of all word occurrences
    that are covered, and returns the out-of-vocabulary words.

    Returns:
        list of `(word, count)` tuples for uncovered words, most frequent
        first.
    """
    covered_words = 0   # distinct words that have an embedding
    covered_count = 0   # occurrences of words that have an embedding
    oov_count = 0       # occurrences of words without an embedding
    oov = {}

    for word in tqdm(vocab):
        occurrences = vocab[word]
        # Look the vector up once and test it for "all zeros" directly, so
        # the check works for any embedding dimensionality (the previous
        # comparison against a fixed 300-d zero tensor silently reported
        # every word as covered for embeddings of a different size).
        vector = embeddings_index.get_vecs_by_tokens(word)
        if torch.count_nonzero(vector).item() == 0:
            oov[word] = occurrences
            oov_count += occurrences
        else:
            covered_words += 1
            covered_count += occurrences

    # Guard the ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_count = covered_count + oov_count
    vocab_ratio = covered_words / len(vocab) if len(vocab) else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0

    print()
    print(f'Found embeddings for {vocab_ratio:.2%} of vocab')
    print(f'Found embeddings for  {text_ratio:.2%} of all text')

    # Ascending stable sort followed by a reversal: most frequent first, with
    # the same tie ordering as before.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [None]:
# Load the 300-dimensional GloVe "840B" vectors, using a cache directory on
# what looks like a mounted Google Drive (NOTE(review): absolute path is
# environment-specific -- adjust when running elsewhere).
global_vectors = GloVe(name='840B', dim=300, cache = "/content/gdrive/My Drive/nlu-project/Embeddings/.vector_cache")
# Word counts for the current `corpus`, then the embedding coverage report;
# `oov` holds the uncovered (word, count) pairs.
vocab = create_vocab(corpus)
oov = check_coverage(vocab, global_vectors)

In [None]:
# Load the English FastText vectors from the same cache directory used for
# GloVe. `fast_text` is not used again in the visible part of the notebook.
fast_text = FastText('en', cache = "/content/gdrive/My Drive/nlu-project/Embeddings/.vector_cache")

# Polarity data exploration

In [None]:
# Load the movie-review corpus by category. The later cells iterate each
# item as document -> sentence -> word.
mr = movie_reviews
neg = mr.paras(categories = "neg")
pos = mr.paras(categories = "pos")
print(f"length of each part of the dataset:\n - pos: {len(pos)} \n - neg: {len(neg)}\n")
print(pos[0])
# NOTE: this rebinds `corpus`, which previously held the subjectivity
# sentences; from here on `corpus` is the polarity corpus (positive first).
corpus = pos + neg

In [15]:
def compute_average_document_length(corpus):
    """Return the mean `len()` of the documents in `corpus`.

    `corpus` is iterated exactly once. Raises ZeroDivisionError when it
    yields no documents.
    """
    total_length = 0
    n_documents = 0
    for document in corpus:
        total_length += len(document)
        n_documents += 1
    return total_length / n_documents

# Mean document length (items per document) over the polarity corpus.
compute_average_document_length(corpus)

32.629

In [16]:
def compute_average_sentence_length(corpus):
    """Return the mean sentence length for a corpus of documents.

    Unlike the earlier sentence-level definition of the same name, `corpus`
    here is an iterable of documents, each an iterable of sized sentences.
    Raises ZeroDivisionError when no sentences are found.
    """
    sentence_lengths = [len(sentence) for document in corpus for sentence in document]
    return sum(sentence_lengths) / len(sentence_lengths)

# Mean sentence length across all documents of the polarity corpus.
compute_average_sentence_length(corpus)

24.270127800422937

In [None]:
# redefinition to account for documents
def get_corpus_words(corpus):
    """Flatten a corpus of documents into a single list of words.

    `corpus` is an iterable of documents, each an iterable of sentences,
    each an iterable of words; words are returned in corpus order.
    """
    words = []
    for document in corpus:
        for sentence in document:
            words.extend(sentence)
    return words

### Checking coverage of the word embedding

In [None]:
# Rebuild the vocabulary for the polarity corpus (create_vocab now flattens
# documents via the redefined get_corpus_words) and report GloVe coverage.
vocab = create_vocab(corpus)
oov = check_coverage(vocab, global_vectors)

In [None]:
def remove_underscores(corpus):
    """Return a copy of `corpus` with underscore-joined tokens split apart.

    `corpus` is an iterable of documents, each an iterable of sentences,
    each an iterable of word strings. Every word containing "_" is replaced
    by the parts obtained from `_clean_word`, spliced into the sentence in
    place of the original token (e.g. "a_b" becomes the two tokens "a" and
    "b"). A token made only of underscores yields no parts and is dropped.

    The input is not modified; a new list of lists of lists is returned, so
    the function does not depend on the input supporting item assignment.
    """
    cleaned_corpus = []
    for doc in corpus:
        cleaned_doc = []
        for sent in doc:
            cleaned_sent = []
            for word in sent:
                if "_" in word:
                    # extend (not append): the helper returns a list of
                    # parts, and sentences must stay flat lists of strings.
                    cleaned_sent.extend(_clean_word(word))
                else:
                    cleaned_sent.append(word)
            cleaned_doc.append(cleaned_sent)
        cleaned_corpus.append(cleaned_doc)
    return cleaned_corpus


def _clean_word(word: str) -> list:
    """Split `word` on underscores (and any whitespace), dropping empty parts."""
    return word.replace("_", " ").split()

In [None]:
# Split underscore-joined tokens, then recompute the GloVe coverage.
clean_corpus = remove_underscores(corpus)
# create_vocab flattens the corpus itself via get_corpus_words, so it must be
# given the nested corpus. Passing an already-flattened word list would be
# flattened a second time and count characters instead of words.
vocab = create_vocab(clean_corpus)
oov = check_coverage(vocab, global_vectors)