In [2]:
#!python DANmodels.py

In [3]:
# Torch and related PyTorch libraries
import torch
from torch import nn  # Neural networks
import torch.nn.functional as F  # Functional API
from torch.utils.data import Dataset, DataLoader  # Data utilities

# NLP Preprocessing Libraries (nltk)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # Tokenizers
from nltk.corpus import stopwords  # Stop words
from nltk.stem import PorterStemmer  # Stemmer


# Utility and Helper Libraries
from typing import List  # Type hinting
import collections
from collections import Counter, defaultdict  # Counting elements
import re  # Regular expressions
import numpy as np  # Numerical computations
import time  # Time tracking
import argparse  # Argument parsing
import matplotlib.pyplot as plt  # Plotting
from torch.nn.utils.rnn import pad_sequence
import string


In [4]:
class SentimentExample:
    """
    A single labelled sentence for sentiment classification.

    Attributes:
        words (List[str]): the tokens of the sentence
        label (int): 0 for negative, 1 for positive
    """

    def __init__(self, words, label):
        self.words, self.label = words, label

    def __repr__(self):
        # Rendered as "<repr of words>; label=<repr of label>".
        return f"{self.words!r}; label={self.label!r}"

    def __str__(self):
        # The printable form is the same as the repr.
        return self.__repr__()

In [7]:
def read_sentiment_examples(infile: str) -> List[SentimentExample]:
    """
    Reads sentiment examples in the format [0 or 1]<TAB>[raw sentence]; tokenizes and cleans the sentences and forms
    SentimentExamples.

    Sentences are lowercased. This is because the GloVe embeddings don't
    distinguish case and so can only be used with lowercasing.

    Blank lines are skipped. A line that does not contain exactly one tab is split on
    whitespace instead: its first token is taken as the label and the rest as the sentence.

    :param infile: file to read from
    :return: a list of SentimentExamples parsed from the file
    """
    exs = []
    # The context manager closes the file even when parsing raises part-way through.
    with open(infile) as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                # Fallback for lines without exactly one tab separator.
                fields = line.split()
                sent = " ".join(fields[1:]).lower()
            else:
                sent = fields[1].lower()
            # Slightly more robust to reading bad output than int(fields[0])
            label = 0 if "0" in fields[0] else 1
            # Split on single spaces and drop the empty strings left by repeated spaces.
            tokenized_cleaned_sent = [tok for tok in sent.rstrip().split(" ") if tok != '']
            exs.append(SentimentExample(tokenized_cleaned_sent, label))
    return exs

In [9]:
# Download the NLTK 'punkt_tab' resource.
# NOTE(review): presumably the tokenizer data needed by word_tokenize / sent_tokenize — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
def get_stats(vocab):
    """
    Count adjacent symbol pairs across a BPE vocabulary.

    :param vocab: mapping from a space-separated symbol sequence to its frequency
    :return: defaultdict mapping (left_symbol, right_symbol) tuples to the summed
        frequency of every entry in which the two symbols appear next to each other
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """
    Merge every occurrence of a symbol pair into a single symbol.

    :param pair: tuple (left_symbol, right_symbol) to fuse
    :param v_in: mapping from a space-separated symbol sequence to its frequency
    :return: new dict with the same frequencies, where each whole-symbol occurrence of
        "left right" in a key has been replaced by "leftright"
    """
    v_out = {}
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds ensure the match starts and ends on symbol boundaries (whitespace or string edge).
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        # Use a function replacement so the merged symbol is inserted literally.
        # A plain string would be treated as a replacement template, where backslashes
        # (e.g. from tokens like "writer\/director") are interpreted as escapes.
        w_out = p.sub(lambda _match: merged, word)
        v_out[w_out] = v_in[word]
    return v_out

# Number of merge operations
# num_merges = 10

def initialize_vocab(infile):
    """
    Build the initial character-level BPE vocabulary from a sentiment data file.

    Each word read via read_sentiment_examples is turned into its characters separated
    by spaces, followed by the end-of-word marker '</w>'.

    :param infile: path of the file passed to read_sentiment_examples
    :return: defaultdict mapping each spaced-out word to the number of times it occurs
    """
    word_counts = defaultdict(int)
    for example in read_sentiment_examples(infile):
        for token in example.words:
            # e.g. "low" -> "l o w </w>"
            word_counts[' '.join(token) + ' </w>'] += 1
    return word_counts

def perform_bpe_merges(num_merges, vocab):
    """
    Apply up to num_merges byte-pair-encoding merge steps to a vocabulary.

    Each step counts adjacent symbol pairs with get_stats, picks the most frequent pair
    and fuses it everywhere with merge_vocab.

    :param num_merges: maximum number of merge operations to perform
    :param vocab: mapping from a space-separated symbol sequence to its frequency
    :return: the vocabulary after merging (the input object itself if no merge was made)
    """
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            # Every entry is already a single symbol (or the vocabulary is empty);
            # max() on an empty mapping would raise ValueError, so stop early.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

def get_subword_vocab(bpe_vocab):
    """
    Collect subword frequencies from a BPE vocabulary.

    :param bpe_vocab: mapping from a space-separated symbol sequence to its frequency
    :return: defaultdict mapping each subword (with the '</w>' marker stripped) to the
        summed frequency of the entries containing it; always contains "UNK" with count 0
    """
    counts = defaultdict(int)
    # Reserve an entry for the unknown-token placeholder.
    counts["UNK"] = 0
    for entry, freq in bpe_vocab.items():
        # Strip the end-of-word marker before splitting into subwords.
        for piece in entry.replace('</w>', '').split():
            counts[piece] += freq
    return counts

def build_bpe_subword_vocab(infile, merges):
    """
    Build a BPE subword vocabulary from a data file and print before/after statistics.

    :param infile: path of the data file passed to initialize_vocab
    :param merges: number of merge operations passed to perform_bpe_merges
    :return: the subword vocabulary (from get_subword_vocab) computed after merging
    """
    def _report(banner, words, subwords):
        # Print the banner followed by the two vocabulary sizes.
        print(banner)
        print('\tWord-level vocab size: ', len(words))
        print('\tSubword-level vocab size: ', len(subwords))

    initial_words = initialize_vocab(infile)
    _report('Before running BPE algorithm-------', initial_words, get_subword_vocab(initial_words))

    merged_words = perform_bpe_merges(merges, initial_words)
    merged_subwords = get_subword_vocab(merged_words)
    _report('After running BPE algorithm--------', merged_words, merged_subwords)

    # Ratio of word-level entries to subword-level entries after merging.
    print('\nCompression Ratio: ', len(merged_words)/len(merged_subwords))

    return merged_subwords

def tokenize_bpe(text, vocab):
    """
    Split text into subwords by greedy longest-match against a BPE vocabulary.

    :param text: input string; words are separated by whitespace
    :param vocab: container of known subwords (only membership is tested)
    :return: a single string with all subwords separated by single spaces; characters
        not covered by any vocabulary entry are emitted one at a time
    """
    segmented_words = []

    for token in text.split():
        pieces = []
        start, length = 0, len(token)
        while start < length:
            # Try candidate end positions from the longest span down to a single character.
            stop = next(
                (end for end in range(length, start, -1) if token[start:end] in vocab),
                None,
            )
            if stop is None:
                # Nothing in the vocabulary matches here: fall back to the bare character.
                stop = start + 1
            pieces.append(token[start:stop])
            start = stop
        segmented_words.append(' '.join(pieces))

    return ' '.join(segmented_words)

In [15]:
# Build the BPE subword vocabulary from the training file, requesting 40000 merge operations.
myvocab = build_bpe_subword_vocab('data/train.txt', 40000)

Before running BPE algorithm-------
	Word-level vocab size:  14830
	Subword-level vocab size:  65
After running BPE algorithm--------
	Word-level vocab size:  14830
	Subword-level vocab size:  10401

Compression Ratio:  1.4258244399576965


In [31]:
# Print every entry of myvocab as "<running index> <subword> <count>".
counter=0
for word, freq in myvocab.items():
    # `word` is a subword key of the vocabulary, `freq` its accumulated count.
    print(counter, word, freq)
    counter += 1

0 the 5998
1 rock 23
2 is 2266
3 dest 16
4 ined 33
5 to 2506
6 be 665
7 21 6
8 st 218
9 century 13
10 's 2025
11 new 149
12 `` 210
13 con 93
14 an 1163
15 '' 209
16 and 3889
17 that 1613
18 he 232
19 going 62
20 make 157
21 a 4766
22 spl 20
23 ash 74
24 even 227
25 great 103
26 er 408
27 than 400
28 ar 269
29 n 282
30 old 76
31 sch 66
32 war 77
33 z 148
34 ene 5
35 g 281
36 ger 47
37 , 5885
38 je 17
39 an- 17
40 cl 107
41 au 70
42 d 444
43 v 195
44 dam 17
45 me 160
46 or 535
47 steven 16
48 se 138
49 al 329
50 . 6610
51 gor 16
52 ge 103
53 ously 55
54 el 247
55 abor 12
56 ate 162
57 continu 7
58 ation 146
59 of 3658
60 l 270
61 ord 14
62 r 257
63 ings 64
64 tr 82
65 il 111
66 og 20
67 y 381
68 so 350
69 huge 15
70 col 32
71 um 126
72 words 20
73 can 225
74 not 460
75 adequ 10
76 ately 36
77 descri 12
78 co- 14
79 writer\/director 11
80 peter 13
81 jackson 17
82 exp 15
83 ed 490
84 vision 23
85 j 140
86 r. 6
87 tol 10
88 ki 26
89 en 246
90 middle- 13
91 earth 12
92 sing 35
93 er\/ 3
94 

In [7]:
# Run DANmodels.py with --model 1a; per its printed banner this compares networks with
# 2 fully connected layers plus embedding and dropout layers.
!python DANmodels.py --model 1a

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Read in 14923 vectors of size 300
Read in 14923 vectors of size 300
Data loaded in : 1.0160441398620605 seconds

-----------------------------------------------------------------------------------------------------------

Comparing performance of Neural Networks with 2 fully connected layers, and embedding, and dropout layers

Using Negative log likelihood loss function with ReLu
Epoch #5: train accuracy 0.828, test accuracy 0.800
Epoch #10: train accuracy 0.899, test accuracy 0.814
Epoch #15: train accuracy 0.935, test accuracy 0.814
Epoch #20: train accuracy 0.962, test accuracy 0.798
Epoch #25: train accuracy 0.978, test accuracy 0.814
Epoch #30: train accuracy 0.988, test accuracy 0.800
Epoch #35: train accuracy 0.991, test accuracy 0.811
Epoch #40: train accuracy 0.993, test accuracy 0.804
Epoch #45: train accuracy 0.996, test accurac

In [11]:
# Run DANmodels.py with --model 1b; per its printed banner this compares fine-tuning
# pretrained GloVe embeddings against training random embeddings from scratch.
!python DANmodels.py --model 1b

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

-----------------------------------------------------------------------------------------------------------

Comparing performance of the 3 Layer NN with fine-tuning pretrained GloVe embeddings vs training Random Embeddings from scratch
Read in 14923 vectors of size 300
Read in 14923 vectors of size 300
Data loaded in : 1.0373790264129639 seconds

Training and Evaluating a 3 Layer Neural Network on Pre-trained GloVe Embeddings
Epoch #5: train accuracy 0.862, test accuracy 0.813
Epoch #10: train accuracy 0.927, test accuracy 0.817
Epoch #15: train accuracy 0.957, test accuracy 0.811
Epoch #20: train accuracy 0.967, test accuracy 0.808
Epoch #25: train accuracy 0.970, test accuracy 0.811
Epoch #30: train accuracy 0.973, test accuracy 0.799
Epoch #35: train accuracy 0.974, test accuracy 0.799
Epoch #40: train accuracy 0.976, test accuracy 0.

In [13]:
# Run DANmodels.py with --model 2a; per its printed banner this is the Deep Averaging
# Network with subword tokenization using Byte Pair Encoding.
!python DANmodels.py --model 2a

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

-----------------------------------------------------------------------------------------------------------
Deep Averaging Network (DAN) with subword tokenization using Byte Pair Encoding:
----Loading Training Data----
Before running BPE algorithm-------
	Word-level vocab size:  14830
	Subword-level vocab size:  64
After running BPE algorithm--------
	Word-level vocab size:  14830
	Subword-level vocab size:  471

Compression Ratio:  31.48619957537155
----Loading Test Data--------
Before running BPE algorithm-------
	Word-level vocab size:  4339
	Subword-level vocab size:  53
After running BPE algorithm--------
	Word-level vocab size:  4339
	Subword-level vocab size:  464

Compression Ratio:  9.351293103448276

Data loaded in : 12.588464736938477 seconds

Training (from scratch) and Evaluating a 3 Layer Neural Network using BPE Subword-Lev