In [2]:
#!python DANmodels.py

In [3]:
# Torch and related PyTorch libraries
import torch
from torch import nn  # Neural networks
import torch.nn.functional as F  # Functional API
from torch.utils.data import Dataset, DataLoader  # Data utilities

# NLP Preprocessing Libraries (nltk)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # Tokenizers
from nltk.corpus import stopwords  # Stop words
from nltk.stem import PorterStemmer  # Stemmer


# Utility and Helper Libraries
from typing import List  # Type hinting
import collections
from collections import Counter, defaultdict  # Counting elements
import re  # Regular expressions
import numpy as np  # Numerical computations
import time  # Time tracking
import argparse  # Argument parsing
import matplotlib.pyplot as plt  # Plotting
from torch.nn.utils.rnn import pad_sequence
import string


In [4]:
class SentimentExample:
    """
    A single labelled sentence for sentiment classification.

    Attributes:
        words (List[string]): the tokens that make up the sentence
        label (int): sentiment class, 0 for negative and 1 for positive
    """

    def __init__(self, words, label):
        """Keep references to the token list and label; nothing is copied or validated."""
        self.label = label
        self.words = words

    def __repr__(self):
        """Render the example as ``<repr of words>; label=<repr of label>``."""
        return f"{self.words!r}; label={self.label!r}"

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

In [5]:
def read_sentiment_examples(infile: str) -> List[SentimentExample]:
    """
    Reads sentiment examples in the format [0 or 1]<TAB>[raw sentence]; tokenizes and cleans the sentences and forms
    SentimentExamples.

    Note that we lowercase the data for you. This is because the GloVe embeddings don't
    distinguish case and so can only be used with lowercasing.

    Blank lines are skipped. A line that does not contain exactly one tab is split on
    whitespace instead: its first field is the label and the remaining fields are the sentence.

    :param infile: file to read from
    :return: a list of SentimentExamples parsed from the file
    :raises OSError: if the file cannot be opened
    """
    exs = []
    # The context manager guarantees the handle is closed even if a line fails to parse.
    with open(infile) as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank / whitespace-only lines
            fields = line.split("\t")
            if len(fields) != 2:
                # Not exactly "label<TAB>sentence": fall back to whitespace splitting.
                fields = line.split()
                sent = " ".join(fields[1:])
            else:
                sent = fields[1]
            # Slightly more robust to reading bad output than int(fields[0])
            label = 0 if "0" in fields[0] else 1
            # Split on single spaces and drop the empty strings left by repeated spaces.
            tokenized_cleaned_sent = [tok for tok in sent.lower().rstrip().split(" ") if tok != '']
            exs.append(SentimentExample(tokenized_cleaned_sent, label))
    return exs

In [6]:
# Download the NLTK 'punkt_tab' resource into the local nltk_data directory
# (a no-op when it is already up to date).
# NOTE(review): presumably needed by the word_tokenize / sent_tokenize imports above;
# none of the visible cells call them — confirm this download is still required.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [101]:
# Function to get symbol pairs and their frequencies
def get_stats(vocab):
    """
    Count how often each adjacent pair of symbols occurs across the vocabulary.

    :param vocab: mapping from a word written as whitespace-separated symbols to its frequency
    :return: a defaultdict(int) mapping (left_symbol, right_symbol) tuples to the summed
        frequency of every word occurrence in which the two symbols are neighbours
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # Zipping the sequence with itself shifted by one walks the adjacent pairs in order.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts

# Function to merge the most frequent pair
def merge_vocab(pair, v_in):
    """
    Fuse every occurrence of an adjacent symbol pair into a single symbol.

    :param pair: 2-tuple (left_symbol, right_symbol) to merge
    :param v_in: mapping from a word written as space-separated symbols to its frequency
    :return: a new dict with the same frequencies, where each ``left right`` occurrence in a
        key has been replaced by the concatenation ``leftright``; ``v_in`` is not modified
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or the string edge) on both sides, so the pair
    # only matches as two whole symbols and never inside a longer symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    # A callable replacement inserts the merged symbol verbatim. Passing the string itself
    # would treat it as a replacement template, so symbols containing a backslash would be
    # rewritten (e.g. '\\' + 'n' -> newline) or rejected ("bad escape" / "invalid group reference").
    def _replacement(_match):
        return merged

    return {p.sub(_replacement, word): freq for word, freq in v_in.items()}

# Number of merge operations
# num_merges = 10

def initialize_vocab(infile):
    """
    Build the starting BPE vocabulary from a sentiment data file.

    Each word occurrence is rewritten as its characters separated by single spaces and
    followed by the end-of-word marker ``</w>`` (e.g. ``low`` -> ``l o w </w>``).

    :param infile: path handed to read_sentiment_examples
    :return: a defaultdict(int) mapping each character-split word to its number of occurrences
    """
    word_counts = defaultdict(int)
    for example in read_sentiment_examples(infile):
        for token in example.words:
            # Joining a string directly iterates over its characters.
            word_counts[' '.join(token) + ' </w>'] += 1
    return word_counts

def perform_bpe_merges(num_merges, vocab):
    """
    Apply up to ``num_merges`` BPE merge steps to a vocabulary.

    Each step counts adjacent symbol pairs with get_stats, picks the most frequent pair
    (the first one encountered wins ties) and fuses it everywhere with merge_vocab.

    :param num_merges: maximum number of merge steps to perform
    :param vocab: mapping from a word written as space-separated symbols to its frequency
    :return: the vocabulary after merging; the input mapping itself is not modified
    """
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol (or the vocab is empty), so there is
            # nothing left to merge; max() on an empty mapping would raise ValueError.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

def get_subword_vocab(bpe_vocab):
    """
    Collapse a word-level BPE vocabulary into subword frequencies.

    :param bpe_vocab: mapping from a word written as space-separated symbols to its frequency
    :return: a defaultdict(int) mapping each subword to the total frequency of the words
        it appears in (counted once per appearance)
    """
    totals = defaultdict(int)
    for entry, freq in bpe_vocab.items():
        # Strip the end-of-word marker first, so 'the</w>' is counted as 'the' and a
        # standalone '</w>' symbol disappears.
        for piece in entry.replace('</w>', '').split():
            totals[piece] += freq
    return totals

def build_bpe_subword_vocab(infile, merges):
    """
    Learn a BPE subword vocabulary from a sentiment data file and print size statistics.

    Prints the word-level and subword-level vocabulary sizes before and after merging, plus
    the ratio of word-level to subword-level vocabulary size (labelled "Compression Ratio").

    :param infile: path to the data file, passed to initialize_vocab
    :param merges: number of BPE merge steps, passed to perform_bpe_merges
    :return: mapping from subword to frequency after the merges have been applied
    """
    word_vocab = initialize_vocab(infile)
    pre_bpe_subword_vocab = get_subword_vocab(word_vocab)

    print('Before running BPE algorithm-')
    print('Word-level vocab size: ', len(word_vocab))
    print('Subword-level vocab size: ', len(pre_bpe_subword_vocab))

    word_vocab = perform_bpe_merges(merges, word_vocab)
    post_bpe_subword_vocab = get_subword_vocab(word_vocab)

    # An empty corpus produces an empty subword vocabulary; report NaN for the ratio
    # instead of failing with ZeroDivisionError.
    if post_bpe_subword_vocab:
        compression_ratio = len(word_vocab) / len(post_bpe_subword_vocab)
    else:
        compression_ratio = float('nan')

    print('After running BPE algorithm-')
    print('Word-level vocab size: ', len(word_vocab))
    print('Subword-level vocab size: ', len(post_bpe_subword_vocab))
    print('Compression Ratio: ', compression_ratio)

    return post_bpe_subword_vocab

def tokenize_bpe(text, vocab):
    """
    Segment text into subwords by greedy longest-prefix matching against a vocabulary.

    The text is split on whitespace. Each word is consumed left to right: at every position
    the longest substring found in ``vocab`` is taken; if none matches, the single character
    at that position is emitted on its own.

    :param text: raw text to tokenize
    :param vocab: container of known subwords (only membership tests are used)
    :return: all emitted subwords, in order, joined by single spaces
    """
    pieces = []
    for token in text.split():
        start = 0
        token_len = len(token)
        while start < token_len:
            # Try candidate ends from the longest possible down to a single character.
            for stop in range(token_len, start, -1):
                candidate = token[start:stop]
                if candidate in vocab:
                    break
            else:
                # Nothing matched at this position: fall back to the lone character.
                stop = start + 1
                candidate = token[start]
            pieces.append(candidate)
            start = stop  # continue right after the piece just emitted
    return ' '.join(pieces)

In [103]:
# Learn a subword vocabulary from the training split with 500 BPE merge steps,
# then print every subword together with its frequency.
# NOTE(review): this emits one output line per subword (the whole vocabulary);
# consider showing only the most frequent entries to keep the notebook output short.
vocab = build_bpe_subword_vocab('data/train.txt', 500)
for word, freq in vocab.items():
    print(word, freq)

Before running BPE algorithm-
Word-level vocab size:  14830
Subword-level vocab size:  64
After running BPE algorithm-
Word-level vocab size:  14830
Subword-level vocab size:  471
Compression Ratio:  31.48619957537155
the 6209
r 1367
oc 525
k 2398
is 3140
de 1008
st 1995
in 4213
ed 2496
to 2687
be 1281
2 118
1 177
c 3291
ent 1146
ur 500
y 3875
's 2025
new 133
`` 210
con 660
an 2994
'' 209
and 4288
that 1613
he 324
go 161
ing 2968
make 176
a 7236
s 5961
pl 953
as 2145
h 2652
even 260
gre 196
at 1598
er 3580
than 400
ar 1936
n 2214
old 168
ch 1699
w 1740
z 460
en 2690
e 4240
g 2410
, 5885
j 605
- 2035
cl 363
au 496
d 4220
v 1578
am 802
me 317
or 2050
se 1061
al 2440
. 6732
ge 496
ous 491
ly 1255
el 1447
ab 333
ate 293
t 4144
u 1761
ation 323
of 3952
l 2906
ings 165
tr 449
il 828
og 164
so 420
ol 692
um 366
wor 415
ds 376
can 204
not 439
qu 596
ately 141
des 193
ri 839
o 2396
wr 227
it 2984
\ 87
/ 70
director 191
pe 264
ter 412
ack 318
on 2176
ex 902
p 3138
vis 145
i 2910
m 3777
id 364
le