In [3]:
# Run the DANmodels.py script in a subprocess with the `--model 2a` option.
# NOTE(review): judging by the captured output, `2a` appears to select the DAN
# with BPE subword tokenization -- confirm against DANmodels.py.
!python DANmodels.py --model 2a

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

-----------------------------------------------------------------------------------------------------------
Deep Averaging Network (DAN) with subword tokenization using Byte Pair Encoding:
----Loading Training Data----
Before running BPE algorithm-------
	Word-level vocab size:  14830
	Subword-level vocab size:  64
After running BPE algorithm--------
	Word-level vocab size:  14830
	Subword-level vocab size:  1824

Compression Ratio:  8.130482456140351
----Loading Test Data--------
Before running BPE algorithm-------
	Word-level vocab size:  4339
	Subword-level vocab size:  53
After running BPE algorithm--------
	Word-level vocab size:  4339
	Subword-level vocab size:  1747

Compression Ratio:  2.483686319404694

Data loaded in : 43.679264068603516 seconds

Training (from scratch) and Evaluating a 3 Layer Neural Network using BPE Subword-L

In [16]:
# Torch and related PyTorch libraries
import torch
from torch import nn  # Neural networks
import torch.nn.functional as F  # Functional API
from torch.utils.data import Dataset, DataLoader  # Data utilities

# NLP Preprocessing Libraries (nltk)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # Tokenizers
from nltk.corpus import stopwords  # Stop words
from nltk.stem import PorterStemmer  # Stemmer


# Utility and Helper Libraries
from typing import List  # Type hinting
import collections
from collections import Counter, defaultdict  # Counting elements
import re  # Regular expressions
import numpy as np  # Numerical computations
import time  # Time tracking
import argparse  # Argument parsing
import matplotlib.pyplot as plt  # Plotting
from torch.nn.utils.rnn import pad_sequence
import string


In [17]:
class SentimentExample:
    """
    One labelled sentence for sentiment classification.

    Attributes:
        words (List[string]): the tokens that make up the sentence
        label (int): sentiment class, 0 for negative and 1 for positive
    """

    def __init__(self, words, label):
        self.label = label
        self.words = words

    def __repr__(self):
        # Rendered as "<repr of words>; label=<repr of label>"
        return f"{self.words!r}; label={self.label!r}"

    def __str__(self):
        # The printable form is the same as the repr
        return repr(self)

In [20]:
def read_sentiment_examples(infile: str) -> List[SentimentExample]:
    """
    Reads sentiment examples in the format [0 or 1]<TAB>[raw sentence]; tokenizes and cleans the sentences and forms
    SentimentExamples.

    Blank lines are skipped. A line that does not split into exactly two tab-separated fields is
    re-split on whitespace instead: its first token is the label field and the remaining tokens are
    re-joined with single spaces to form the sentence.

    Sentences are lowercased. Per the original note, this is because the GloVe embeddings don't
    distinguish case and so can only be used with lowercasing.

    :param infile: file to read from
    :return: a list of SentimentExamples parsed from the file
    """
    exs = []
    # The context manager closes the file even if parsing raises part-way through.
    with open(infile) as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                fields = line.split()
                sent = " ".join(fields[1:])
            else:
                sent = fields[1]
            # Slightly more robust to reading bad output than int(fields[0])
            label = 0 if "0" in fields[0] else 1
            # Splitting on a single space can yield empty strings for repeated spaces; drop them.
            tokenized_cleaned_sent = [tok for tok in sent.lower().rstrip().split(" ") if tok != ""]
            exs.append(SentimentExample(tokenized_cleaned_sent, label))
    return exs

In [22]:
def get_stats(vocab):
    """
    Count how often each pair of adjacent symbols occurs across the vocabulary.

    :param vocab: mapping from a word written as whitespace-separated symbols to its frequency
    :return: a defaultdict(int) mapping (left_symbol, right_symbol) tuples to the summed
        frequency of the words in which the two symbols appear next to each other
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Walk every neighbouring pair of symbols, left to right
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """
    Merge every occurrence of an adjacent symbol pair into a single symbol.

    Only whole symbols are matched: the pair must be delimited by whitespace or the string
    boundary on both sides, so a symbol that merely ends or starts with the same characters
    is left alone.

    :param pair: tuple (left_symbol, right_symbol) to fuse
    :param v_in: mapping from a word written as whitespace-separated symbols to its frequency
    :return: a new dict with the same frequencies, keyed by the words after merging
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    # A callable replacement is inserted verbatim. Passing `merged` as a string would make
    # re.sub process it as a replacement template, so symbols containing a backslash would
    # either raise re.error or be rewritten (e.g. backslash + 'n' becoming a newline).
    return {p.sub(lambda _match: merged, word): freq for word, freq in v_in.items()}

# Number of merge operations
# num_merges = 10

def initialize_vocab(infile):
    """
    Build the initial character-level word vocabulary for BPE.

    Every word read from the file is written as its characters separated by single spaces,
    followed by the ' </w>' end-of-word marker, and counted.

    :param infile: path handed to read_sentiment_examples
    :return: a defaultdict(int) mapping each spaced-out word to its number of occurrences
    """
    word_counts = defaultdict(int)
    for example in read_sentiment_examples(infile):
        for token in example.words:
            spaced = ' '.join(token) + ' </w>'
            word_counts[spaced] += 1
    return word_counts

def perform_bpe_merges(num_merges, vocab):
    """
    Apply up to `num_merges` BPE merge steps to the vocabulary.

    Each step counts adjacent symbol pairs with get_stats, picks the most frequent pair and
    fuses it everywhere with merge_vocab. Merging stops early once no adjacent pairs remain.

    :param num_merges: maximum number of merge operations to perform
    :param vocab: mapping from a word written as whitespace-separated symbols to its frequency
    :return: the vocabulary after merging (the input object itself if no merge was performed)
    """
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol (or the vocabulary is empty);
            # max() on an empty mapping would raise ValueError.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

def get_subword_vocab(bpe_vocab):
    """
    Collapse a word-level BPE vocabulary into subword counts.

    :param bpe_vocab: mapping from a word written as whitespace-separated symbols to its frequency
    :return: a defaultdict(int) mapping each subword (with the '</w>' marker stripped) to the
        summed frequency of the words it appears in
    """
    counts = defaultdict(int)
    for entry, freq in bpe_vocab.items():
        # Strip the end-of-word marker, then split on whitespace to get the pieces
        for piece in entry.replace('</w>', '').split():
            counts[piece] += freq
    return counts

def build_bpe_subword_vocab(infile, merges):
    """
    Learn a BPE subword vocabulary from a sentiment data file.

    Prints the word-level and subword-level vocabulary sizes before and after merging, plus
    the "compression ratio" (word-level size divided by subword-level size after merging).

    :param infile: path of the data file, passed to initialize_vocab
    :param merges: number of merge operations, passed to perform_bpe_merges
    :return: mapping from subword to count after the merges have been applied
    """
    word_vocab = initialize_vocab(infile)
    pre_bpe_subword_vocab = get_subword_vocab(word_vocab)

    print('Before running BPE algorithm-')
    print('Word-level vocab size: ', len(word_vocab))
    print('Subword-level vocab size: ', len(pre_bpe_subword_vocab))

    word_vocab = perform_bpe_merges(merges, word_vocab)
    post_bpe_subword_vocab = get_subword_vocab(word_vocab)

    print('After running BPE algorithm-')
    print('Word-level vocab size: ', len(word_vocab))
    print('Subword-level vocab size: ', len(post_bpe_subword_vocab))
    # An empty input yields an empty subword vocabulary; report NaN instead of
    # crashing with ZeroDivisionError.
    if post_bpe_subword_vocab:
        compression_ratio = len(word_vocab) / len(post_bpe_subword_vocab)
    else:
        compression_ratio = float('nan')
    print('Compression Ratio: ', compression_ratio)

    return post_bpe_subword_vocab

def tokenize_bpe(text, vocab):
    """
    Segment text into subwords by greedy longest-match against a BPE vocabulary.

    The text is split on whitespace. Each word is consumed left to right: at every position
    the longest substring found in `vocab` is taken; if none matches, the single character at
    that position is emitted as its own piece.

    :param text: the sentence(s) to segment
    :param vocab: container of known subwords (anything supporting `in`)
    :return: one string with all subword pieces separated by single spaces
    """
    segmented_words = []

    for token in text.split():
        pieces = []
        start = 0
        while start < len(token):
            # Try candidate end positions from the longest substring down to one character;
            # fall back to a single character when nothing is in the vocabulary.
            end = next(
                (stop for stop in range(len(token), start, -1) if token[start:stop] in vocab),
                start + 1,
            )
            pieces.append(token[start:end])
            start = end
        segmented_words.append(' '.join(pieces))

    return ' '.join(segmented_words)

In [59]:
# Learn the BPE subword vocabulary from the training file, using 5000 merge operations.
myvocab = build_bpe_subword_vocab('data/train.txt', 5000)

Before running BPE algorithm-
Word-level vocab size:  14830
Subword-level vocab size:  64
After running BPE algorithm-
Word-level vocab size:  14830
Subword-level vocab size:  4447
Compression Ratio:  3.3348324713289856


In [63]:
# Load the labelled training sentences as a list of SentimentExample objects.
dataset = read_sentiment_examples('data/train.txt')

In [65]:
# Preview the first five training examples: the sentence, then its label.
counter = 5
for row in dataset:
    if counter <= 0:
        # The preview is complete; stop instead of scanning the rest of the dataset.
        break
    print(" ".join(row.words))
    print(row.label)
    counter -= 1

the rock is destined to be the 21st century 's new `` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
1
the gorgeously elaborate continuation of `` the lord of the rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director peter jackson 's expanded vision of j.r.r. tolkien 's middle-earth .
1
singer\/composer bryan adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece .
1
yet the act is still charming here .
1
whether or not you 're enlightened by any of derrida 's lectures on `` the other '' and `` the self , '' derrida is an undeniably fascinating and playful fellow .
1


In [67]:
# Rebuild the sentence of the example at index 3 by joining its tokens with spaces.
" ".join(dataset[3].words)

'yet the act is still charming here .'

In [69]:
# Segment the sentence of the example at index 3 into subwords using the learned BPE vocabulary.
doc = tokenize_bpe(" ".join(dataset[3].words), myvocab)

In [71]:
# Show the segmented sentence as a list of subword tokens.
doc.split()

['yet', 'the', 'act', 'is', 'still', 'charming', 'here', '.']

In [74]:
# Map each subword token to its position in the vocabulary's insertion order.
# The lookup table is built once, rather than materialising the key list and scanning it
# linearly for every token. A token missing from the vocabulary raises KeyError here.
subword_to_index = {subword: index for index, subword in enumerate(myvocab)}
[subword_to_index[word] for word in doc.split()]

[126, 0, 127, 2, 128, 129, 130, 50]