In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center><h1>Assignment 2 : Byte Pair Encoding Model and Bag of Words Model

<center><h2>Part 1 : Byte Pair Encoding Model Implementation

In [1]:
from collections import defaultdict

def get_stats(vocab):
    pairs = defaultdict(int)    # Setting/Initialising this defauldict() object with int parameter makes this useful for counting (like a bag or multiset in other languages).
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    for word in vocab:
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = vocab[word]
    return new_vocab

# Example vocabulary of words with their frequency :
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

# Initial call for get_stats
pairs = get_stats(vocab)
print(pairs)

print('\n')

print(f"Best initial Pair: {max(pairs, key=pairs.get)}")

print('\n')

# Initial call for merge_vocab
vocab = merge_vocab(('e', 's'), vocab) # We call this for e and s since it has among the highest frequency in the pairs. (best pair)
print(vocab)

# Continuing the process via a recursive calling of pairs and merging them according to the highest frq.
num_merges = 10
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best_pair = max(pairs, key=pairs.get)
    vocab = merge_vocab(best_pair, vocab)
    print(vocab)

defaultdict(<class 'int'>, {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 8, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3})


Best initial Pair: ('e', 's')


{'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
{'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
{'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
{'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
{'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
{'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}
{'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}
{'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
{'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
{'low</w>': 5, 'low e r </w>': 2,

<center><h2>Part 2 : Bag of Words Model Implementation

In [2]:
# Imporing the libraries
import torch
import torch.optim as optim
from torch import nn
from torch.autograd import Variable
from collections import defaultdict

In [3]:
class BOW(nn.Module):
    def __init__(self, nwords, ntags):
        super(BOW, self).__init__()
        
        self.embedding = nn.Embedding(nwords, ntags)
        nn.init.xavier_uniform_(self.embedding.weight)
        self.bias = nn.Parameter(torch.zeros(ntags))
    
    def forward(self, words):
        emb = self.embedding(words)
        out = torch.sum(emb, dim=0) + self.bias
        out = out.view(1, -1)  # Ensure the output is 1xN
        return out

In [4]:
# Initialize word to index and tag to index mappings
w2i = defaultdict(lambda: len(w2i))  # word to index mapping
t2i = defaultdict(lambda: len(t2i))  # tags to index mapping
UNK = w2i["<unk>"]                   # Define UNK as the token for unknown words

def read_corpus(filename, update_vocab=True):
    with open(filename, 'r') as fl:
        for line in fl:
            tag, words = line.lower().strip().split(" ||| ")
            word_indices = [w2i[x] if not update_vocab else w2i[x] for x in words.split(" ")]
            tag_index = t2i[tag] if update_vocab else t2i.get(tag, UNK)
            yield (word_indices, tag_index)

def prepare_data(filename, update_vocab=True):
    data = list(read_corpus(filename, update_vocab))
    w2i.default_factory = lambda: UNK
    nwords = len(w2i)
    ntags = len(t2i)
    return data, nwords, ntags, dict(w2i), dict(t2i)

In [6]:
# the below two imports are required if we are keeping the BOW model & the data_preparation function in a separate module/package.
# OPTIONAL IMPORT 1:  from utils import prepare_data (HERE THE UTIL FOLDER/PACKAGE CONTAINS THE prepare_data FUNCTION.
# OPTIONAL IMPORT 2: from bow_model import BOW

# Load and prepare data
train_data, nwords, ntags, w2i, t2i = prepare_data("/kaggle/input/data-for-assignment2/train.txt")

# Define the model, loss function, and optimizer
model = BOW(nwords, ntags)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 25

for epoch in range(num_epochs):
    train_loss = 0.0
    for words, tag in train_data:
        words_tensor = torch.tensor(words, dtype=torch.long)
        tags_tensor = torch.tensor([tag], dtype=torch.long)
        
        optimizer.zero_grad()
        scores = model(words_tensor)
        loss = criterion(scores, tags_tensor)
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Train Loss: {train_loss / len(train_data)}")

Epoch 1, Train Loss: 0.7257192384670506
Epoch 2, Train Loss: 0.5764438185405868
Epoch 3, Train Loss: 0.475957142351369
Epoch 4, Train Loss: 0.4057033377448942
Epoch 5, Train Loss: 0.35345555703902126
Epoch 6, Train Loss: 0.3129407311253935
Epoch 7, Train Loss: 0.2805097248972076
Epoch 8, Train Loss: 0.25390208594285546
Epoch 9, Train Loss: 0.23165648989428103
Epoch 10, Train Loss: 0.21277329718444585
Epoch 11, Train Loss: 0.19653339518553586
Epoch 12, Train Loss: 0.18240365790785537
Epoch 13, Train Loss: 0.1699826796624706
Epoch 14, Train Loss: 0.15896641288145885
Epoch 15, Train Loss: 0.14912299866646905
Epoch 16, Train Loss: 0.14027296131051795
Epoch 17, Train Loss: 0.13227482012267822
Epoch 18, Train Loss: 0.12501544465398087
Epoch 19, Train Loss: 0.11840393560463806
Epoch 20, Train Loss: 0.11236659597251329
Epoch 21, Train Loss: 0.10684175189449072
Epoch 22, Train Loss: 0.10177354933445082
Epoch 23, Train Loss: 0.09710749028793453
Epoch 24, Train Loss: 0.092791407576523
Epoch 25, T

In [10]:
# Printing the unique words and checking them. 
print(f"Number of Unique Words in Training Data : {nwords}")
for k,v in w2i.items():
    print(f"{k} - {v}")

Number of Unique Words in Training Data : 16582
<unk> - 0
effective - 1
but - 2
too-tepid - 3
biopic - 4
if - 5
you - 6
sometimes - 7
like - 8
to - 9
go - 10
the - 11
movies - 12
have - 13
fun - 14
, - 15
wasabi - 16
is - 17
a - 18
good - 19
place - 20
start - 21
. - 22
emerges - 23
as - 24
something - 25
rare - 26
an - 27
issue - 28
movie - 29
that - 30
's - 31
so - 32
honest - 33
and - 34
keenly - 35
observed - 36
it - 37
does - 38
n't - 39
feel - 40
one - 41
film - 42
provides - 43
some - 44
great - 45
insight - 46
into - 47
neurotic - 48
mindset - 49
of - 50
all - 51
comics - 52
-- - 53
even - 54
those - 55
who - 56
reached - 57
absolute - 58
top - 59
game - 60
offers - 61
combination - 62
entertainment - 63
education - 64
perhaps - 65
no - 66
picture - 67
ever - 68
made - 69
has - 70
more - 71
literally - 72
showed - 73
road - 74
hell - 75
paved - 76
with - 77
intentions - 78
steers - 79
turns - 80
in - 81
snappy - 82
screenplay - 83
curls - 84
at - 85
edges - 86
; - 87
clever - 8

In [7]:
# Initialize word to index and tag to index mappings
w2i = defaultdict(lambda: len(w2i))  # word to index mapping
t2i = defaultdict(lambda: len(t2i))  # tags to index mapping
UNK = w2i["<unk>"]  # Define UNK as the token for unknown words

def read_corpus(filename, update_vocab=True):
    with open(filename, 'r', encoding='utf-8') as fl:
        for line in fl:
            tag, words = line.lower().strip().split(" ||| ")
            word_indices = [w2i[x] if not update_vocab else w2i[x] for x in words.split(" ")]
            tag_index = t2i[tag] if update_vocab else t2i.get(tag, UNK)
            yield (word_indices, tag_index)

def prepare_data(filename, update_vocab=True):
    data = list(read_corpus(filename, update_vocab))
    w2i.default_factory = lambda: UNK
    nwords = len(w2i)
    ntags = len(t2i)
    return data, nwords, ntags, dict(w2i), dict(t2i)

In [8]:
# Evaluate on test data.
test_data, nwords_test, ntags_test, w2i_test, t2i_test = prepare_data("/kaggle/input/data-for-assignment2/test.txt",update_vocab=False)
test_loss = 0.0
with torch.no_grad():
    for words, tag in test_data:
        words_tensor = torch.tensor(words, dtype=torch.long)
        tags_tensor = torch.tensor([tag], dtype=torch.long)
        scores = model(words_tensor)
        loss = criterion(scores, tags_tensor)
        test_loss += loss.item()
    print(f"Test Loss: {test_loss / len(test_data)}")

Test Loss: 0.2939100143623291


In [9]:
print(f"Number of Unique Words: {nwords_test}")
for k,v in w2i_test.items():
    print(f"{k} - {v}")

Number of Unique Words: 7930
<unk> - 0
effective - 1
but - 2
too-tepid - 3
biopic - 4
if - 5
you - 6
sometimes - 7
like - 8
to - 9
go - 10
the - 11
movies - 12
have - 13
fun - 14
, - 15
wasabi - 16
is - 17
a - 18
good - 19
place - 20
start - 21
. - 22
emerges - 23
as - 24
something - 25
rare - 26
an - 27
issue - 28
movie - 29
that - 30
's - 31
so - 32
honest - 33
and - 34
keenly - 35
observed - 36
it - 37
does - 38
n't - 39
feel - 40
one - 41
film - 42
provides - 43
some - 44
great - 45
insight - 46
into - 47
neurotic - 48
mindset - 49
of - 50
all - 51
comics - 52
-- - 53
even - 54
those - 55
who - 56
reached - 57
absolute - 58
top - 59
game - 60
offers - 61
combination - 62
entertainment - 63
education - 64
perhaps - 65
no - 66
picture - 67
ever - 68
made - 69
has - 70
more - 71
literally - 72
showed - 73
road - 74
hell - 75
paved - 76
with - 77
intentions - 78
steers - 79
turns - 80
in - 81
snappy - 82
screenplay - 83
curls - 84
at - 85
edges - 86
; - 87
clever - 88
want - 89
hate - 

***< WORK IN PROGRESS UNDER THIS CELL >***

***References***
* https://docs.python.org/3/library/collections.html#collections.defaultdict
* https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html