In [1]:
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split


In [2]:
# add shortcut to drive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


# NBOW model for text classification
Neural bag of words (NBOW) model for a text classification task described here:
https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf 

# Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:

In [3]:
cd drive/MyDrive/

/content/drive/MyDrive


In [4]:
#!mkdir Data_CBOW
#!wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
#!tar -xvf rotten_imdb.tar.gz -C Data_CBOW


In [5]:
!ls Data_CBOW/ 

plot.tok.gt9.5000  quote.tok.gt9.5000  rotten_imdb.tar.gz  subjdata.README.1.0


In [6]:
! head -10 Data_CBOW/plot.tok.gt9.5000


the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 
spurning her mother's insistence that she get on with her life , mary is thrown out of the house , rejected by joe , and expelled from school as she grows larger with child . 
amitabh can't believe the board of directors and his mind is filled with revenge and what better revenge than robbing the bank himself , ironic as it may sound . 
she , among others excentricities , talks to a small rock , gertrude , like if she was alive . 
this gives the girls a fair chance of pulling the wool over their eyes using their sexiness to poach any last vestige of common sense the dons might have had . 
styled after vh1's " behin

In [7]:
! head -10 Data_CBOW/quote.tok.gt9.5000

smart and alert , thirteen conversations about one thing is a small gem . 
color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . 
it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . 
a light-hearted french film about the spiritual quest of a fashion model seeking peace of mind while in a love affair with a veterinarian who is a non-practicing jew . 
my wife is an actress has its moments in looking at the comic effects of jealousy . in the end , though , it is only mildly amusing when it could have been so much more . 
works both as an engaging drama and an incisive look at the difficulties facing native americans . 
even a hardened voyeur would require the patience of job to get through this interminable , shapeless documentary about the swinging subculture . 
when perry fists a bull at the moore farm , it's only a matter of time before he gets the upper hand in matters 

In [8]:
PATH = Path("Data_CBOW")

In [9]:
list(PATH.iterdir())

[PosixPath('Data_CBOW/subjdata.README.1.0'),
 PosixPath('Data_CBOW/quote.tok.gt9.5000'),
 PosixPath('Data_CBOW/plot.tok.gt9.5000'),
 PosixPath('Data_CBOW/rotten_imdb.tar.gz')]

# Tokenization
Tokenization is the task of chopping up text into pieces, called tokens.


In [10]:
# We need each line in the file 
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [11]:
obj_lines = read_file(PATH/"plot.tok.gt9.5000")


In [12]:
obj_lines[0]


'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \n'

In [13]:
np.array(obj_lines[0].strip().lower().split(" "))

array(['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a',
       'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi',
       'from', 'a', 'hunter', '.'], dtype='<U8')

# Split dataset in train and test


In [14]:
from sklearn.model_selection import train_test_split

sub_content = read_file(PATH/"quote.tok.gt9.5000")
obj_content = read_file(PATH/"plot.tok.gt9.5000")
sub_content = np.array([line.strip().lower() for line in sub_content])
obj_content = np.array([line.strip().lower() for line in obj_content])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [15]:
print("Sub text and label:", X[0], y[0])
print("Obj text and label:",X[5000], y[5000])

Sub text and label: smart and alert , thirteen conversations about one thing is a small gem . 0.0
Obj text and label: the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 1.0


In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
X_train[:1], y_train[:1]


(array(['will god let her fall or give her a new path ?'], dtype='<U691'),
 array([1.]))

# Word to index mapping
Compute a vocabulary of words based on the training set and mapping from word to an index.




In [18]:
from collections import defaultdict
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab      

In [91]:
#Getting the vocabulary from the training set
word_count = get_vocab(X_train)
#print(word_count)
#print(word_count.keys())
#print(len(word_count.keys()))

In [20]:
# Delete words with frequency < 5
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]
len(word_count.keys())


4065

In [21]:
## Set an index for each word in the vocab
vocab2index = {"":0, "UNK":1} # init with padding and unknown
words = ["", "UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [22]:
#vocab2index


# Sentence encoding
Here we encode each sentence as a sequence of indices corresponding to each word.

In [23]:
x_train_len = np.array([len(x.split()) for x in X_train])
x_valid_len = np.array([len(x.split()) for x in X_valid])

In [24]:
np.percentile(x_train_len, 99) # let set the max sequence len to N=40


55.0

In [25]:
X_train[0]


'will god let her fall or give her a new path ?'

In [26]:
# returns the index of the word or the index or "UNK" otherwise
vocab2index.get("?", vocab2index["UNK"])

2

In [27]:
np.array([vocab2index.get(w, vocab2index["UNK"]) for w in X_train[0].split()])


array([ 7, 11,  9, 12,  4,  3,  6, 12, 10,  8,  5,  2])

In [28]:
def encode_sentence(s, N=40):
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc, l

In [29]:
encode_sentence(X_train[0])


(array([ 7, 11,  9, 12,  4,  3,  6, 12, 10,  8,  5,  2,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0], dtype=int32), 12)

# Dataset

In [30]:
class SubjectivityDataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        x, s = encode_sentence(x)
        return x, self.y[idx], s
    
train_ds = SubjectivityDataset(X_train, y_train)
valid_ds = SubjectivityDataset(X_valid, y_valid)

In [31]:
batch_size=2
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [32]:
x, y, s = next(iter(train_dl))


In [33]:
x


tensor([[   1, 2347,   17,    1,    1,   36,   73, 3913,    1,    1,   57,   10,
          944, 1570,   18,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   1,   36, 2506,   36,    1,    1,   36,    1,   36,    1, 3790,   36,
            1,   25, 1543,  477, 3552,   60, 4009,   80,   17, 3721,   18,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], dtype=torch.int32)

In [34]:
y


tensor([1., 1.], dtype=torch.float64)

In [35]:
# lenght of each vector
s

tensor([15, 23])

# Embedding layer
Most deep learning models use a dense vectors of real numbers as representation of words (word embeddings), as opposed to a one-hot encoding representations. The module torch.nn.Embedding is used to represent word embeddings. It takes two arguments: the vocabulary size, and the dimensionality of the embeddings. The embeddings are initialized with random vectors.

In [36]:
# An Embedding module containing 10 words with embedding size 4
# Embedding weights will be initialized at random.
# Note that the padding_idx has embedding vector 0.

embed = nn.Embedding(10, 4, padding_idx=0)
embed.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [-0.5874,  1.4726, -0.1593,  1.5657],
        [ 0.0842,  1.7152,  1.0018, -1.1170],
        [ 0.1814, -1.6173,  1.2255, -0.4668],
        [-0.4929, -2.5295, -1.2409,  0.8116],
        [ 0.8431,  0.2836, -2.2855, -2.5100],
        [ 0.1989,  0.2340,  1.7802, -0.8061],
        [-1.9849,  0.0965,  0.2781,  0.1412],
        [-0.2190,  0.0187,  0.0615,  2.0734],
        [ 0.0130,  1.8356, -0.8755,  2.6680]], requires_grad=True)

In [37]:
# given a list of ids (a sentence) we can "look up" the embedding corresponing to each id
# can you see that some vectors are the same?
a = torch.LongTensor([[1,4,1,5,1,0]])
embed(a)

tensor([[[-0.5874,  1.4726, -0.1593,  1.5657],
         [-0.4929, -2.5295, -1.2409,  0.8116],
         [-0.5874,  1.4726, -0.1593,  1.5657],
         [ 0.8431,  0.2836, -2.2855, -2.5100],
         [-0.5874,  1.4726, -0.1593,  1.5657],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)

In [38]:
# Can we get a dense representation for the sentence a?
embed(a).sum(dim=1)/5

tensor([[-0.2824,  0.4344, -0.8009,  0.5998]], grad_fn=<DivBackward0>)

In [39]:
# What if we have two sentences where the first sentence has length 3 and the last sentence has length 2?.
# In order to use the same embedding layer we use padding at the end of the second sentence.
# Our model will take an average of the word embeddings for each sentence.

a = torch.LongTensor([[1,4,1], [1,3,0]])
s = torch.FloatTensor([3, 2]) # here is the size of the vector
print(embed(a))
print("--------------------------------------------")
print(embed(a).sum(dim=1) / s.view(s.shape[0], 1))

tensor([[[-0.5874,  1.4726, -0.1593,  1.5657],
         [-0.4929, -2.5295, -1.2409,  0.8116],
         [-0.5874,  1.4726, -0.1593,  1.5657]],

        [[-0.5874,  1.4726, -0.1593,  1.5657],
         [ 0.1814, -1.6173,  1.2255, -0.4668],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)
--------------------------------------------
tensor([[-0.5559,  0.1386, -0.5199,  1.3144],
        [-0.2030, -0.0723,  0.5331,  0.5495]], grad_fn=<DivBackward0>)


# Continuous Bag of Words Model


In [83]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size=100):
        super(CBOW, self).__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.linear1 = nn.Linear(emb_size, 30)
        self.linear2 = nn.Linear(30, 1)
        
    def forward(self, x, s):
        x = self.word_emb(x)
        x = x.sum(dim=1)/ s.view(s.shape[0], 1)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x

In [84]:
model = CBOW(vocab_size=5, emb_size=3)
model.word_emb.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000],
        [-1.0605, -0.7193, -0.3752],
        [-0.6447, -1.7190,  0.0336],
        [-0.2961, -0.8309, -0.3394],
        [-1.8844, -1.0602, -0.7543]], requires_grad=True)

In [85]:
y_hat = model(a, s)
y_hat

tensor([[0.7080],
        [0.4535]], grad_fn=<AddmmBackward0>)

# Training the CBOW model


In [86]:
len(valid_ds)     # 2000
V = len(words)    # 4067                         
model = CBOW(vocab_size=V, emb_size=50)

train_dl = DataLoader(train_ds, batch_size=500, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=2000)

loss_func = nn.BCEWithLogitsLoss(reduction='mean')


In [87]:
def test_metrics(model):
    model.eval()
    for x, y, s in valid_dl:
        s = torch.FloatTensor(s.float()).view(s.shape[0], 1)
        y = y.unsqueeze(1)
        y_hat = model(x.long(), s)
        loss = loss_func(y_hat, y)
        boundary = (y_hat.max() - abs(y_hat.min()))/2.0
        y_pred = y_hat > boundary
#        y_pred = y_hat > 0
        correct = (y_pred.float() == y).float().sum() # how many prediction match the true label
        accuracy = correct/y_pred.shape[0]
    return loss.item(), accuracy.item()

In [88]:
# accuracy of a random model --> 0.5
test_metrics(model)

(0.6943347522313706, 0.4975000023841858)

In [89]:
def train_epocs(model, epochs=10, lr=0.01, weight_decay=1e-5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    for i in range(epochs):
        model.train()
        for x, y, s in train_dl:
            y = y.unsqueeze(1)
            s = s.type(torch.Tensor).view(s.shape[0], 1)
            y_hat = model(x.long(), s)
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        val_loss, val_acc = test_metrics(model)
        print("train_loss: %.3f val_loss: %.3f --- val_accuracy: %.3f" % (loss.item(), val_loss, val_acc))

In [90]:
V = len(words)
model = CBOW(vocab_size=V, emb_size=50)
train_epocs(model, epochs=20)

train_loss: 0.525 val_loss: 0.527 --- val_accuracy: 0.536
train_loss: 0.327 val_loss: 0.338 --- val_accuracy: 0.825
train_loss: 0.233 val_loss: 0.270 --- val_accuracy: 0.891
train_loss: 0.179 val_loss: 0.253 --- val_accuracy: 0.897
train_loss: 0.109 val_loss: 0.265 --- val_accuracy: 0.897
train_loss: 0.080 val_loss: 0.285 --- val_accuracy: 0.891
train_loss: 0.052 val_loss: 0.315 --- val_accuracy: 0.884
train_loss: 0.050 val_loss: 0.345 --- val_accuracy: 0.882
train_loss: 0.049 val_loss: 0.380 --- val_accuracy: 0.883
train_loss: 0.029 val_loss: 0.415 --- val_accuracy: 0.877
train_loss: 0.023 val_loss: 0.446 --- val_accuracy: 0.879
train_loss: 0.019 val_loss: 0.483 --- val_accuracy: 0.873
train_loss: 0.014 val_loss: 0.508 --- val_accuracy: 0.876
train_loss: 0.007 val_loss: 0.530 --- val_accuracy: 0.876
train_loss: 0.016 val_loss: 0.554 --- val_accuracy: 0.876
train_loss: 0.007 val_loss: 0.572 --- val_accuracy: 0.873
train_loss: 0.006 val_loss: 0.583 --- val_accuracy: 0.873
train_loss: 0.