In [1]:
from torchtext.datasets import IMDB
import torchdata
import torch

In [2]:
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')


In [3]:
## Step 1: create the datasets
from torch.utils.data.dataset import random_split
torch.manual_seed(1)

train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [4]:
a = next(iter(test_dataset))
a

(1,
 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they

In [5]:
a = next(iter(train_dataset))
a

(2,
 'An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.')

In [6]:
# we will first find the unique words (tokens) in the training dataset
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

In [7]:
token_counts = Counter()

In [8]:
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

In [9]:
print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [10]:
token_counts

Counter({'an': 17204,
         'extra': 244,
         'is': 85847,
         'called': 1140,
         'upon': 675,
         'to': 107513,
         'play': 1752,
         'a': 130057,
         'general': 619,
         'in': 74646,
         'movie': 35149,
         'about': 13734,
         'the': 267877,
         'russian': 250,
         'revolution': 160,
         'however': 2892,
         'he': 24075,
         'not': 24329,
         'any': 6089,
         'ordinary': 222,
         'serguis': 2,
         'alexander': 100,
         'former': 416,
         'commanding': 37,
         'of': 116119,
         'russia': 66,
         'armies': 17,
         'who': 17112,
         'now': 3704,
         'being': 5224,
         'forced': 505,
         'relive': 19,
         'same': 3244,
         'scene': 4223,
         'which': 9563,
         'suffered': 116,
         'professional': 270,
         'and': 130797,
         'personal': 510,
         'tragedy': 291,
         'satisfy': 73,
         'dir

In [11]:
#  we are going to map each unique word to a unique integer.
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)
print(vocab)

Vocab()


In [12]:
print([vocab[token] for token in ['ceyhun', 'sahin', 'and', 'his', 'family']])

[1, 1, 3, 27, 206]


In [13]:
## Step 3-A: define the functions for transformation
# text_pipeline function to transform each text in the dataset
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.

In [14]:
# We will generate batches of samples using DataLoader and pass the data processing pipelines declared previously to the argument collate_fn.
## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list, label_list, lengths

In [15]:
## Take a small batch
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=5, shuffle=False, collate_fn=collate_batch)

In [16]:
text_batch, label_batch, length_batch = next(iter(dataloader))
length_batch

tensor([165,  86, 218, 145, 116])

In [17]:
print(label_batch.shape)

torch.Size([5])


In [18]:
print(length_batch)

tensor([165,  86, 218, 145, 116])


In [19]:
print(text_batch.shape)

torch.Size([5, 218])


In [20]:
test_dataset

ShardingFilterIterDataPipe

In [21]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,shuffle=False, collate_fn=collate_batch)

In [22]:
source = next(iter(train_dl))[0]
source

tensor([[ 3648,  1075,  3717,  ...,     0,     0,     0],
        [   88,   121,    11,  ...,     0,     0,     0],
        [   10,    28,    76,  ...,     0,     0,     0],
        ...,
        [    4,  1368,    48,  ...,     0,     0,     0],
        [  168,    10,   122,  ...,   288,    29,   808],
        [23224, 19518,    91,  ...,     0,     0,     0]])

In [23]:
target = next(iter(train_dl))[1]
target

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
lengths = next(iter(train_dl))[2]
lengths

tensor([510, 293, 352, 129, 129, 282, 184, 133, 249, 392, 220, 248, 224, 214,
        153, 134, 147,  55, 623, 306, 129, 221, 131, 399, 123, 163, 150,  61,
        117, 221, 383, 365])

In [25]:
text_batch, label_batch, length_batch = next(iter(train_dl))
text_batch.shape

torch.Size([32, 983])

In [26]:
length_batch.shape

torch.Size([32])

## Embedding layers for sentence encoding

In [27]:
embedding = torch.nn.Embedding(
    num_embeddings=10, embedding_dim=3, padding_idx=0
)

In [28]:
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,3,4], [4,5,6,7]])
print(embedding(text_encoded_input))

tensor([[[-1.8292, -0.9281,  1.6486],
         [-0.2286,  0.0294, -1.8128],
         [-0.9500,  0.3597,  0.0832],
         [ 0.5263,  1.4874, -1.1160]],

        [[ 0.5263,  1.4874, -1.1160],
         [-1.0165,  0.5236, -0.3098],
         [ 0.1645,  0.3430, -0.5329],
         [-0.7423,  0.2471, -1.1142]]], grad_fn=<EmbeddingBackward0>)


# Building an RNN model


In [29]:
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size,1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1,:,:]
        out = self.fc(out)
        return out


In [30]:
model = RNN(64,32)

In [31]:
print(model)

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [32]:
model(torch.randn(5, 3, 64))

tensor([[-0.2866],
        [-0.0270],
        [-0.2858],
        [-0.0062],
        [-0.1359]], grad_fn=<AddmmBackward0>)

# Building an RNN model for the sentiment analysis task

In [33]:
class RNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = torch.nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = torch.nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(fc_hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = torch.nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1,:,:]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [34]:
len(vocab)

69025

In [35]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim,rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

#### we will develop the train function to train the model on the given dataset for one epoch and return the classification accuracy and loss:

In [36]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
loss_fn = torch.nn.BCELoss()
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
    total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

### we will develop the evaluate function to measure the model’s performance on a given dataset

In [37]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]

            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [38]:
num_epochs = 3
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f}', f' val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.9987  val_accuracy: 1.0000
Epoch 1 accuracy: 1.0000  val_accuracy: 1.0000
Epoch 2 accuracy: 1.0000  val_accuracy: 1.0000


In [39]:
a = next(iter(test_dl))
a

(tensor([[  10,  115,  903,  ...,    0,    0,    0],
         [ 288,    2,  701,  ...,    0,    0,    0],
         [  92,    4,  489,  ...,    0,    0,    0],
         ...,
         [ 118,  123,  192,  ...,    0,    0,    0],
         [6863, 1767,    6,  ...,    0,    0,    0],
         [ 624,   10,   51,  ...,    0,    0,    0]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([247, 216, 138, 360, 136, 190, 297, 173, 130, 171, 131, 196, 116, 417,
         140, 189, 256, 169, 266, 220, 177, 446, 126, 358, 259, 210, 234, 217,
         130,  71, 219, 204]))

In [40]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

TypeError: _IterDataPipeSerializationWrapper instance doesn't have valid length

In [None]:
import numpy as np
w = np.zeros((5,5)); w

In [None]:
for i in range(len(w)):
    for j in range(len(w)):
        w[i][j] = float(((i-j)**2)/16) #as per formula, for this competition, N=5

In [None]:
w

In [None]:
from sklearn.metrics import confusion_matrix
actuals = np.array([4, 4, 3, 4, 4, 4, 1, 1, 2, 1])
preds   = np.array([0, 2, 1, 0, 0, 0, 1, 1, 2, 1])
O = confusion_matrix(actuals, preds); O

In [None]:
N=5

act_hist=np.zeros([N])
for item in actuals:
    act_hist[item]+=1

pred_hist=np.zeros([N])
for item in preds:
    pred_hist[item]+=1
print(act_hist)
print(pred_hist)

In [None]:
print(f'Actuals value counts:{act_hist}, Prediction value counts:{pred_hist}')

In [None]:
E = np.outer(act_hist, pred_hist); E

In [None]:
E = E/E.sum(); E.sum()

In [None]:
O = O/O.sum(); O.sum()

In [None]:
E

In [None]:
O

## Calculate Weighted Kappa

In [None]:
num=0
den=0
for i in range(len(w)):
    for j in range(len(w)):
        num+=w[i][j]*O[i][j]
        den+=w[i][j]*E[i][j]

weighted_kappa = (1 - (num/den)); weighted_kappa

In [None]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
counter = Counter(["a", "a", "b", "b", "b"])
print(counter)
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
v1 = vocab(ordered_dict)
print(v1['a']) #prints 1
print(v1['out of vocab']) #raise RuntimeError since default index is not set


In [None]:
tokens = ['e', 'd', 'c', 'b', 'a']
#adding <unk> token and default index
unk_token = '<unk>'
default_index = -1
v2 = vocab(OrderedDict([(token, 1) for token in tokens]), specials=[unk_token])
v2.set_default_index(default_index)
print(v2['<unk>']) #prints 0
print(v2['out of vocab']) #prints -1
#make default index same as index of unk_token
v2.set_default_index(v2[unk_token])
v2['out of vocab'] is v2[unk_token] #prints True

In [41]:
()

()