In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 




In [2]:
# add shortcut to drive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


# Dataset
Sentiment classification on the imdb dataset.
https://ai.stanford.edu/~amaas/data/sentiment/

In [3]:
cd drive/MyDrive/

/content/drive/MyDrive


In [4]:
# !mkdir Data_Imdb
# !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxvf aclImdb_v1.tar.gz -C Data_Imdb


In [5]:
from pathlib import Path
PATH = Path("Data_Imdb/aclImdb/")
list(PATH.iterdir())

[PosixPath('Data_Imdb/aclImdb/imdb.vocab'),
 PosixPath('Data_Imdb/aclImdb/test'),
 PosixPath('Data_Imdb/aclImdb/imdbEr.txt'),
 PosixPath('Data_Imdb/aclImdb/README'),
 PosixPath('Data_Imdb/aclImdb/train')]

In [6]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

# Tokenization


In [7]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en_core_web_sm')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [8]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

# Vocab2index

In [9]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('Data_Imdb/aclImdb/train/pos/11546_9.txt'),
 PosixPath('Data_Imdb/aclImdb/train/pos/11422_8.txt'),
 PosixPath('Data_Imdb/aclImdb/train/pos/11534_7.txt'),
 PosixPath('Data_Imdb/aclImdb/train/pos/11815_10.txt'),
 PosixPath('Data_Imdb/aclImdb/train/pos/11632_7.txt')]

In [11]:
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [12]:
print(len(counts.keys()))
#counts


103163


In [13]:
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

len(counts.keys())

33893

In [14]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [15]:
#vocab2index

# Dataset

In [16]:
# note that spacy_tok takes a while run it just once
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    x = spacy_tok(path.read_text())
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[N-l:] = enc1[:l]
    else:
        enc[:l] = enc1[:l]
    return enc, l

In [17]:
path = PATH/"train/neg/211_4.txt"
path.read_text()


'Hilariously obvious "drama" about a bunch of high school (I think) kids who enjoy non-stop hip-hop, break dancing, graffiti and trying to become a dj at the Roxy--or something. To be totally honest I was so bored I forgot! Even people who love the music agree this movie is terribly acted and--as a drama--failed dismally. We\'re supposed to find this kids likable and nice. I found them bland and boring. The one that I REALLY hated was Ramon. He does graffiti on subway trains and this is looked upon as great. Excuse me? He\'s defacing public property that isn\'t his to begin with. Also these "great" kids tap into the city\'s electricity so they can hold a big dance party at an abandoned building. Uh huh. So we\'re supposed to find a bunch of law breakers lovable and fun.<br /><br />I could forgive all that if the music was good but I can\'t stand hip hop. The songs were--at best--mediocre and they were nonstop! They\'re ALWAYS playing! It got to the point that I was fast-forwarding thro

In [18]:
spacy_tok(path.read_text())[:10]

['Hilariously',
 'obvious',
 '"',
 'drama',
 '"',
 'about',
 'a',
 'bunch',
 'of',
 'high']

In [19]:
vocab2index['drama']

210

In [20]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400, padding_start=True)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            1,  2544,    89,   210,    89,   229,   105,  3569,    30,
         1862,  2334,   171,   270,   732,   174,   540,   232,  1402,
         3601,   398,  2034, 18018,   398,  7917,     8,  3969,  3025,
            8, 20317,    14,  1323,    45,   178,   105,     1,   114,
      

In [21]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train="train", N=400, padding_start=True):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        # it is important to run encode_sentence in the init
        self.X = [encode_sentence(path, vocab2index, N, padding_start) for path in self.files]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]

In [22]:
train_ds_v0 = ImdbDataset(PATH, padding_start=True)
valid_ds_v0 = ImdbDataset(PATH, "test", padding_start=True)

In [25]:
batch_size = 1000
train_dl_v0 = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
valid_dl_v0 = DataLoader(valid_ds_v0, batch_size=batch_size)

In [26]:
train_ds_v0[1]


(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

# LSTM

In [27]:
# Input dim is the dimension of the embedding for each word (2 in the example)
# Output dim is the dimension of the hidden layer (4 in this example)
# batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). 
lstm = nn.LSTM(2, 4, batch_first=True)  

In [28]:
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view(1, len(inputs), -1)
print(inputs.shape)
inputs

tensor([[[-0.2389, -0.2758],
         [ 0.1086, -0.6741],
         [-0.2651,  0.1672],
         [-1.1916, -1.0372],
         [ 0.2675, -1.7877]]])

In [29]:
# RNNs with batch_first=True assume this input shape
# input shape should be bash_size x seq_len x embedding dimension
inputs.shape

torch.Size([1, 5, 2])

In [30]:
out, (hidden, cell) = lstm(inputs)


In [31]:
print(out.shape)
out

torch.Size([1, 5, 4])


tensor([[[ 0.0316, -0.0424,  0.0687, -0.0839],
         [ 0.0583, -0.0238,  0.0999, -0.0924],
         [ 0.0496, -0.0644,  0.1124, -0.1580],
         [ 0.0994, -0.0650,  0.1343, -0.2784],
         [ 0.1083,  0.0412,  0.1601, -0.1735]]], grad_fn=<TransposeBackward0>)

In [32]:
hidden


tensor([[[ 0.1083,  0.0412,  0.1601, -0.1735]]], grad_fn=<StackBackward0>)

# Model

In [33]:
class LSTMV0Model(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMV0Model,self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [34]:
def train_epocs_v0(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [35]:
def val_metrics_v0(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        # s is not used here
        x = x.long().cuda()
        y = y.float().unsqueeze(1).cuda()
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [36]:
batch_size = 5000
train_dl = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds_v0, batch_size=batch_size)

In [38]:
vocab_size = len(words)
print(vocab_size)
model_v0 = LSTMV0Model(vocab_size, 50, 50).cuda()

33895


RuntimeError: ignored

In [None]:
train_epocs_v0(model_v0, epochs=30, lr=0.01)
