In [2]:
import torch
import pandas as pd
import numpy as np

In [3]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, csv_file, transform=None, target_transform=None):
        self.csv_data = pd.read_csv(csv_file)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.csv_data)

    def __getitem__(self, idx):
        text = self.csv_data.iloc[idx, 1]
        label = self.csv_data.iloc[idx, 0]
        if self.transform:
            text = torch.tensor(self.transform(text), dtype=torch.int64)
        if self.target_transform:
            label = self.target_transform(label)
        return text, label

In [4]:
train_dataset = CustomDataset(csv_file=r'IMDB_train.csv')
valid_dataset = CustomDataset(csv_file=r'IMDB_valid.csv')
test_dataset = CustomDataset(csv_file=r'IMDB_test.csv')

In [8]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

train_df = pd.read_csv(r'IMDB_train.csv')
train_tokenized = train_df['texts'].map(tokenizer)

In [9]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(train_tokenized, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [11]:
vocab.get_itos()[1]

'<unk>'

In [23]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    batch_text2id_seq = [torch.tensor(vocab(tokenizer(_text)), dtype=torch.int64) for _text, _ in batch]
    batch_label_seq = torch.tensor([_label for _, _label in batch], dtype=torch.float)
    batch_label_seq = batch_label_seq.unsqueeze(1)
    
    
    batch_padded = pad_sequence(batch_text2id_seq, batch_first=True)
    return batch_padded.to(device), batch_label_seq.to(device)

In [12]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100
    start_time = time.time()
    for idx, (text, label) in enumerate(dataloader):
        opt.zero_grad()
        predicted_label = model(text)
        loss = loss_func(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        opt.step()
        total_acc += ((torch.sigmoid(predicted_label) > 0.5) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f} | lr: {:.0e}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count, scheduler.get_last_lr()[0]))
            total_acc, total_count = 0, 0
            start_time = time.time()
            
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    
    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += ((torch.sigmoid(predicted_label) > 0.5) == label).sum().item()
            total_count += label.size(0)
        return total_acc/total_count

In [26]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        padding_idx,
        padding=[0, 0, 0],
        dropout=0.5,
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx)

        self.conv_0 = nn.Conv1d(emb_dim, out_channels[0], kernel_sizes[0], padding=padding[0])
        self.conv_1 = nn.Conv1d(emb_dim, out_channels[1], kernel_sizes[1], padding=padding[1])
        self.conv_2 = nn.Conv1d(emb_dim, out_channels[2], kernel_sizes[2], padding=padding[2])
        
        self.fc = nn.Linear(torch.tensor(out_channels).sum(), 1)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        embedded = embedded.permute(0, 2, 1)
        
        conved_0 = F.relu(self.conv_0(embedded))
        conved_1 = F.relu(self.conv_1(embedded))
        conved_2 = F.relu(self.conv_2(embedded))
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
            
        return self.fc(cat)

In [30]:
vocab_size = len(vocab)
emb_dim = 200
out_channels = [256, 256, 256]
kernel_sizes = [2, 3, 5]
padding=[1, 2, 3]
dropout=0.5
padding_idx=vocab['<pad>']


model = CNN(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    out_channels=out_channels,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    padding=padding,
    padding_idx=padding_idx
)

model = model.to(device)

opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.StepLR(opt, 1.0, gamma=0.1)

BATCH_SIZE = 128
max_epochs = 20

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

In [31]:
total_accu = None

for epoch in range(1, max_epochs + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
   
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   100/  274 batches | accuracy    0.647 | lr: 1e-03
| epoch   1 |   200/  274 batches | accuracy    0.764 | lr: 1e-03
-----------------------------------------------------------
| end of epoch   1 | time: 12.17s | valid accuracy    0.849 
-----------------------------------------------------------
| epoch   2 |   100/  274 batches | accuracy    0.797 | lr: 1e-03
| epoch   2 |   200/  274 batches | accuracy    0.829 | lr: 1e-03
-----------------------------------------------------------
| end of epoch   2 | time: 12.15s | valid accuracy    0.879 
-----------------------------------------------------------
| epoch   3 |   100/  274 batches | accuracy    0.831 | lr: 1e-03
| epoch   3 |   200/  274 batches | accuracy    0.853 | lr: 1e-03
-----------------------------------------------------------
| end of epoch   3 | time: 12.24s | valid accuracy    0.884 
-----------------------------------------------------------
| epoch   4 |   100/  274 batches | accuracy    0.848 | lr: 1

In [35]:
evaluate(test_dataloader)

0.8965333333333333

In [33]:
def ev(sen):
    model.eval()
    with torch.no_grad():
        tokenized = tokenizer(sen)
        inp = torch.tensor(vocab(tokenized)).unsqueeze(0).to(device)
        pred = torch.sigmoid(model(inp))
    return pred.item()
ev('It was a fantastic performance !')

0.9983785152435303

In [34]:
sentences = ['Best film ever', 
             'Such a great show!', 
             'It was a horrible movie', 
             'I\'ve never watched something as bad', 
             'It is a disgusting movie!', 
             'So-so. I\'d watched something better you know']
for sen in sentences:
    print(ev(sen))

0.9947662353515625
0.9979516863822937
0.010136223398149014
0.06495773792266846
0.0990145280957222
0.5555111765861511
