In [1]:
import torch
from torchtext import data

In [2]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
%%time

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

train_data, valid_data, test_data = data.TabularDataset.splits(
    path = '/Users/anton/mywork/Datasets/Quora/split',
    train = 'train.csv', validation = 'valid.csv', test = 'test.csv',
    format = 'csv', skip_header = True,
    fields=[('qid', None), ('question_text', TEXT), ('target', LABEL)])

CPU times: user 10min 9s, sys: 1.97 s, total: 10min 11s
Wall time: 10min 12s


In [4]:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

In [5]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [6]:
LABEL.vocab.itos

['0', '1']

In [7]:
TEXT.vocab.freqs.most_common(20)

[('?', 1104694),
 ('the', 523352),
 ('What', 346418),
 ('to', 325289),
 ('a', 322923),
 ('in', 294044),
 ('is', 269293),
 ('of', 265776),
 ('I', 263782),
 ('How', 210455),
 ('and', 202251),
 ('do', 185149),
 (',', 184546),
 ('are', 172584),
 ('for', 161839),
 ('you', 160231),
 ('Why', 116347),
 ('it', 112506),
 ('can', 100494),
 ('Is', 88559)]

In [8]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: x.question_text,
    batch_size=BATCH_SIZE,
    device=device)

In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [11]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [12]:
model = model.to(device)
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [14]:
def precision_recall_comp(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    true_pos = ((rounded_preds == 1.0) & (y == 1.0)).sum()
    false_pos = ((rounded_preds == 1.0) & (y == 0.0)).sum()
    false_neg = ((rounded_preds == 0.0) & (y == 1.0)).sum()
    return true_pos, false_pos, false_neg

In [15]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.question_text).squeeze(1)
        
        loss = criterion(predictions, batch.target)
        
        acc = binary_accuracy(predictions, batch.target)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    epoch_true_pos = 0
    epoch_false_pos = 0
    epoch_false_neg = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.question_text).squeeze(1)
            
            loss = criterion(predictions, batch.target)
            
            acc = binary_accuracy(predictions, batch.target)
            true_pos, false_pos, false_neg = precision_recall_comp(predictions, batch.target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_true_pos += true_pos.item()
            epoch_false_pos += false_pos.item()
            epoch_false_neg += false_neg.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_true_pos / (epoch_true_pos + epoch_false_pos), epoch_true_pos / (epoch_true_pos + epoch_false_neg)

In [22]:
%%time

N_EPOCHS = 2

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_precision, valid_recall = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

KeyboardInterrupt: 

In [20]:
test_loss, test_acc, test_precision, test_recall = evaluate(model, test_iterator, criterion)
test_f1 = 2 * test_precision * test_recall / (test_precision + test_recall)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test Precision: {test_precision*100:.2f}% | Test Recall: {test_recall*100:.2f}% | Test F1: {test_f1*100:.2f}%')

| Test Loss: 0.231 | Test Acc: 93.80% | Test Precision: 26.67% | Test Recall: 0.10% | Test F1: 0.20%
