In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import spacy
import time

In [2]:
# One seed shared by every RNG the notebook uses, so that a
# "Restart Kernel -> Run All" reproduces the same shuffles and weight inits.
SEED = 1337

np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which can
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
# Download spaCy's large English model (the log below shows en_core_web_lg 2.3.1, ~783 MB).
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz (782.7 MB)
[K     |████████████████████████████████| 782.7 MB 5.8 kB/s  eta 0:00:01   |▉                               | 19.1 MB 1.5 MB/s eta 0:08:15     |█                               | 24.6 MB 1.5 MB/s eta 0:08:11     |█▏                              | 28.0 MB 1.5 MB/s eta 0:08:09     |████                            | 97.9 MB 86.9 MB/s eta 0:00:08     |██████▎                         | 153.1 MB 75.8 MB/s eta 0:00:09     |█████████▉                      | 240.0 MB 83.1 MB/s eta 0:00:07
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


### Import Data

In [7]:
# Locations of the three SNLI splits; the "valid" split is read from the dev file.
snli_csv_paths = {
    'train': '../input/stanford-natural-language-inference-corpus/snli_1.0_train.csv',
    'test': '../input/stanford-natural-language-inference-corpus/snli_1.0_test.csv',
    'valid': '../input/stanford-natural-language-inference-corpus/snli_1.0_dev.csv',
}

# Read each split into its own DataFrame (train, then test, then valid).
train = pd.read_csv(snli_csv_paths['train'])
test = pd.read_csv(snli_csv_paths['test'])
valid = pd.read_csv(snli_csv_paths['valid'])

### Remove null data values

In [8]:
# Count the missing values in every column of the raw training set.
train.isna().sum()

gold_label                     0
sentence1_binary_parse         0
sentence2_binary_parse         6
sentence1_parse                0
sentence2_parse                0
sentence1                      0
sentence2                      6
captionID                      0
pairID                         0
label1                         0
label2                    510782
label3                    510757
label4                    510769
label5                    513238
dtype: int64

In [9]:
# Remove data points for which the label is not given or sentence2 is null.
def _drop_unusable_pairs(frame):
    """Return `frame` without rows that cannot be used for training/evaluation.

    A row is dropped when its `sentence2` is null or when its `gold_label`
    is the placeholder "-". The index is left untouched, so surviving rows
    keep their original labels.
    """
    frame = frame.dropna(subset=['sentence2'])
    return frame[frame["gold_label"] != "-"]


# Apply the same cleaning to every split: a null hypothesis in valid/test
# would otherwise only surface later as a crash during vectorization.
train = _drop_unusable_pairs(train)
test = _drop_unusable_pairs(test)
valid = _drop_unusable_pairs(valid)

In [10]:
# Re-count missing values per column to confirm the cleaning above worked.
train.isna().sum()

gold_label                     0
sentence1_binary_parse         0
sentence2_binary_parse         0
sentence1_parse                0
sentence2_parse                0
sentence1                      0
sentence2                      0
captionID                      0
pairID                         0
label1                         0
label2                    510769
label3                    510740
label4                    510753
label5                    512937
dtype: int64

### View data 

In [11]:
# Preview the first rows of the cleaned training set.
train.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,captionID,pairID,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,3416050480.jpg#4,3416050480.jpg#4r1n,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",3416050480.jpg#4,3416050480.jpg#4r1c,contradiction,,,,
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",3416050480.jpg#4,3416050480.jpg#4r1e,entailment,,,,
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...,Children smiling and waving at camera,They are smiling at their parents,2267923837.jpg#2,2267923837.jpg#2r1n,neutral,,,,
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) ),(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...,Children smiling and waving at camera,There are children present,2267923837.jpg#2,2267923837.jpg#2r1e,entailment,,,,


In [12]:
# (rows, columns) of the training set after cleaning.
train.shape

(549361, 14)

In [13]:
# Column dtypes, non-null counts and memory footprint of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 549361 entries, 0 to 550151
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   gold_label              549361 non-null  object
 1   sentence1_binary_parse  549361 non-null  object
 2   sentence2_binary_parse  549361 non-null  object
 3   sentence1_parse         549361 non-null  object
 4   sentence2_parse         549361 non-null  object
 5   sentence1               549361 non-null  object
 6   sentence2               549361 non-null  object
 7   captionID               549361 non-null  object
 8   pairID                  549361 non-null  object
 9   label1                  549361 non-null  object
 10  label2                  38592 non-null   object
 11  label3                  38621 non-null   object
 12  label4                  38608 non-null   object
 13  label5                  36424 non-null   object
dtypes: object(14)
memory usage: 62.9+ MB

In [14]:
# Print the first few premise / hypothesis / label triples.
# Rows are picked by position with `head`: after the filtering above the index
# labels have gaps, so comparing the label against a count is not a reliable
# way to stop the loop.
N_PREVIEW = 10

for _, elt in train.head(N_PREVIEW).iterrows():
    print('Premise: ', elt['sentence1'])
    print('Hypothesis: ', elt['sentence2'])
    print('Label: ', elt['gold_label'].title())
    print('-' * 80)

Premise:  A person on a horse jumps over a broken down airplane.
Hypothesis:  A person is training his horse for a competition.
Label:  Neutral
--------------------------------------------------------------------------------
Premise:  A person on a horse jumps over a broken down airplane.
Hypothesis:  A person is at a diner, ordering an omelette.
Label:  Contradiction
--------------------------------------------------------------------------------
Premise:  A person on a horse jumps over a broken down airplane.
Hypothesis:  A person is outdoors, on a horse.
Label:  Entailment
--------------------------------------------------------------------------------
Premise:  Children smiling and waving at camera
Hypothesis:  They are smiling at their parents
Label:  Neutral
--------------------------------------------------------------------------------
Premise:  Children smiling and waving at camera
Hypothesis:  There are children present
Label:  Entailment
-------------------------------------

### Tokenization example

In [15]:
# Load the large English spaCy pipeline; `nlp` is used by the cells below
# for tokenization and for looking up word vectors.
nlp = spacy.load('en_core_web_lg')

In [16]:
# Glue the pair stored under index label 0, plus its gold label, into one demo string.
first_premise = train['sentence1'][0]
first_hypothesis = train['sentence2'][0]
first_label = train['gold_label'][0]

example_sentence = f"{first_premise} {first_hypothesis} {first_label}"
print(f"Before tokenization: {example_sentence}")

Before tokenization: A person on a horse jumps over a broken down airplane. A person is training his horse for a competition. neutral


In [17]:
# Run the demo string through spaCy once and keep each token's surface text.
example_doc = nlp(example_sentence)
tokenized_sentence = [tok.text for tok in example_doc]
print(f'Tokenized: {tokenized_sentence}')

Tokenized: ['A', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.', 'A', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition', '.', 'neutral']


In [18]:
# Spacy tokenize
def tokenize(text):
    """Split `text` into a list of token strings using the spaCy pipeline `nlp`.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the text of every token, in document order.
    """
    # A proper comprehension: the previous `[token.text in nlp(text)]` was a
    # one-element list display that referenced an undefined name `token`.
    return [token.text for token in nlp(text)]

### Word vectorization example

In [26]:
# Disable the pipeline components while vectorizing: they are not needed for
# token vectors and skipping them speeds this part up a bit. The component
# names must be passed explicitly, because `disable_pipes()` called with no
# names disables nothing.
with nlp.disable_pipes(*nlp.pipe_names):
    vectors = np.array([token.vector for token in nlp(example_sentence)])

vectors.shape

(23, 300)

In [42]:
def vectorize(text, max_len=100):
    """Turn `text` into a fixed-size, zero-padded matrix of spaCy word vectors.

    Args:
        text: Either a sentence string, which is split with `nlp`'s tokenizer,
            or an iterable of already-split token strings.
        max_len: Number of rows in the output. Longer inputs are truncated,
            shorter ones are padded with zero rows.

    Returns:
        A tuple `(vectorized, vec_len)`: a float32 array of shape
        `(max_len, 300)` and the number of leading rows that hold real vectors.
    """
    embed_dim = 300  # vector width, matching the (23, 300) example above
    # Word vectors are floats; an integer buffer would truncate them toward zero.
    vectorized = np.zeros((max_len, embed_dim), dtype=np.float32)

    if isinstance(text, str):
        # `make_doc` only tokenizes, so no pipeline component runs. Iterating
        # the raw string instead would call the pipeline once per character.
        token_vectors = [token.vector for token in nlp.make_doc(text)]
    else:
        token_vectors = [nlp(word).vector for word in text]

    vec_len = min(max_len, len(token_vectors))
    if vec_len:  # an empty input simply yields an all-zero matrix
        vectorized[:vec_len] = np.asarray(token_vectors[:vec_len], dtype=np.float32)
    return vectorized, vec_len

In [None]:
%%time

# Vectorize both sentences of every pair in each split. `vectorize` returns a
# (matrix, length) tuple, which is stored as-is: wrapping that ragged pair in
# np.array() is deprecated and raises on recent NumPy releases.
# NOTE(review): every matrix is dense (max_len x 300), so doing this eagerly
# for all ~549k training pairs needs a very large amount of RAM - consider
# vectorizing lazily inside the Dataset instead.
for split_frame in (train, valid, test):
    split_frame["premise"] = split_frame["sentence1"].apply(vectorize)
    split_frame["hypothesis"] = split_frame["sentence2"].apply(vectorize)

  """Entry point for launching an IPython kernel.


In [None]:
# Integer class id for each SNLI label string.
label_numbering = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

# Same dictionary lookup for every split; an unknown label raises KeyError.
for labelled_frame in (train, valid, test):
    labelled_frame['gold_label'] = labelled_frame['gold_label'].apply(label_numbering.__getitem__)

## Load data into Datasets and DataLoaders

In [None]:
class SNLI(Dataset):
    """Map-style dataset over parallel premise / hypothesis / label sequences.

    Each entry of `X_1` and `X_2` is indexed with `[0]` to obtain a NumPy
    array (the first element of the stored pair); `y` holds the labels.
    """

    def __init__(self, X_1, X_2, y):
        self.X_1, self.X_2, self.y = X_1, X_2, y

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(premise, hypothesis, label)` for example `idx`."""
        premise_array = self.X_1[idx][0]
        hypothesis_array = self.X_2[idx][0]
        # NOTE(review): the int32 cast truncates any fractional values - confirm
        # that integer inputs are really what the model should receive.
        premise = torch.from_numpy(premise_array.astype(np.int32))
        hypothesis = torch.from_numpy(hypothesis_array.astype(np.int32))
        return premise, hypothesis, self.y[idx]

In [None]:
def _frame_to_dataset(frame):
    """Wrap a split's premise / hypothesis / gold_label columns in an SNLI dataset."""
    premises = list(frame["premise"])
    hypotheses = list(frame["hypothesis"])
    labels = list(frame["gold_label"])
    return SNLI(premises, hypotheses, labels)


train_dataset = _frame_to_dataset(train)
valid_dataset = _frame_to_dataset(valid)
test_dataset = _frame_to_dataset(test)

In [None]:
# Number of sentence pairs per batch, shared by all three loaders.
BATCH_SIZE = 2048

# Only the training data is shuffled. Evaluation metrics do not depend on the
# order of the examples, and a fixed order keeps validation/test runs
# repeatable and leaves the RNG stream to the training shuffle.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Building the model

In [None]:
class Model(nn.Module):
    """Siamese bidirectional-LSTM classifier for premise/hypothesis pairs.

    Both sentences go through the same embedding, projection and LSTM. The
    final hidden states of both LSTM directions are concatenated per sentence,
    the two sentence encodings are concatenated, and a linear layer maps the
    result to 3 class logits.

    Args:
        input_dim: Number of rows in the embedding table; every token id must
            be smaller than this. Id 0 is the padding index.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, 512, padding_idx=0)
        self.linear_1 = nn.Linear(512, 256)
        # batch_first=True: inputs arrive as (batch, seq_len, features).
        self.lstm = nn.LSTM(256, 256, num_layers=1, bidirectional=True, batch_first=True)
        # 2 sentences x 2 directions x 256 hidden units feed the classifier.
        self.linear_2 = nn.Linear(4 * 256, 3)
        self.dropout = nn.Dropout(0.2)
        # Kept for callers that want log-probabilities; forward() returns raw
        # logits, which is what nn.CrossEntropyLoss expects.
        self.out = nn.LogSoftmax(dim=1)

    def _encode(self, token_ids):
        """Encode a (batch, seq_len) tensor of token ids into (batch, 512)."""
        projected = nn.functional.relu(self.linear_1(self.embedding(token_ids)))
        # h_n has shape (num_directions, batch, hidden) even with batch_first.
        _, (h_n, _) = self.lstm(projected)
        return torch.cat((h_n[-1], h_n[-2]), dim=-1)

    def forward(self, x_1, x_2):
        """Return class logits of shape (batch, 3).

        Args:
            x_1: Integer tensor (batch, seq_len) of premise token ids.
            x_2: Integer tensor (batch, seq_len) of hypothesis token ids.
        """
        h_x_1 = self._encode(x_1)
        h_x_2 = self._encode(x_2)

        hidden_layer = torch.cat((h_x_1, h_x_2), dim=1)

        # Dropout is only active in training mode.
        prediction = self.linear_2(self.dropout(hidden_layer))

        return prediction

In [None]:
# Size of the model's embedding table, taken from the size of spaCy's vocabulary.
# NOTE(review): nn.Embedding is indexed by token id, so every id fed to the
# model must be < input_dim - confirm the preprocessing produces such ids.
input_dim = len(nlp.vocab)

In [None]:
def get_default_device():
    """Return the CUDA device when one is available, else the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

In [None]:
# Model, loss and optimizer used by the training cells below.
# NOTE(review): `num_epochs` is not referenced by the visible cells (the
# training loop uses N_EPOCHS) - confirm whether it is still needed.
num_epochs = 3

model = Model(input_dim)
model = model.to(get_default_device())  # nn.Module.to returns the module itself

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

## Defining functions

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss and the accuracy of `model` without updating it.

    Args:
        model: Module called as `model(premise, hypothesis)`, returning one
            row of class scores per example.
        iterator: Iterable of `(premise, hypothesis, labels)` tensor batches.
        criterion: Loss called as `criterion(predictions, labels)`.

    Returns:
        `(mean_batch_loss, accuracy)`, with accuracy a fraction in [0, 1]
        computed over all examples. Both are NaN when `iterator` is empty.
    """
    model.eval()

    # Batches come from the loader on the CPU; move them to wherever the model
    # lives. Parameter-less models are left alone.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0.0
    correct = 0
    n_batches = 0
    n_samples = 0

    with torch.no_grad():
        for premise, hypothesis, labels in iterator:
            if device is not None:
                premise = premise.to(device)
                hypothesis = hypothesis.to(device)
                labels = labels.to(device)

            predictions = model(premise, hypothesis)
            loss = criterion(predictions, labels)

            epoch_loss += loss.item()
            correct += (predictions.argmax(1) == labels).sum().item()
            n_batches += 1
            n_samples += labels.size(0)

    if n_batches == 0:
        return float('nan'), float('nan')

    # Loss is averaged over batches; accuracy over individual examples.
    return epoch_loss / n_batches, correct / n_samples

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over `iterator`.

    NOTE: defining this function rebinds the name `train`, which earlier cells
    use for the training DataFrame; those cells must run before this one.

    Args:
        model: Module called as `model(premise, hypothesis)`, returning one
            row of class scores per example.
        iterator: Iterable of `(premise, hypothesis, labels)` tensor batches.
        optimizer: Optimizer over `model`'s parameters.
        criterion: Loss called as `criterion(predictions, labels)`.

    Returns:
        `(mean_batch_loss, accuracy)`, with accuracy a fraction in [0, 1]
        computed over all examples. Both are NaN when `iterator` is empty.
    """
    model.train()

    # Batches come from the loader on the CPU; move them to wherever the model
    # lives. Parameter-less models are left alone.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0.0
    correct = 0
    n_batches = 0
    n_samples = 0

    for premise, hypothesis, labels in iterator:
        if device is not None:
            premise = premise.to(device)
            hypothesis = hypothesis.to(device)
            labels = labels.to(device)

        predictions = model(premise, hypothesis)
        loss = criterion(predictions, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # argmax gives the predicted class ids; `max(1)` would return a
        # (values, indices) pair that has no `.eq`.
        correct += (predictions.argmax(1) == labels).sum().item()
        epoch_loss += loss.item()
        n_batches += 1
        n_samples += labels.size(0)

    if n_batches == 0:
        return float('nan'), float('nan')

    # Loss is averaged over batches; accuracy over individual examples.
    return epoch_loss / n_batches, correct / n_samples

## Training the model

In [None]:
N_EPOCHS = 15

# Per-epoch histories. The per-epoch scalars below use separate names so that
# these lists are not overwritten (which made the `.append` calls fail).
train_loss = []
valid_loss = []
train_acc = []
valid_acc = []

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    epoch_train_loss, epoch_train_acc = train(model, train_dataloader, optimizer, criterion)
    epoch_valid_loss, epoch_valid_acc = evaluate(model, valid_dataloader, criterion)

    train_loss.append(epoch_train_loss)
    valid_loss.append(epoch_valid_loss)
    train_acc.append(epoch_train_acc)
    valid_acc.append(epoch_valid_acc)

    end_time = time.time()

    # Checkpoint whenever the validation loss improves, so the weights on disk
    # are always those of the best epoch so far.
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'curr-model.pt')

    print(f'Epoch: {epoch + 1:02}')
    print(f'\tTrain Loss: {epoch_train_loss:.3f} | Train Acc: {epoch_train_acc * 100:.3f}%')
    print(f'\t Val. Loss: {epoch_valid_loss:.3f} |  Val. Acc: {epoch_valid_acc * 100:.3f}%')

In [None]:
# Restore the checkpoint written by the training loop, then score the test split.
best_state = torch.load('curr-model.pt')
model.load_state_dict(best_state)

test_metrics = evaluate(model, test_dataloader, criterion)
test_loss, test_acc = test_metrics

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.3f}%')