In [1]:
import os
import sys
import time
import argparse

import numpy as np
import pandas as pd
from random import randint

import torch
from torch.autograd import Variable
import torch.nn as nn

import nltk
nltk.download('punkt')

ModuleNotFoundError: No module named 'nltk'

# 1. Preprocessing
* Load Dataset
* Tokenize


In [5]:
# Root folder of the MultiNLI 1.0 files (on Colab: '/content/drive/MyDrive/multinli_1.0/').
URL = 'multinli_1.0/'

# Only the label and the sentence pair are used in this notebook.
NLI_COLUMNS = ['gold_label', 'sentence1', 'sentence2']


def _read_split(file_name):
    """Read one tab-separated MultiNLI split and keep only NLI_COLUMNS.

    Malformed rows (wrong number of fields) are skipped with a warning.
    `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was
    deprecated in pandas 1.3 and removed in pandas 2.0.
    """
    frame = pd.read_csv(URL + file_name, delimiter="\t", on_bad_lines='warn')
    return frame[NLI_COLUMNS]


train = _read_split('multinli_1.0_train.txt')
dev_matched = _read_split('multinli_1.0_dev_matched.txt')
dev_mismatched = _read_split('multinli_1.0_dev_mismatched.txt')

# Drop the rows with missing values and renumber the index from 0.
# NOTE(review): only `train` is cleaned and label-encoded here; the dev splits keep their raw labels.
train = train.dropna().reset_index(drop=True)

# Convert the categorical label to integer codes (codes follow order of first appearance).
train['gold_label'] = pd.factorize(train['gold_label'])[0]
train

b'Skipping line 24810: expected 15 fields, saw 16\nSkipping line 33961: expected 15 fields, saw 16\n'
b'Skipping line 75911: expected 15 fields, saw 16\nSkipping line 100114: expected 15 fields, saw 16\n'
b'Skipping line 150638: expected 15 fields, saw 16\nSkipping line 158834: expected 15 fields, saw 16\nSkipping line 173104: expected 15 fields, saw 16\nSkipping line 178252: expected 15 fields, saw 16\n'
b'Skipping line 221951: expected 15 fields, saw 16\n'
b'Skipping line 286845: expected 15 fields, saw 16\nSkipping line 314110: expected 15 fields, saw 16\n'


Unnamed: 0,gold_label,sentence1,sentence2
0,0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...
1,1,you know during the season and i guess at at y...,You lose the things to the following level if ...
2,1,One of our number will carry out your instruct...,A member of my team will execute your orders w...
3,1,How do you know? All this is their information...,This information belongs to them.
4,0,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.
...,...,...,...
391120,2,"Clearly, California can - and must - do better.",California cannot do any better.
391121,0,It was once regarded as the most beautiful str...,So many of the original buildings had been rep...
391122,1,Houseboats are a beautifully preserved traditi...,The tradition of houseboats originated while t...
391123,0,Obituaries fondly recalled his on-air debates ...,The obituaries were beautiful and written in k...


# 2. Experiment
## 2.1. Import InferSent
* Import pretrained model
    + version 1 (GloVe) or
    + version 2 (fastText)
* Create iterator

In [7]:
from InferSense_models import InferSent


In [10]:
# Which pretrained InferSent release to load: 1 is trained with GloVe, 2 with fastText.
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % (model_version,)

# Settings handed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=model_version,
)

model_emb = InferSent(params_model)
# Restore the weights stored at MODEL_PATH; the return value of load_state_dict is the cell output.
model_emb.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [11]:
# Set use_cuda to True to move the encoder to the GPU; otherwise it is left where it is.
use_cuda = False
if use_cuda:
    model_emb = model_emb.cuda()

# Word vectors: the GloVe file for model version 1, the fastText file for any other version.
if model_version == 1:
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
model_emb.set_w2v_path(W2V_PATH)

# Build the vocabulary from the embeddings of the K most frequent words.
model_emb.build_vocab_k_words(K=100000)

Vocab size : 100000


In [21]:
# NOTE(review): `embeddings` is assigned by the encode cell that follows this one in the notebook;
# on a fresh kernel this preview only works after that cell has run.
embeddings[0]

array([0.04041788, 0.13023938, 0.05888408, ..., 0.00592342, 0.        ,
       0.01562545], dtype=float32)

In [18]:
# Encode a sample of the training premises: the slice 1:1000 of `sentence1`.
embeddings = model_emb.encode(
    train['sentence1'][1:1000].to_list(),
    bsize=128,
    tokenize=False,
    verbose=True,
)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 19569/22158 (88.3%)
Speed : 57.4 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 999


In [None]:
def _pad_sequences(sequences):
    """Stack sequences into one float32 tensor, right-padding shorter ones with zeros."""
    tensors = [torch.as_tensor(seq, dtype=torch.float32) for seq in sequences]
    return torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)


def iterator_func(X1, X2, y, batch_size, device=None):
    """Split a dataset of numeric sequence pairs into shuffled mini-batches.

    Parameters
    ----------
    X1, X2 : indexable collections of numeric sequences (first / second sentence).
        Both are indexed with the same integer positions ``0 .. len(X1) - 1``.
    y : indexable collection of labels, aligned with ``X1`` and ``X2``.
    batch_size : int
        Maximum number of samples per batch; the last batch may be smaller.
    device : torch.device, optional
        Where to place the tensors. When omitted, a module-level ``device``
        variable is used if one exists, otherwise CUDA when available, else CPU.

    Returns
    -------
    list of dict
        One dict per batch with the keys ``'sentence1'``, ``'sentence2'``,
        ``'label'`` (float32 tensors) and ``'lenght'`` (int32 tensor holding
        ``len(sentence1) + len(sentence2)`` per sample). The key spelling
        ``'lenght'`` is kept as is so existing consumers keep working.
        Samples inside a batch are ordered by decreasing combined length, and
        sequences of different length are zero-padded to the longest one.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if device is None:
        # Fall back to the notebook-level `device`, then to auto-detection.
        device = globals().get('device')
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    size = len(X1)
    # Shuffle the sample order so a single class does not end up alone in one batch.
    permutation = np.random.permutation(size)

    iterator = []
    for start in range(0, size, batch_size):
        indices = permutation[start:start + batch_size]

        # Order the samples of this batch by combined length, longest first.
        samples = sorted(
            ((X1[i], X2[i], y[i]) for i in indices),
            key=lambda sample: len(sample[0]) + len(sample[1]),
            reverse=True,
        )
        sentences1, sentences2, labels = zip(*samples)

        # Iterate over the samples actually present: the last batch can hold
        # fewer than batch_size items.
        lengths = [len(s1) + len(s2) for s1, s2 in zip(sentences1, sentences2)]

        batch = {
            'sentence1': _pad_sequences(sentences1).to(device),
            'sentence2': _pad_sequences(sentences2).to(device),
            'label': torch.Tensor(labels).to(device),
            'lenght': torch.IntTensor(lengths).to(device),
        }
        iterator.append(batch)

    return iterator
        

In [None]:
def iterator_batch_func(X1, X2, y, model_emb, batch_size, device=None):
    """Build one batch of sentence-pair embeddings with the given embedding model.

    Parameters
    ----------
    X1 : iterable of first sentences for the selected batch (pandas Series or list).
    X2 : iterable of second sentences for the selected batch (pandas Series or list).
    y : labels for the selected batch, aligned with ``X1`` and ``X2``.
    model_emb : object exposing ``encode(sentences, bsize=..., tokenize=..., verbose=...)``,
        used to map both sentence lists into the embedding space.
    batch_size : int
        Forwarded to ``model_emb.encode`` as ``bsize``.
    device : torch.device, optional
        Where to place the tensors. When omitted, a module-level ``device``
        variable is used if one exists, otherwise CUDA when available, else CPU.

    Returns
    -------
    dict
        ``'sentence1'`` and ``'sentence2'`` hold the encoded sentences and
        ``'label'`` holds ``y``; all three are float32 tensors on ``device``.
    """
    if device is None:
        # Fall back to the notebook-level `device`, then to auto-detection.
        device = globals().get('device')
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # list() accepts pandas Series as well as plain Python sequences.
    vectors1 = model_emb.encode(list(X1), bsize=batch_size, tokenize=False, verbose=False)
    # The second sentence must be encoded from X2, not from X1 again.
    vectors2 = model_emb.encode(list(X2), bsize=batch_size, tokenize=False, verbose=False)

    batch = {
        'sentence1': torch.tensor(vectors1, dtype=torch.float32).to(device),
        'sentence2': torch.tensor(vectors2, dtype=torch.float32).to(device),
        # np.asarray drops any pandas index so labels are taken in positional order.
        'label': torch.tensor(np.asarray(y), dtype=torch.float32).to(device),
    }
    return batch
        

## 2.2. Model Definition
* define model
* create instance of the model

In [10]:
# NOTE(review): the factors equal enc_lstm_dim (2048), bsize (64) and word_emb_dim (300) from params_model;
# presumably a rough size estimate expressed in billions — TODO confirm what is being estimated.
2048*64*300/10**9

0.0393216

In [None]:
# Define the NN


In [None]:
# Create instance
# TODO: fill in the model dimensions. `...` (Ellipsis) is a placeholder that keeps
# this cell valid Python until the real values are known.
input_DIM = ...
embedding_dim = ...  # presumably the size of one InferSent sentence embedding — TODO confirm
Hidden_dim = ...
Output_dim = ...  # presumably the number of distinct gold_label classes — TODO confirm


## 2.3. Train the model
* training Parameters
    + optimizer: SGD
    + criterion: Cross Entropy Loss
    + Device: cuda or cpu

In [None]:
# Number of sentence pairs per mini-batch.
batch_size = 128
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Plain SGD over the parameters of `model` (the network defined in section 2.2).
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Cross-entropy loss, as listed in the training parameters above.
# NOTE(review): with class-index targets the label tensor must be integer typed
# (e.g. `batch['label'].long()`) before it is passed to this criterion.
criterion = nn.CrossEntropyLoss()

In [26]:
# Training rows divided by 128 — presumably the number of mini-batches per epoch
# for a batch size of 128 (a fractional result means the last batch is partial).
len(train)/128

3055.6640625