In [None]:
import pandas as pd
import numpy as np

### Cleaning and exploring the [SNLI data](https://nlp.stanford.edu/projects/snli/)

In [None]:
# JSON Lines file that the loading cell reads with pandas.
INPUT_FILE = 'snli_1.0_train.jsonl'

# Columns retained after loading: the label and the two sentences.
COLUMNS = [
    'gold_label',
    'sentence1',
    'sentence2',
]

In [None]:
# Read the JSON Lines file (one record per line) and keep only the
# columns listed in COLUMNS.
df = pd.read_json(INPUT_FILE, lines=True)[COLUMNS]

In [None]:
# Per-label count of non-null values in each remaining column
# (shown before any rows are filtered out).
df.groupby('gold_label').count()

In [None]:
# Drop rows whose label is the '-' placeholder
# (presumably examples without an agreed gold label — confirm against the SNLI docs).
df = df.loc[df['gold_label'].ne('-')]

In [None]:
# Per-label count of non-null values in each remaining column,
# re-displayed after the '-' rows were removed.
df.groupby('gold_label').count()

### Processing pre-trained embeddings: [GloVe](https://nlp.stanford.edu/projects/glove/)

In [None]:
def process_embeddings(path):
    """Parse a GloVe-style text embedding file.

    Every non-blank line is expected to contain a token followed by its
    vector components, separated by ASCII whitespace.

    Parameters
    ----------
    path : str or path-like
        Location of the embeddings text file.

    Returns
    -------
    vocab : list of str
        Tokens in the order they were accepted from the file.
    lookup : dict
        Maps a token to its position in ``vocab`` / ``vectors``. When a token
        appears more than once, the last occurrence wins.
    vectors : list of numpy.ndarray
        One ``float32`` array per entry of ``vocab``.

    Notes
    -----
    Parsing is best-effort: a line that cannot be decoded or converted to
    numbers is reported with ``print`` and skipped without consuming an index.
    Blank lines are skipped silently.
    """
    vocab = []
    lookup = {}
    vectors = []
    with open(path, 'rb') as f:
        for line_no, raw in enumerate(f, start=1):
            # Split the raw bytes: bytes.split() only breaks on ASCII
            # whitespace, whereas str.split() would also break on Unicode
            # whitespace (e.g. U+00A0) that can occur inside a token.
            fields = raw.split()
            if not fields:
                continue  # blank line, nothing to parse
            try:
                word = fields[0].decode('utf-8')
                if len(fields) < 2:
                    raise ValueError('no vector components')
                components = [tok.decode('ascii') for tok in fields[1:]]
                vect = np.array(components).astype(np.float32)
            except ValueError as e:
                # UnicodeDecodeError is a ValueError subclass, so undecodable
                # bytes and non-numeric components both land here.
                print('Skipping line {}: {}'.format(line_no, e))
                continue
            # The next free position is the current length of the lists.
            lookup[word] = len(vocab)
            vocab.append(word)
            vectors.append(vect)
    return vocab, lookup, vectors

In [None]:
def build_embedding_matrix(vocab, lookup, vectors):
    """Stack the pre-trained vectors into one weight matrix with two extra rows.

    Row layout of the result:

    * ``lookup[word]`` for each word in ``vocab`` -- that word's vector,
    * ``len(vocab)``      -- "unknown token" row, drawn from a normal
      distribution with standard deviation 0.6 (uses the global NumPy RNG),
    * ``len(vocab) + 1``  -- padding row, all zeros.

    Parameters
    ----------
    vocab : sequence of str
    lookup : dict mapping each word in ``vocab`` to a row index
    vectors : sequence of 1-D arrays, indexed by the same row index

    Returns
    -------
    numpy.ndarray of shape ``(len(vocab) + 2, len(vectors[0]))``.
    The shape is also printed.
    """
    n_known = len(vocab)
    dim = len(vectors[0])
    unk_row, pad_row = n_known, n_known + 1

    matrix = np.zeros((n_known + 2, dim))
    for token in vocab:
        row = lookup[token]
        matrix[row] = vectors[row]

    matrix[unk_row] = np.random.normal(scale=0.6, size=(dim,))
    matrix[pad_row] = np.zeros((dim,))

    print(matrix.shape)
    return matrix

In [None]:
# File name of the pre-trained GloVe vectors passed to process_embeddings
# (the name suggests the 6B-token, 50-dimensional release — confirm).
GLOVE_PATH = 'glove.6B.50d.txt'

In [None]:
# Parse the embeddings file into a token list, a token -> index dict,
# and the list of vectors.
vocab, lookup, vectors = process_embeddings(GLOVE_PATH)

In [None]:
# Weight matrix: one row per vocabulary token plus two extra rows
# (unknown token at len(vocab), padding at len(vocab) + 1).
wm = build_embedding_matrix(vocab, lookup, vectors)

### Building a model with pre-trained embeddings

In [None]:
import torch.nn as nn
import torch

In [None]:
class model(nn.Module):
    """Embedding lookup layer initialised from a pre-trained weight matrix.

    The embedding weights are copied from ``embedding_weigths_matrix`` and
    frozen, so they receive no gradient updates.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    embedding_weigths_matrix : array-like of shape (vocab_size, embedding_dim)
        Pre-trained weights; a shape mismatch makes ``load_state_dict`` raise.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_weigths_matrix):
        super(model, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # (vocab_size, embedding_dim)
        # Convert straight to the parameter's own dtype instead of going
        # through an intermediate float64 tensor.
        pretrained = torch.as_tensor(
            embedding_weigths_matrix, dtype=self.embedding.weight.dtype
        )
        self.embedding.load_state_dict({'weight': pretrained})
        # Freeze the weights. The flag has to be set on the Parameter:
        # assigning `requires_grad` on the Module itself only creates an
        # unrelated attribute and leaves the weights trainable.
        self.embedding.weight.requires_grad_(False)

    def forward(self, inputs):
        """Return the embedding vectors for an integer index tensor."""
        return self.embedding(inputs)

# Smoke test: push a small batch of hand-picked indices through a freshly
# built model and show the tensor shapes before and after the lookup.
_in = torch.tensor([[1, 2, 3], [0, 4, 3]])
print('Inputs shape : ', _in.shape)
_out = model(
    vocab_size=wm.shape[0],
    embedding_dim=wm.shape[1],
    embedding_weigths_matrix=wm,
)(_in)
print('Outputs shape : ', _out.shape)

### Stitching it together

In [None]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Tokens that tokenize() discards: every single punctuation character
# plus NLTK's English stop-word list.
puncts = set(string.punctuation)
stop_words = set(stopwords.words('english')) | puncts

def tokenize(sentence, sequence_length):
    """Convert a sentence into exactly ``sequence_length`` embedding indices.

    The sentence is lower-cased and split with ``word_tokenize``; tokens found
    in the module-level ``stop_words`` set are dropped. Each remaining token is
    mapped through the module-level ``lookup`` dict, falling back to the
    unknown index ``len(vocab)``. The result is right-padded with the padding
    index ``len(vocab) + 1`` and truncated to ``sequence_length``.

    Parameters
    ----------
    sentence : str
        Raw input text.
    sequence_length : int
        Length of the returned list.

    Returns
    -------
    list of int
    """
    unknown_index = len(vocab)
    pad_token = unknown_index + 1
    # `lookup` is a dict keyed by the vocabulary words, so `.get` is a
    # constant-time replacement for scanning the `vocab` list per token.
    tokens = [
        lookup.get(_tok, unknown_index)
        for _tok in word_tokenize(sentence.lower())
        if _tok not in stop_words
    ]
    # A non-positive multiplier yields [], so long inputs get no padding.
    tokens += [pad_token] * (sequence_length - len(tokens))
    return tokens[:sequence_length]

In [None]:
# Show the first remaining premise. `.iloc[0]` is positional, so it keeps
# working even if the row labelled 0 was removed by the earlier filtering
# (the index is never reset after that filter).
df['sentence1'].iloc[0]

In [None]:
# Tokenize the first remaining premise. `.iloc[0]` selects by position, so
# this does not depend on index label 0 having survived the filtering.
tokens = tokenize(df['sentence1'].iloc[0], sequence_length=15)
print(tokens)

In [None]:
# End-to-end check: wrap the token indices as a batch of one sequence,
# run them through a freshly built model, and show both shapes.
_in = torch.tensor([tokens])
print('Inputs shape : ', _in.shape)
_out = model(
    vocab_size=wm.shape[0],
    embedding_dim=wm.shape[1],
    embedding_weigths_matrix=wm,
)(_in)
print('Outputs shape : ', _out.shape)