In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import pickle
import random
import os

torch.manual_seed(10)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [33]:
class BiLSTM(nn.Module): 
    # This NLP part Will consist of two bidirectional lstm layers and it's output is 
    # determined by the LSTM's last hidden states or output vectors.

    # This will take as an input a sequence of words and output the last hidden layer
    # the last hidden states of 2-layer bidirectional LSTM will be the input of the last multimodel network 

    def __init__(self, embedding_dim, hidden_dim = 256, layer_dim =2, output_dim = 10):
        super(BiLSTM, self).__init__()
        
        self.embedding_dim = embedding_dim
        
        #Hidden dimensions
        self.hidden_dim = hidden_dim # maybe set this to 256

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building the LSTM 
        # batch_first = True causes the input/output to be of shape 3D (batch_dim, seq_dim, feature_dim) 
        # output will be the same dim as the hidden dim
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.Sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        
        # Initialize hidden state with zeros
        # self.layer_dim * 2. because we have one going forwards and another going backwards
        h0 = torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim)
        
        
        
        # Initialize cell state
        c0 =  torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim)
        
        # We suppose we are conducting a 28 time steps In case of using 
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm1(x, (h0.detach(), c0.detach()))

        # out = self.fc(out.view(out.size(0), -1))
          
        # Without the activation function, out will contain the last hidden layer.
        # This could be obtianed from hn[-1] as well.
        out = out[:, -1, :]
        
        out = self.fc(out)
        
        out = self.Sigmoid(out)
        
        return out
        
        # Index hidden state of last time step
        # out.size() --> 256, 100, 256 if we have (input dim = 100 and hidden dim = 100)
        # out[:, -1, :] => 256, 256 --> because we just want the last time step hidden states
        #out = out[:, -1, :] # without an activation function

        # now our: out.size() --> 256, 10 (if output dimension is equal to 10)
        #return out

In [44]:
# 1041 embedding size
model = BiLSTM(1041)
#model = BiLSTM(17)


#loss_function = nn.NLLLoss()
loss_function = nn.CrossEntropyLoss()
#loss_function = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [4]:
# A common headache in this competition is the lack of determinism in the results due to cudnn, the following solves the issue
def seed_everything(seed=10):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Here i have to get the ELMO vector in a form, in a format of numpy array.
with open('document_vector.pickle', 'rb') as file:
    vectors = pickle.load(file)
    file.close()

vectors = np.delete(vectors, 1, 1)
vectors = np.delete(vectors, 0, 1)

vectors = torch.from_numpy(np.array(vectors, dtype=np.float64)).view(521, 1, -1).float()
vectors.size()

In [4]:
# Reading the features NOTE: Do not run this it is already saved under vectors.pickle
i = 0
vectors = np.array([])
labels = np.array([])
for file in os.scandir('../../feature_extraction/features/'):
    if file.name.endswith('csv'):
        vector = pd.read_csv(file)
        if (vector.shape[1] == 1044 or vector.shape[1] == 1047):
            # Some files have issues
            continue
        elif (vector.shape[1] == 1046):
            # Some files have an unnecessary column
            vector.drop([vector.columns[0]], axis=1, inplace=True)
        if i == 0:
            vectors = vector.to_numpy()
        else: 
            vectors = np.concatenate((vectors, vector.to_numpy()))
        i+=1
        print(i, end="\r")

166

In [5]:
with open("./vectors.pickle", "wb") as f:
    pickle.dump(vectors, f)

In [6]:
with open("./vectors.pickle", 'rb') as f:
    vectors = pickle.load(f)
    
labels = np.delete(vectors[:, [2, -1]], 29089, 0)
vectors = np.delete(vectors[:, 3:-1], 29089, 0)
print(vectors)

[[0.25 1 0 ... -0.14363256096839905 0.20427550375461576
  0.1786535382270813]
 [0.09090909090909093 1 0 ... 0.06341240555047989 -0.1516282707452774
  0.11076557636260986]
 [0.1 1 0 ... 0.1343698501586914 0.3798027038574219 0.04407154396176338]
 ...
 [0.14285714285714285 1 0 ... 0.42102766036987305 0.7613739967346191
  0.15271705389022827]
 [0.0 0 0 ... 0.3141244351863861 0.14763681590557098 -0.17566196620464325]
 [0.09090909090909093 1 0 ... -0.07197027653455734 0.3706480264663696
  0.028581261634826664]]


In [7]:
vectors.shape

(56750, 1041)

In [9]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(vectors, labels[:, 1], test_size=0.2, random_state=42,stratify=labels[:, 1])

In [10]:
x_train.shape

(45400, 1041)

In [11]:
x_test.shape

(11350, 1041)

In [13]:
y_train

array([10, 10, 10, ..., 10, 7, 10], dtype=object)

In [36]:
labels = torch.from_numpy(np.array(y_train, dtype=np.int64)).to(device)
labels = labels -1
print(labels.size())
v = torch.from_numpy(np.array(x_train, dtype=np.float64)).view(-1, 1, 1041).float().to(device)
print(v.size())

torch.Size([45400])
torch.Size([45400, 1, 1041])


In [37]:
# Normalize the vectors
train_mean = v.mean(dim=0, keepdim=True)
train_std = v.std(dim=0, keepdim=True)
normalized_v = (v - train_mean)/train_std

In [38]:
print(normalized_v.size())

torch.Size([45400, 1, 1041])


In [39]:
v

tensor([[[ 0.1429,  1.0000,  0.0000,  ..., -0.0888,  0.5042,  0.3134]],

        [[ 0.2000,  1.0000,  0.0000,  ...,  0.3553,  0.9244, -0.3827]],

        [[ 0.0000,  0.0000,  0.0000,  ..., -0.1613, -0.3639,  0.2646]],

        ...,

        [[ 0.0909,  1.0000,  0.0000,  ...,  0.0367,  0.3092,  0.2239]],

        [[ 0.1429,  1.0000,  0.0000,  ...,  0.2157,  0.9857, -0.3155]],

        [[ 0.0000,  0.0000,  0.0000,  ..., -0.2064,  0.4096, -0.3888]]])

In [40]:
normalized_v[torch.isnan(normalized_v)] = 0
normalized_v

tensor([[[ 0.7844,  1.5200,  0.0000,  ..., -0.1561,  1.1776,  1.5523]],

        [[ 1.2905,  1.5200,  0.0000,  ...,  1.5181,  2.5751, -0.4552]],

        [[-0.4810, -0.6579,  0.0000,  ..., -0.4292, -1.7097,  1.4115]],

        ...,

        [[ 0.3242,  1.5200,  0.0000,  ...,  0.3172,  0.5291,  1.2940]],

        [[ 0.7844,  1.5200,  0.0000,  ...,  0.9916,  2.7792, -0.2616]],

        [[-0.4810, -0.6579,  0.0000,  ..., -0.5993,  0.8631, -0.4729]]])

In [41]:
print(train_mean)
print(train_std)

tensor([[[ 0.0543,  0.3021,  0.0000,  ..., -0.0474,  0.1501, -0.2248]]])
tensor([[[0.1129, 0.4592, 0.0000,  ..., 0.2653, 0.3007, 0.3467]]])


In [42]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __getitem__(self,index):
        data,target = self.dataset[index]
        return data,target,index
    def __len__(self):
        return len(self.dataset)

In [None]:
# Training the model
batch_loss = 0
batch_size = 1041
epochs = 100

train = MyDataset(torch.utils.data.TensorDataset(normalized_v, labels))
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle=True)

avg_loss = 0

for epoch in range(epochs):
    
    running_loss = 0.0
    model.train()
    # Clear for the gradients.
    model.zero_grad()
    
    for i, (x_batch, y_batch, index) in enumerate(train_loader):
        label_scores = model(x_batch)
        # Calculate loss, backpropagate, and update weights/parameters by calling opt.step()
        loss = loss_function(label_scores, y_batch)
        # TODO: check this line
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        avg_loss += loss.item()/len(train_loader)
            
        print("Epoch: {0}/300. Loss: {2:.2f} Progress: {1}%".format(epoch, int((epoch * 100)/300), avg_loss / (epoch+1)) , end="\r")
        
print("Bi-LSTM model training is done!                           ", end='\r')
print("final labels {0}".format(label_scores))

Epoch: 8/300. Loss: 1.55 Progress: 2%

In [49]:
label_scores.size()

torch.Size([45438, 7])

In [50]:
y = []

for p in label_scores:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i)

sigmoid = nn.Sigmoid()
testing = sigmoid(label_scores)

In [15]:
def accuracy(labels, predicted):
    total_accuracy = 0
    n = 0
    for y, p in zip(labels, predicted):
        if (y == p):
            total_accuracy +=1
        n+=1
    return total_accuracy/n

In [52]:
accuracy(labels, y)

0.8399357366081254

Model v1 = Feature vector all

Model v2 = Feature vector only layout

Model v3 = Feature vector all + sigmoid on training

Model v4 = Feature vector layout + sigmoid on training

In [53]:
torch.save(model, "modelv4")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [25]:
model = torch.load("modelv1")
model.eval()

BiLSTM(
  (lstm1): LSTM(1041, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=7, bias=True)
  (sigmoid): Sigmoid()
)

In [26]:
words_labels = model(v) 

In [19]:
words_labels

tensor([[0.3093, 0.7172, 0.1550,  ..., 0.1614, 0.7669, 0.9488],
        [0.3154, 0.7545, 0.1534,  ..., 0.1635, 0.7730, 0.9277],
        [0.3152, 0.7551, 0.1532,  ..., 0.1636, 0.7734, 0.9275],
        ...,
        [0.3124, 0.7509, 0.1514,  ..., 0.1623, 0.7783, 0.9298],
        [0.3122, 0.7508, 0.1512,  ..., 0.1624, 0.7785, 0.9299],
        [0.3147, 0.7247, 0.1603,  ..., 0.1658, 0.7601, 0.9383]],
       grad_fn=<SigmoidBackward>)

In [20]:
words_labels.size()

torch.Size([45438, 7])

In [58]:
y = []

for p in words_labels:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i)

In [60]:
accuracy(labels, y)

0.8399357366081254

In [65]:
def precision_of(labels, y, of):
    n = 0
    precision = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (p == of):
            n+=1
            
    precision = 0 if n==0 else c / n
    return precision

In [71]:
p0 = precision_of(labels, y, 0)
p1 = precision_of(labels, y, 1)
p2 = precision_of(labels, y, 2)
p3 = precision_of(labels, y, 3)
p4 = precision_of(labels, y, 4)
p5 = precision_of(labels, y, 5)
p6 = precision_of(labels, y, 6)

precision = (p0+p1+p2+p3+p4+p5+p6) / 7

In [73]:
precision

0.11999081951544648

In [78]:
def recall_of(labels, y, of):
    n = 0
    recall = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (label == of):
            n+=1
            
    recall = 0 if n==0 else c / n
    return recall

In [80]:
p0 = recall_of(labels, y, 0)
p1 = recall_of(labels, y, 1)
p2 = recall_of(labels, y, 2)
p3 = recall_of(labels, y, 3)
p4 = recall_of(labels, y, 4)
p5 = recall_of(labels, y, 5)
p6 = recall_of(labels, y, 6)

recall = (p0+p1+p2+p3+p4+p5+p6) / 7

In [81]:
recall

0.14285714285714285