In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import pickle
import random
import os

torch.manual_seed(10)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
class BiLSTM(nn.Module): 
    # This NLP part Will consist of two bidirectional lstm layers and it's output is 
    # determined by the LSTM's last hidden states or output vectors.

    # This will take as an input a sequence of words and output the last hidden layer
    # the last hidden states of 2-layer bidirectional LSTM will be the input of the last multimodel network 

    def __init__(self, embedding_dim, hidden_dim = 256, layer_dim =2, output_dim = 10):
        super(BiLSTM, self).__init__()
        
        
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        self.embedding_dim = embedding_dim
        
        #Hidden dimensions
        self.hidden_dim = hidden_dim # maybe set this to 256

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building the LSTM 
        # batch_first = True causes the input/output to be of shape 3D (batch_dim, seq_dim, feature_dim) 
        # output will be the same dim as the hidden dim
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.Sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        
        # Initialize hidden state with zeros
        # self.layer_dim * 2. because we have one going forwards and another going backwards
        h0 = torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim, device=self.device)
        
        
        
        # Initialize cell state
        c0 =  torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim, device=self.device)
        
        # We suppose we are conducting a 28 time steps In case of using 
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm1(x, (h0.detach(), c0.detach()))

        # out = self.fc(out.view(out.size(0), -1))
          
        # Without the activation function, out will contain the last hidden layer.
        # This could be obtianed from hn[-1] as well.
        out = out[:, -1, :]
        
        out = self.fc(out)
        
        out = self.Sigmoid(out)
        
        return out
        
        # Index hidden state of last time step
        # out.size() --> 256, 100, 256 if we have (input dim = 100 and hidden dim = 100)
        # out[:, -1, :] => 256, 256 --> because we just want the last time step hidden states
        #out = out[:, -1, :] # without an activation function

        # now our: out.size() --> 256, 10 (if output dimension is equal to 10)
        #return out

In [156]:
# 1041 embedding size
model = BiLSTM(1041)
model.to(device)
#model = BiLSTM(17)


# Class weights (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
weights = torch.tensor([0.69,2.95,0.08,0.02,0.77,0.017,18.59,0 ,0.007,76.83], dtype=torch.float32)
weights = weights / weights.sum()
weights = 1.0 / weights
weights = weights / weights.sum()


#loss_function = nn.NLLLoss()
loss_function = nn.CrossEntropyLoss(weight=weights.to(device))
#loss_function = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [77]:
# A common headache in this competition is the lack of determinism in the results due to cudnn, the following solves the issue
def seed_everything(seed=10):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Here i have to get the ELMO vector in a form, in a format of numpy array.
with open('document_vector.pickle', 'rb') as file:
    vectors = pickle.load(file)
    file.close()

vectors = np.delete(vectors, 1, 1)
vectors = np.delete(vectors, 0, 1)

vectors = torch.from_numpy(np.array(vectors, dtype=np.float64)).view(521, 1, -1).float()
vectors.size()

In [5]:
# Reading the features NOTE: Do not run this it is already saved under vectors.pickle
i = 0
vectors = np.array([])
labels = np.array([])
for file in os.scandir('../../feature_extraction/features/'):
    if file.name.endswith('csv'):
        vector = pd.read_csv(file)
        if (vector.shape[1] == 1044 or vector.shape[1] == 1047):
            # Some files have issues
            continue
        elif (vector.shape[1] == 1046):
            # Some files have an unnecessary column
            vector.drop([vector.columns[0]], axis=1, inplace=True)
        if i == 0:
            vectors = vector.to_numpy()
        else: 
            vectors = np.concatenate((vectors, vector.to_numpy()))
        i+=1
        print(i, end="\r")

166

In [6]:
with open("./vectors.pickle", "wb") as f:
    pickle.dump(vectors, f)

In [141]:
with open("./vectors.pickle", 'rb') as f:
    vectors = pickle.load(f)


labels = vectors[:, [2, -1]]
removed_index = np.where(labels[:, 1] == 77)
labels = np.delete(labels, removed_index, 0)
vectors = np.delete(vectors[:, 3:-1], removed_index, 0)

print(vectors)

[[0.1428571428571428 1 0 ... 0.0158199667930603 -0.2104856222867965
  0.1239955797791481]
 [0.0833333333333333 1 0 ... 0.2286127358675003 0.324407696723938
  0.3137013614177704]
 [0.0 0 0 ... -0.2346080988645553 -0.1007604598999023 -0.6514030694961548]
 ...
 [0.0 0 0 ... 0.1466432362794876 0.0629211440682411 -0.1241814717650413]
 [0.125 1 0 ... -0.1323692202568054 0.7320983409881592
  -0.0597063340246677]
 [0.0 0 0 ... -0.0877691507339477 0.035791415721178 0.2522882223129272]]


In [142]:
unique, counts = np.unique(labels[:, 1], return_counts=True)
dict(zip(unique, counts))

{1: 396, 2: 1677, 3: 49, 4: 16, 5: 442, 6: 10, 7: 10551, 9: 4, 10: 43605}

In [143]:
print(vectors.shape)
print(labels.shape)

(56750, 1041)
(56750, 2)


In [144]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(vectors, labels[:, 1], test_size=0.2, random_state=42,stratify=labels[:, 1])

In [145]:
x_train.shape

(45400, 1041)

In [146]:
x_test.shape

(11350, 1041)

In [147]:
y_train.shape

(45400,)

In [148]:
labels = torch.from_numpy(np.array(y_train, dtype=np.int64)).to(device)
labels = labels -1
print(labels.size())
v = torch.from_numpy(np.array(x_train, dtype=np.float64)).view(-1, 1, 1041).float().to(device)
print(v.size())

torch.Size([45400])
torch.Size([45400, 1, 1041])


In [149]:
# Normalize the vectors
train_mean = v.mean(dim=0, keepdim=True)
train_std = v.std(dim=0, keepdim=True)
normalized_v = (v - train_mean)/train_std

In [150]:
print(normalized_v.size())

torch.Size([45400, 1, 1041])


In [151]:
v

tensor([[[ 0.0000,  0.0000,  0.0000,  ..., -0.2070,  0.4090, -0.3843]],

        [[ 0.0000,  0.0000,  0.0000,  ..., -0.3165,  0.2743, -0.3129]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.2249, -0.2426,  0.0372]],

        ...,

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.2406,  0.4763, -0.3737]],

        [[ 0.1111,  1.0000,  0.0000,  ..., -0.3673,  0.2978,  0.4063]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.1202, -0.2013, -0.4923]]],
       device='cuda:0')

In [152]:
normalized_v[torch.isnan(normalized_v)] = 0
normalized_v

tensor([[[-0.4821, -0.6600,  0.0000,  ..., -0.5969,  0.8625, -0.4642]],

        [[-0.4821, -0.6600,  0.0000,  ..., -1.0098,  0.4134, -0.2578]],

        [[-0.4821, -0.6600,  0.0000,  ...,  1.0317, -1.3095,  0.7541]],

        ...,

        [[-0.4821, -0.6600,  0.0000,  ...,  1.0907,  1.0867, -0.4334]],

        [[ 0.4985,  1.5152,  0.0000,  ..., -1.2013,  0.4918,  1.8211]],

        [[-0.4821, -0.6600,  0.0000,  ...,  0.6366, -1.1721, -0.7764]]],
       device='cuda:0')

In [153]:
print(train_mean)
print(train_std)

tensor([[[ 0.0546,  0.3034,  0.0000,  ..., -0.0487,  0.1503, -0.2237]]],
       device='cuda:0')
tensor([[[0.1133, 0.4597, 0.0000,  ..., 0.2652, 0.3000, 0.3460]]],
       device='cuda:0')


In [154]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __getitem__(self,index):
        data,target = self.dataset[index]
        return data,target,index
    def __len__(self):
        return len(self.dataset)

In [157]:
# Training the model
batch_loss = 0
batch_size = 1041
epochs = 300

train = MyDataset(torch.utils.data.TensorDataset(normalized_v, labels))
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle=True)

avg_loss = 0

for epoch in range(epochs):
    
    running_loss = 0.0
    model.train()
    # Clear for the gradients.
    model.zero_grad()
    
    for i, (x_batch, y_batch, index) in enumerate(train_loader):
        label_scores = model(x_batch)
        # Calculate loss, backpropagate, and update weights/parameters by calling opt.step()
        loss = loss_function(label_scores, y_batch)
        # TODO: check this line
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        avg_loss += loss.item()/len(train_loader)
            
        print("Epoch: {0}/{3}. Loss: {2:.2f} Progress: {1}%".format(epoch, int((epoch * 100)/300), avg_loss / (epoch+1), epochs) , end="\r")
        
print("Bi-LSTM model training is done!                           ", end='\r')
print("final labels {0}".format(label_scores))

final labels tensor([[0.5270, 0.5245, 0.4911,  ..., 0.4545, 0.4762, 0.4647],
        [0.4740, 0.4916, 0.4676,  ..., 0.5395, 0.5126, 0.5546],
        [0.5182, 0.4850, 0.5051,  ..., 0.4847, 0.4712, 0.4703],
        ...,
        [0.4596, 0.4886, 0.4478,  ..., 0.4557, 0.4793, 0.4751],
        [0.5041, 0.5313, 0.4236,  ..., 0.5362, 0.5381, 0.5311],
        [0.4775, 0.5152, 0.4824,  ..., 0.4538, 0.4766, 0.5107]],
       device='cuda:0', grad_fn=<SigmoidBackward>)


In [161]:
label_scores.size()

torch.Size([637, 10])

In [162]:
torch.save(model, "NLPModelV2")

sigmoid = nn.Sigmoid()
testing = sigmoid(label_scores)

In [163]:
def accuracy(labels, predicted):
    total_accuracy = 0
    n = len(predicted)
    for y, p in zip(labels, predicted):
        if (y == p):
            total_accuracy +=1
    return total_accuracy/n

Model v1 = Feature vector standarized and without class weights

Model v2 = Feature vector standarized and with class weights

In [164]:
model.eval()

BiLSTM(
  (lstm1): LSTM(1041, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=10, bias=True)
  (Sigmoid): Sigmoid()
)

In [165]:
v_test = torch.from_numpy(np.array(x_test, dtype=np.float64)).view(-1, 1, 1041).float().to(device)

words_labels = model(v_test) 

In [168]:
words_labels

tensor([[0.4900, 0.4985, 0.5334,  ..., 0.5728, 0.4702, 0.5405],
        [0.5072, 0.4691, 0.5011,  ..., 0.5110, 0.5229, 0.5445],
        [0.4387, 0.5019, 0.4730,  ..., 0.4885, 0.4629, 0.5447],
        ...,
        [0.5095, 0.4933, 0.4889,  ..., 0.5466, 0.5255, 0.4523],
        [0.5278, 0.4745, 0.4873,  ..., 0.5408, 0.5146, 0.5460],
        [0.5235, 0.4721, 0.5073,  ..., 0.5313, 0.4698, 0.5294]],
       device='cuda:0', grad_fn=<SigmoidBackward>)

In [125]:
words_labels.size()

torch.Size([11350, 10])

In [169]:
y = []

for p in words_labels:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i+1)

In [170]:
set(y)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [183]:
print(np.array(y))
print(y_test)

[ 8 10 10 ...  8 10  8]
[10 7 7 ... 10 7 10]


In [172]:
set(y_test)

{1, 2, 3, 4, 5, 6, 7, 9, 10}

y = []

for p in words_labels:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i)

In [173]:
accuracy(y_test, y)

0.12140969162995595

In [174]:
def precision_of(labels, y, of):
    n = 0
    precision = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (p == of):
            n+=1
            
    precision = 0 if n==0 else c / n
    return precision

In [175]:
p0 = precision_of(y_test, y, 0)
p1 = precision_of(y_test, y, 1)
p2 = precision_of(y_test, y, 2)
p3 = precision_of(y_test, y, 3)
p4 = precision_of(y_test, y, 4)
p5 = precision_of(y_test, y, 5)
p6 = precision_of(y_test, y, 6)
p7 = precision_of(y_test, y, 7)
p8 = precision_of(y_test, y, 8)
p9 = precision_of(y_test, y, 9)
p10 = precision_of(y_test, y, 10)

precision = (p0+p1+p2+p3+p4+p5+p6+p8+p9+p7+p10) / 10

In [176]:
print(precision)

0.09914015347161295


In [177]:
def recall_of(labels, y, of):
    n = 0
    recall = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (label == of):
            n+=1
            
    recall = 0 if n==0 else c / n
    return recall

In [178]:
p0 = recall_of(y_test, y, 0)
p1 = recall_of(y_test, y, 1)
p2 = recall_of(y_test, y, 2)
p3 = recall_of(y_test, y, 3)
p4 = recall_of(y_test, y, 4)
p5 = recall_of(y_test, y, 5)
p6 = recall_of(y_test, y, 6)
p7 = recall_of(y_test, y, 7)
p8 = recall_of(y_test, y, 8)
p9 = recall_of(y_test, y, 9)
p10 = recall_of(y_test, y, 10)

recall = (p0+p1+p2+p3+p4+p5+p6+p8+p9+p7+p10) / 10

In [179]:
recall

0.05875414636642144