In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import pickle
import random
import os

torch.manual_seed(10)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
class BiLSTM(nn.Module): 
    # This NLP part Will consist of two bidirectional lstm layers and it's output is 
    # determined by the LSTM's last hidden states or output vectors.

    # This will take as an input a sequence of words and output the last hidden layer
    # the last hidden states of 2-layer bidirectional LSTM will be the input of the last multimodel network 

    def __init__(self, embedding_dim, hidden_dim = 256, layer_dim =2, output_dim = 10):
        super(BiLSTM, self).__init__()
        
        
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        self.embedding_dim = embedding_dim
        
        #Hidden dimensions
        self.hidden_dim = hidden_dim # maybe set this to 256

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building the LSTM 
        # batch_first = True causes the input/output to be of shape 3D (batch_dim, seq_dim, feature_dim) 
        # output will be the same dim as the hidden dim
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.Sigmoid = nn.ReLU()
        
    def forward(self, x):
        
        # Initialize hidden state with zeros
        # self.layer_dim * 2. because we have one going forwards and another going backwards
        h0 = torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim, device=self.device)
        
        
        
        # Initialize cell state
        c0 =  torch.randn(self.layer_dim * 2, x.size(0), self.hidden_dim, device=self.device)
        
        # We suppose we are conducting a 28 time steps In case of using 
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm1(x, (h0.detach(), c0.detach()))
        # out = self.fc(out.view(out.size(0), -1))
          
        # Without the activation function, out will contain the last hidden layer.
        # This could be obtianed from hn[-1] as well.
        out = out[:, -1, :]
        
        out = self.fc(out)
        
        out = self.Sigmoid(out)
        
        return out
        
        # Index hidden state of last time step
        # out.size() --> 256, 100, 256 if we have (input dim = 100 and hidden dim = 100)
        # out[:, -1, :] => 256, 256 --> because we just want the last time step hidden states
        #out = out[:, -1, :] # without an activation function

        # now our: out.size() --> 256, 10 (if output dimension is equal to 10)
        #return out

In [3]:
# 1041 embedding size
model = BiLSTM(20)
model.to(device)
#model = BiLSTM(17)


# Class weights (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
#weights = torch.tensor([0.69,2.95,0.08,0.02,0.77,0.017,18.59,0 ,0.007,76.83], dtype=torch.float32)
#weights = weights / weights.sum()
#weights = 1.0 / weights
#weights = weights / weights.sum()


#loss_function = nn.NLLLoss()
#loss_function = nn.CrossEntropyLoss(weight=weights.to(device))
loss_function = nn.CrossEntropyLoss()
#loss_function = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [4]:
# A common headache in this competition is the lack of determinism in the results due to cudnn, the following solves the issue
def seed_everything(seed=10):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [5]:
def process_vision_output():
    for file in os.scandir('./vision_output'):
        if (os.path.isfile('./nlp_output/{0}'.format(file.name))):
            vision_out = []
            with open(file, "rb") as openfile:
                while True:
                    try:
                        vision_out.append(pickle.load(openfile))
                    except EOFError:
                        break

            processed_out = []

            for idk in vision_out:
                for segment in idk:
                    vector = segment[1]
                    for word in segment[0].split(' '):
                        processed_out.append([word, vector])

            processed_out = processed_out
            with open("./processed_vision_output/{0}".format(file.name), 'wb') as f:
                pickle.dump(processed_out, f)

In [9]:
vectors = []
labels = []
words = []

for file in os.scandir('./input'):
    with open(file, 'rb') as f:
        vs = pickle.load(f)
    
    for w in vs:
        words.append(w[0])
        #temp_t = torch.from_numpy(w[1]).float()
        temp_t = w[2].numpy()
        vectors.append(np.concatenate((temp_t, w[2])))
        labels.append(np.argmax(w[1]))
vectors = np.array(vectors)
labels = np.array(labels)

In [10]:
vectors.shape

(24201, 20)

In [11]:
labels.shape

(24201,)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2, random_state=42,stratify=labels)

In [13]:
x_train.shape

(19360, 20)

In [14]:
y_train.shape

(19360,)

In [15]:
x_test.shape

(4841, 20)

In [16]:
labels = torch.from_numpy(np.array(y_train, dtype=np.int64)).to(device)
v = torch.from_numpy(np.array(x_train, dtype=np.float64)).view(-1, 1, 20).float().to(device)
print(v.shape)
print(labels.shape)

torch.Size([19360, 1, 20])
torch.Size([19360])


In [18]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __getitem__(self,index):
        data,target = self.dataset[index]
        return data,target,index
    def __len__(self):
        return len(self.dataset)

In [31]:
# Training the model
batch_loss = 0
batch_size = 1041
epochs = 300

train = MyDataset(torch.utils.data.TensorDataset(v, labels))
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle=True)

avg_loss = 0

for epoch in range(epochs):
    
    running_loss = 0.0
    model.train()
    # Clear for the gradients.
    model.zero_grad()
    
    for i, (x_batch, y_batch, index) in enumerate(train_loader):
        label_scores = model(x_batch)
        # Calculate loss, backpropagate, and update weights/parameters by calling opt.step()
        loss = loss_function(label_scores, y_batch)
        # TODO: check this line
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()/len(train_loader)
            
        print("Epoch: {0}/{3}. Loss: {2:.2f} Progress: {1}%".format(epoch, int((epoch * 100)/300), avg_loss / (epoch+1), epochs) , end="\r")
        
print("Bi-LSTM model training is done!                           ", end='\r')
print("final labels {0}".format(label_scores))

final labels tensor([[0.0000, 3.7200, 0.0000,  ..., 0.0000, 0.0000, 4.3625],
        [0.0000, 3.8946, 0.0000,  ..., 0.0000, 0.0000, 4.1656],
        [0.0000, 3.2895, 0.0000,  ..., 0.0000, 0.0000, 7.8864],
        ...,
        [0.0000, 2.6458, 0.0000,  ..., 0.0000, 0.0000, 7.3054],
        [0.0000, 2.9895, 0.0000,  ..., 0.0000, 0.0000, 5.9874],
        [0.0000, 2.7480, 0.0000,  ..., 0.0000, 0.0000, 7.9124]],
       device='cuda:0', grad_fn=<ReluBackward0>)


In [33]:
torch.save(model, "FinalModelV1")

In [39]:
labels

tensor([6, 9, 9,  ..., 9, 9, 9], device='cuda:0')

In [42]:
x_batch.shape

torch.Size([622, 1, 20])

In [38]:
label_scores.shape

torch.Size([622, 10])