In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import numpy as np
import pandas as pd
import pickle

import os

torch.manual_seed(1)

<torch._C.Generator at 0x7f0d6413fc70>

In [3]:
class BiLSTM(nn.Module): 
    # This NLP part Will consist of two bidirectional lstm layers and it's output is 
    # determined by the LSTM's last hidden states or output vectors.

    # This will take as an input a sequence of words and output the last hidden layer
    # the last hidden states of 2-layer bidirectional LSTM will be the input of the last multimodel network 

    def __init__(self, embedding_dim, hidden_dim = 256, layer_dim =2, output_dim = 7):
        super(BiLSTM, self).__init__()
        
        self.embedding_dim = embedding_dim
        
        #Hidden dimensions
        self.hidden_dim = hidden_dim # maybe set this to 256

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building the LSTM 
        # batch_first = True causes the input/output to be of shape 3D (batch_dim, seq_dim, feature_dim) 
        # output will be the same dim as the hidden dim
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # Initialize hidden state with zeros
        # self.layer_dim * 2. because we have one going forwards and another going backwards
        h0 = torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim)
        
        
        # Initialize cell state
        c0 =  torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim)

        # We suppose we are conducting a 28 time steps In case of using 
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm1(x, (h0.detach(), c0.detach()))
                
        # out = self.fc(out.view(out.size(0), -1))
          
        # Without the activation function, out will contain the last hidden layer.
        # This could be obtianed from hn[-1] as well.
        out = out[:, -1, :]
        
        out = self.fc(out)
        
        out = self.sigmoid(out)
        
        return out
        
        # Index hidden state of last time step
        # out.size() --> 256, 100, 256 if we have (input dim = 100 and hidden dim = 100)
        # out[:, -1, :] => 256, 256 --> because we just want the last time step hidden states
        #out = out[:, -1, :] # without an activation function

        # now our: out.size() --> 256, 10 (if output dimension is equal to 10)
        #return out

In [4]:
# 1041 embedding size
model = BiLSTM(1041)
#model = BiLSTM(17)


#loss_function = nn.NLLLoss()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

# Here i have to get the ELMO vector in a form, in a format of numpy array.
with open('document_vector.pickle', 'rb') as file:
    vectors = pickle.load(file)
    file.close()

vectors = np.delete(vectors, 1, 1)
vectors = np.delete(vectors, 0, 1)

vectors = torch.from_numpy(np.array(vectors, dtype=np.float64)).view(521, 1, -1).float()
vectors.size()

In [5]:
i = 0
for file in os.scandir('../../feature_extraction/annotated_features_Jenny/'):
    if file.name.endswith('csv'):
        if (file.name == 'ssoar_datasetssoar-journpsycho-1993-3-leithauser-geh_nicht_nach_haus_laudatio.csv'):
            continue
        dataframe = pd.read_csv(file)
        vector = dataframe.to_numpy()
        if (vector.shape[1] != 1046):
            continue
        target = vector[:, -1]        
        vector = vector[:, 4:-1]
        if(i == 0):
            vectors = vector
            targets = target
            i+=1
        else:
            targets = np.concatenate((targets, target))
            vectors = np.concatenate((vectors, vector))
        

In [6]:
for file in os.scandir('../../feature_extraction/annotated_features_225-301/'):
    if file.name.endswith('csv'):

        dataframe = pd.read_csv(file)
        vector = dataframe.to_numpy()
        if (vector.shape[1] != 1046):
            continue
        target = vector[:, -1]        
        vector = vector[:, 4:-1]
        targets = np.concatenate((targets, target))
        vectors = np.concatenate((vectors, vector))
        

In [7]:
for file in os.scandir('../../feature_extraction/annotated_features_151-225/'):
    if file.name.endswith('csv'):

        dataframe = pd.read_csv(file)
        vector = dataframe.to_numpy()
        if (vector.shape[1] != 1046):
            continue
        target = vector[:, -1]        
        vector = vector[:, 4:-1]
        targets = np.concatenate((targets, target))
        vectors = np.concatenate((vectors, vector))
        

In [8]:
targets

array([2, 2, 2, ..., 5, 5, 10], dtype=object)

In [9]:
set(targets)
t= targets

In [10]:
# 1, 2, 4=>3, 5=>4, 6=>5, 7=>6, 10=>7

for i in range(len(t)):
    if t[i] == 4:
        t[i] = 3
    elif t[i] == 5:
        t[i] = 4
    elif t[i] == 6:
        t[i] = 5
    elif t[i] == 7:
        t[i] = 6
    elif t[i] == 10:
        t[i] = 7
        
t

array([2, 2, 2, ..., 4, 4, 7], dtype=object)

In [11]:
v = vectors
t = t-1

In [12]:
v

array([[0.1, 1, 0, ..., -0.4808298349380493, 0.3247198760509491,
        -0.1913703680038452],
       [0.0, 0, 0, ..., -0.6039217114448547, 0.5757494568824768,
        -0.0578603632748127],
       [0.16666666666666666, 1, 0, ..., 0.3477407395839691,
        0.4615496397018433, -0.2093597650527954],
       ...,
       [0.6666666666666666, 1, 0, ..., -0.22227977216243744,
        -0.08376545459032059, -0.10069357603788376],
       [0.0, 0, 0, ..., 0.1951856166124344, -0.07286903262138368,
        0.2501157224178314],
       [0.0, 0, 0, ..., -0.2655101716518402, -0.02821296267211437,
        0.11155444383621216]], dtype=object)

In [13]:
set(t)

{0, 1, 2, 3, 4, 5, 6}

In [14]:
# PLEASE if you want to run this block you have to run the previous one first
labels = torch.from_numpy(np.array(t, dtype=np.int64))

print(v.shape)

v = torch.from_numpy(np.array(v, dtype=np.float64)).view(v.shape[0], 1, -1).float()
v.size()

(45438, 1041)


torch.Size([45438, 1, 1041])

In [44]:
# Training the model
batch_loss = 0
for epoch in range(300):
    
    running_loss = 0.0
    
    # Clear for the gradients.
    model.zero_grad()
        
    # here prepare the inputs and targets
    # target = array of labels [0, 1, ] where the label i stands for the class of the word i
    training_input = v
    target = labels
        
    # run a forward pass
    label_scores = model(training_input)
    # Calculate loss, backpropagate, and update weights/parameters by calling opt.step()
    loss = loss_function(label_scores, target)
    loss.backward()
    optimizer.step()
    
    
    running_loss += loss.item()
        
    print("Epoch: {0}/300. Loss: {2:.2f} Progress: {1}%".format(epoch, int((epoch * 100)/300), running_loss / (epoch+1)) , end="\r")
        

print("Bi-LSTM model training is done!                           ", end='\r')
print("final labels {0}".format(label_scores))

Epoch: 16/300. Loss: 0.11 Progress: 5%

KeyboardInterrupt: 

In [49]:
label_scores.size()

torch.Size([45438, 7])

In [50]:
y = []

for p in label_scores:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i)

sigmoid = nn.Sigmoid()
testing = sigmoid(label_scores)

In [15]:
def accuracy(labels, predicted):
    total_accuracy = 0
    n = 0
    for y, p in zip(labels, predicted):
        if (y == p):
            total_accuracy +=1
        n+=1
    return total_accuracy/n

In [52]:
accuracy(labels, y)

0.8399357366081254

Model v1 = Feature vector all

Model v2 = Feature vector only layout

Model v3 = Feature vector all + sigmoid on training

Model v4 = Feature vector layout + sigmoid on training

In [53]:
torch.save(model, "modelv4")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [25]:
model = torch.load("modelv1")
model.eval()

BiLSTM(
  (lstm1): LSTM(1041, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=7, bias=True)
  (sigmoid): Sigmoid()
)

In [26]:
words_labels = model(v) 

In [19]:
words_labels

tensor([[0.3093, 0.7172, 0.1550,  ..., 0.1614, 0.7669, 0.9488],
        [0.3154, 0.7545, 0.1534,  ..., 0.1635, 0.7730, 0.9277],
        [0.3152, 0.7551, 0.1532,  ..., 0.1636, 0.7734, 0.9275],
        ...,
        [0.3124, 0.7509, 0.1514,  ..., 0.1623, 0.7783, 0.9298],
        [0.3122, 0.7508, 0.1512,  ..., 0.1624, 0.7785, 0.9299],
        [0.3147, 0.7247, 0.1603,  ..., 0.1658, 0.7601, 0.9383]],
       grad_fn=<SigmoidBackward>)

In [20]:
words_labels.size()

torch.Size([45438, 7])

In [58]:
y = []

for p in words_labels:
    i = 0
    m = 0
    for index, e in enumerate(p):
        if float(e) > m:
            i = index
            m = e
    y.append(i)

In [60]:
accuracy(labels, y)

0.8399357366081254

In [65]:
def precision_of(labels, y, of):
    n = 0
    precision = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (p == of):
            n+=1
            
    precision = 0 if n==0 else c / n
    return precision

In [71]:
p0 = precision_of(labels, y, 0)
p1 = precision_of(labels, y, 1)
p2 = precision_of(labels, y, 2)
p3 = precision_of(labels, y, 3)
p4 = precision_of(labels, y, 4)
p5 = precision_of(labels, y, 5)
p6 = precision_of(labels, y, 6)

precision = (p0+p1+p2+p3+p4+p5+p6) / 7

In [73]:
precision

0.11999081951544648

In [78]:
def recall_of(labels, y, of):
    n = 0
    recall = 0
    c = 0
    for label, p in zip(labels, y):
        if (label == p) and (p == of):
            c += 1
        if (label == of):
            n+=1
            
    recall = 0 if n==0 else c / n
    return recall

In [80]:
p0 = recall_of(labels, y, 0)
p1 = recall_of(labels, y, 1)
p2 = recall_of(labels, y, 2)
p3 = recall_of(labels, y, 3)
p4 = recall_of(labels, y, 4)
p5 = recall_of(labels, y, 5)
p6 = recall_of(labels, y, 6)

recall = (p0+p1+p2+p3+p4+p5+p6) / 7

In [81]:
recall

0.14285714285714285