# Parameters

In [1]:
source_folder = '/content/drive/My Drive/Colab Notebooks/Datasets/binary/'
res_path = '/content/drive/My Drive/Colab Notebooks/Datasets/res/'

In [2]:
!pip install sklearn
!pip install netcal

Collecting netcal
[?25l  Downloading https://files.pythonhosted.org/packages/a8/a0/03ea56958564127b5b0bdf7b4cbaa1d3291c2a6ddb37735517f4906d202b/netcal-1.1.2-py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 3.8MB/s 
Installing collected packages: netcal
Successfully installed netcal-1.1.2


# Libraries

In [3]:
# Libraries
import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator

# Models

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import precision_recall_fscore_support
from netcal.metrics import ECE

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

cuda:0
Mounted at /content/drive


# Preliminaries

In [5]:
# data 
training = '4_train_economic_news_binary.csv'
val = '4_val_economic_news_binary.csv'
testing = '4_test_economic_news_binary.csv'
logfile_name = "4-soft-lstm-economic_news-lr10-3-1&4cw_em500_dr02_maxEp500.csv"

num_classes = 2
class_weight = torch.Tensor([1, 4])
learning_rate = 1e-03
max_epochs = 500
batch_size = 32
embedding_size = 500
dropout_rate =0.2


# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('text', text_field), ('crowd_label', label_field), ('conf0', label_field), ('conf1', label_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path=source_folder, train=training, validation=val, test=testing,
                                           format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=1, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True)
test_iter = BucketIterator(test, batch_size=1, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True)

# Vocabulary
print(train)
text_field.build_vocab(train, min_freq=3)

<torchtext.data.dataset.TabularDataset object at 0x7f28f7c78d30>


# Models

In [6]:
#bidirectional LSTM model
class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(len(text_field.vocab), embedding_size)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=dropout_rate)

        self.fc = nn.Linear(2*dimension, num_classes)

    def forward(self, text, text_len):

        text_emb = self.embedding(text)

        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        #text_fea = torch.squeeze(text_fea, 1)
        #text_out = torch.sigmoid(text_fea)

        return text_fea

  #  def forward(self, text, text_len):

  #      text_emb = self.embedding(text)

  #      packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
  #      packed_output, _ = self.lstm(packed_input)
  #      output, _ = pad_packed_sequence(packed_output, batch_first=True)

  #      out_forward = output[range(len(output)), text_len - 1, :self.dimension]
  #      out_reverse = output[:, 0, self.dimension:]
  #      out_reduced = torch.cat((out_forward, out_reverse), 1)
  #      text_fea = self.drop(out_reduced)

  #      text_fea = self.fc(text_fea)
  #      text_fea = torch.squeeze(text_fea, 1)
  #      text_out = torch.sigmoid(text_fea)

  #      return text_out

# Training

In [7]:
def soft_labels(crowd_labelList, conf0, conf1):
    y = np.column_stack((conf0.tolist(), conf1.tolist()))
    return y

class CrossEntropyLossSoft(nn.Module):

    def __init__(self, weight=None):
        super(CrossEntropyLossSoft, self).__init__()
        self.weight = weight

    def forward(self, pred, soft_targets):
        logsoftmax = nn.LogSoftmax()
        if self.weight is not None:
            return torch.mean(torch.sum(- soft_targets * self.weight * logsoftmax(pred), 1))
        else:
            return torch.mean(torch.sum(- soft_targets * logsoftmax(pred), 1))

def ece_score(y_true, y_prob, n_bins=10):
    ece = ECE(n_bins)
    ece_val = ece.measure(y_prob, y_true)

    return ece_val


def compute_val():
    loss_function = nn.CrossEntropyLoss()
    with torch.no_grad():
        model.eval()
        y_pred = []
        output_prob_val = []
        output_logits_val = []
        y_val_hard = []
        
        for ((text, text_len), labels, conf0, conf1), _ in valid_iter: 
            y_val_hard.append(int(labels.item()))
            sent = text.to(device)
            sent_len = text_len.to(device)
            label = labels.to(device)
            output = model.forward(sent, text_len)
            logit, predicted = torch.max(output.data, 1)
            output_logits_val.append(output[0].cpu().tolist())
            output_prob_val.append(torch.sigmoid(output[0]).cpu().tolist())
            y_pred.append(predicted.item())
        loss_val = loss_function(torch.Tensor(output_logits_val), torch.LongTensor(y_val_hard)).item()
        model.train()
        ece_val = ece_score(np.array(y_val_hard), np.array(output_prob_val))

        # check if binary or multi class classification
        num_classes = len(set(y_val_hard))
        if num_classes == 2:
            average = 'binary'
        else:
            average = 'macro'
        pre_val, rec_val, f1_val, _ = precision_recall_fscore_support(y_val_hard, y_pred, average=average, beta=1)
        _, _, f01_val, _ = precision_recall_fscore_support(y_val_hard, y_pred, average=average, beta=0.1)
        _, _, f10_val, _ = precision_recall_fscore_support(y_val_hard, y_pred, average=average, beta=10)
        print('Iteration: {}. Train Loss: {:1.5f}. Val Loss: {:1.5f}, F1: {:1.3f}, ECE: {:1.3f}, Precision: {:1.3f}, Recall: {:1.3f}'.
            format(i, loss.item(), loss_val, f1_val, ece_val, pre_val, rec_val))
        # print to result file
        with open(res_path, 'a') as f:
            res_i = '{}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n'.format(epoch, i, loss.item(), loss_val, pre_val, rec_val,
                                                                      f01_val, f1_val, f10_val, ece_val)
            f.write(res_i)

In [8]:
# create log file
res_path += logfile_name
with open(res_path, 'w') as f:
    c = 'epoch, iter, loss_train, loss_val, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val'
    f.write(c + '\n')

model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = CrossEntropyLossSoft(weight=class_weight.to(device))
train_loader = train_iter
valid_loader = valid_iter

# training loop
model.train()
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    i = 0
    for ((text, text_len), labels, conf0, conf1), _ in train_loader:   
        optimizer.zero_grad()
        modified_labels = soft_labels(labels, conf0, conf1)
        labels = torch.FloatTensor(modified_labels)
        labels = labels.to(device)

        text = text.to(device)
        text_len = text_len.to(device)
        output = model.forward(text, text_len)

        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
          compute_val()

        i = i + 1

EPOCH -- 0


  


Iteration: 0. Train Loss: 0.96054. Val Loss: 0.70204, F1: 0.484, ECE: 0.033, Precision: 0.495, Recall: 0.473
Iteration: 100. Train Loss: 1.07394. Val Loss: 0.64151, F1: 0.672, ECE: 0.056, Precision: 0.569, Recall: 0.819
EPOCH -- 1
Iteration: 0. Train Loss: 0.70336. Val Loss: 0.61778, F1: 0.680, ECE: 0.108, Precision: 0.641, Recall: 0.726
Iteration: 100. Train Loss: 0.95525. Val Loss: 0.62379, F1: 0.660, ECE: 0.060, Precision: 0.635, Recall: 0.686
EPOCH -- 2
Iteration: 0. Train Loss: 0.57218. Val Loss: 0.61734, F1: 0.671, ECE: 0.079, Precision: 0.709, Recall: 0.637
Iteration: 100. Train Loss: 0.72264. Val Loss: 0.65650, F1: 0.637, ECE: 0.020, Precision: 0.667, Recall: 0.611
EPOCH -- 3
Iteration: 0. Train Loss: 0.41499. Val Loss: 0.69267, F1: 0.598, ECE: 0.037, Precision: 0.676, Recall: 0.535
Iteration: 100. Train Loss: 0.63253. Val Loss: 0.73192, F1: 0.624, ECE: 0.049, Precision: 0.708, Recall: 0.558
EPOCH -- 4
Iteration: 0. Train Loss: 0.40392. Val Loss: 0.70966, F1: 0.603, ECE: 0.065,