## Configuration

In [0]:
# --- Data split mode: the tensor-building cell checks these two flags in this order ---
USE_TRAIN_AND_DEV = False # train with train set, evaluate with dev set
USE_TRAIN_AND_DEV_AND_TEST = True # train with train and dev sets, evaluate with test set

# --- Layer selection: the model's __init__ tests these flags in the order listed
# --- and builds the layer for the first one that is True.
USE_LINEAR_LAYER = True
USE_GRU_LAYER = False
USE_BIGRU_LAYER = False
USE_LSTM_LAYER = False
USE_BILSTM_LAYER = False
USE_BIGRU_LAYER_AND_DISTILBERT = False

# --- Output width of the selected layer (hidden_size for the recurrent variants) ---
LINEAR_SIZE = 128
GRU_SIZE = 128
BIGRU_SIZE = 128
LSTM_SIZE = 128
BILSTM_SIZE = 128

MAX_LEN = 21  # every sequence is padded / truncated to this many positions
bs = 256  # batch size handed to the DataLoaders
LEARNING_RATE = 3e-4  # learning rate handed to the Adam optimizer
NUM_EPOCHS = 30  # number of iterations of the outer training loop
NUM_TOKENS = 2373  # num_embeddings of the token embedding table; must cover every token index

## Importing libraries & defining auxiliary functions

In [0]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/'My Drive'/Thesis
!ls

import pandas as pd
import numpy as np
import math
from tqdm import tqdm, trange
import pickle

! pip install pytorch-transformers
from pytorch_transformers.modeling_distilbert import *


In [0]:
import numpy as np
import pandas as pd
import re
import glob
from copy import deepcopy
from pandas.io.json import json_normalize
from evalfixed import *
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.nn import BCELoss

sns.set_style('whitegrid')
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300

%matplotlib inline
%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

In [0]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import DistilBertTokenizer, DistilBertConfig
from pytorch_transformers import AdamW

# Prefer the GPU when one is present; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(n_gpu)
# get_device_name() raises on a machine without CUDA, so only query it when a GPU exists.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

In [0]:
! pip install seqeval
from seqeval.metrics import f1_score

In [0]:
def flat_accuracy(preds, labels):
    """Return the share of predictions that equal their label after a 0.5 cutoff.

    Args:
        preds: flat sequence of scores; a score >= 0.5 counts as class 1, anything else as class 0.
        labels: flat sequence of reference labels, same length as ``preds``.

    Returns:
        Number of positions where the thresholded prediction equals the label,
        divided by the number of predictions.
    """
    hard_preds = (np.asarray(preds) >= 0.5).astype(int)
    hits = hard_preds == np.asarray(labels)
    return np.sum(hits) / len(hard_preds)

In [0]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level only)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def compute_f1(actual, predicted):
    """
    F1 score of ``predicted`` against ``actual``, both binarised with a 0.5 cutoff.

    Values >= 0.5 count as positive and values < 0.5 as negative. Returns 0.0
    whenever there are no true positives (precision, recall or their sum would
    otherwise divide by zero).
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for idx in range(len(actual)):
        truth, guess = actual[idx], predicted[idx]
        truth_pos, truth_neg = truth >= 0.5, truth < 0.5
        guess_pos, guess_neg = guess >= 0.5, guess < 0.5

        if guess_pos:
            if truth_pos:
                true_positives += 1
            elif truth_neg:
                false_positives += 1
        elif guess_neg and truth_pos:
            false_negatives += 1
        # everything else is a true negative, which F1 does not use

    # With zero true positives at least one denominator below would be zero.
    if true_positives == 0:
        return 0.0

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    return 2 * precision * recall / (precision + recall)

## Defining models

In [0]:
class MyCustomDistilBertForTokenClassification(DistilBertPreTrainedModel):
    """Per-position binary classifier over learned token and metadata embeddings.

    Every position is described by the concatenation of a token embedding, a
    user embedding, format / country / client / session embeddings and two
    scalar features (``ex_days`` and ``time``). That feature vector is passed
    through ONE encoder chosen by the module-level ``USE_*`` flags (first flag
    that is True, in the order tested in ``__init__``), then through
    dropout -> Linear(., 32) -> ReLU -> dropout -> Linear(32, 1) -> sigmoid.

    The encoder choice is recorded in ``self.encoder_kind`` at construction
    time so that ``forward`` always uses the layer that was actually built.
    """

    def __init__(self, config):
        super(MyCustomDistilBertForTokenClassification, self).__init__(config)

        config.output_hidden_states = True  # output all hidden states from all layers of DistilBert
        self.output_hidden_states = True

        self.num_labels = config.num_labels
        assert config.num_labels == 2
        self.distilbert = DistilBertModel(config)

        self.dropout = nn.Dropout(0.5)
        self.dropout3 = nn.Dropout(0.3)
        # Kept at its original position: layers created below this call are not
        # visited by it (presumably they keep PyTorch's default init - TODO confirm).
        self.init_weights()
        self.relu = nn.ReLU()

        self.embedding_userID = nn.Embedding(num_embeddings=2593, embedding_dim=128)
        self.embedding_format = nn.Embedding(num_embeddings=3, embedding_dim=64)
        self.embedding_token = nn.Embedding(num_embeddings=NUM_TOKENS, embedding_dim=256)

        self.embedding_country = nn.Embedding(num_embeddings=37, embedding_dim=64)
        self.embedding_client = nn.Embedding(num_embeddings=3, embedding_dim=64)
        self.embedding_session = nn.Embedding(num_embeddings=3, embedding_dim=64)

        # token (256) + user (128) + two scalars (days, time) + four 64-d categorical embeddings
        feature_dim = 256 + 128 + 1 * 2 + 64 * 4

        if USE_LINEAR_LAYER:
            self.encoder_kind = "linear"
            self.linear = nn.Linear(feature_dim, LINEAR_SIZE)
            self.linear_layer_2 = nn.Linear(LINEAR_SIZE, 32)

        elif USE_GRU_LAYER:
            self.encoder_kind = "gru"
            self.gru_layer = nn.GRU(feature_dim, hidden_size=GRU_SIZE, num_layers=1, batch_first=True, bidirectional=False)
            self.linear_layer_2 = nn.Linear(GRU_SIZE, 32)

        elif USE_BIGRU_LAYER:
            self.encoder_kind = "bigru"
            self.bigru_layer = nn.GRU(feature_dim, hidden_size=BIGRU_SIZE, num_layers=1, batch_first=True, bidirectional=True)
            self.linear_layer_2 = nn.Linear(BIGRU_SIZE * 2, 32)

        elif USE_LSTM_LAYER:
            self.encoder_kind = "lstm"
            self.lstm_layer = nn.LSTM(feature_dim, hidden_size=LSTM_SIZE, num_layers=1, batch_first=True, bidirectional=False)
            self.linear_layer_2 = nn.Linear(LSTM_SIZE, 32)

        elif USE_BILSTM_LAYER:
            self.encoder_kind = "bilstm"
            self.bilstm_layer = nn.LSTM(feature_dim, hidden_size=BILSTM_SIZE, num_layers=1, batch_first=True, bidirectional=True)
            self.linear_layer_2 = nn.Linear(BILSTM_SIZE * 2, 32)

        elif USE_BIGRU_LAYER_AND_DISTILBERT:
            self.encoder_kind = "distilbert_bigru"
            # Input = bidirectional bert_gru output (256*2) concatenated with the feature vector.
            self.gru_layer = nn.GRU(256 * 2 + feature_dim, hidden_size=256, num_layers=1, batch_first=True, bidirectional=True)
            self.bert_gru = nn.GRU(config.hidden_size, hidden_size=256, num_layers=1, batch_first=True, bidirectional=True)
            self.linear_layer_2 = nn.Linear(512, 32)

        else:
            # Without this, construction succeeded and forward() failed later on a missing attribute.
            raise ValueError("No encoder selected: set exactly one of the USE_*_LAYER flags to True.")

        self.classifier = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, unique_exids, token, user_ids, ex_format, ex_days, country, client, session, time, token_type_ids=None, attention_mask=None, model_mask=None, head_mask=None, labels=None):
        """Score every position; return the loss when ``labels`` is given, else the scores.

        Args:
            input_ids: ids fed to DistilBERT (only used by the DistilBERT variant).
            unique_exids: accepted for interface compatibility; not used here.
            token, user_ids, ex_format, country, client, session: integer index
                tensors looked up in the corresponding embedding tables.
            ex_days, time: float tensors; a trailing feature axis is added here.
            token_type_ids: accepted for interface compatibility; not used here.
            attention_mask: forwarded to DistilBERT (DistilBERT variant only).
            model_mask: when given, only positions where it equals 1 enter the loss.
            head_mask: forwarded to DistilBERT (DistilBERT variant only).
            labels: optional targets; cast to float for the BCE loss.

        Returns:
            A scalar BCE loss if ``labels`` is not None, otherwise the sigmoid
            outputs with a trailing axis of size 1.
        """
        token_embedding = self.embedding_token(token)

        user_ids = self.embedding_userID(user_ids)
        ex_format = self.embedding_format(ex_format)
        ex_days = ex_days[:, :, None]  # add a feature axis so the scalar can be concatenated
        time = time[:, :, None]
        country = self.embedding_country(country)
        client = self.embedding_client(client)
        session = self.embedding_session(session)

        features = torch.cat((token_embedding, user_ids, ex_format, ex_days, country, client, session, time), 2)

        kind = self.encoder_kind
        if kind == "linear":
            concat_output = self.linear(features)

        elif kind == "gru":
            concat_output, _ = self.gru_layer(features)

        elif kind == "bigru":
            concat_output, _ = self.bigru_layer(features)

        elif kind == "lstm":
            # Was self.lstm, an attribute __init__ never created.
            concat_output, _ = self.lstm_layer(features)

        elif kind == "bilstm":
            # Was self.bilstm, an attribute __init__ never created.
            concat_output, _ = self.bilstm_layer(features)

        else:  # "distilbert_bigru"
            # Pass the mask by keyword: the second positional slot of DistilBertModel.forward
            # is attention_mask, so the old positional token_type_ids silently replaced it.
            bert_output = self.distilbert(input_ids, attention_mask=attention_mask, head_mask=head_mask)
            bert_output = self.dropout3(bert_output[0])  # use to extract the last layer

            #concat_bert = torch.cat((bert_output[-1][-1],bert_output[-1][-2]),2) # use to extract (Distil)BERT layers
            bert_output, _ = self.bert_gru(bert_output)

            concat_output = torch.cat((bert_output, features), 2)
            # gru_layer was built for exactly this width but never applied, leaving
            # linear_layer_2 (512 inputs) with a 1154-wide tensor.
            concat_output, _ = self.gru_layer(concat_output)

        concat_output = self.dropout(concat_output)
        concat_output = self.linear_layer_2(concat_output)
        concat_output = self.relu(concat_output)
        concat_output = self.dropout(concat_output)

        logits = self.classifier(concat_output)
        logits = self.sigmoid(logits)  # NOTE: despite the name these are probabilities in [0, 1]

        if labels is None:
            return logits

        loss_fct = BCELoss()
        if model_mask is not None:
            # Only keep active parts of the loss
            active_loss = model_mask.view(-1) == 1
            active_logits = logits.view(-1)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            return loss_fct(active_logits, active_labels.float())

        # BCELoss needs input and target of the same shape and a float target.
        return loss_fct(logits.view(-1), labels.view(-1).float())

## Importing data

In [0]:
def _prepare_split(frame):
    """Add ``exercise_id`` and standardise ``days`` / ``time`` of one split.

    * ``exercise_id`` is ``instance_id`` without its last two characters.
    * ``days`` is z-scored with this split's own mean and standard deviation.
    * ``time`` is capped at 100 and then z-scored the same way.

    The frame is modified and returned.
    """
    frame['exercise_id'] = frame['instance_id'].str[:-2]
    frame['days'] = (frame['days'] - np.mean(frame['days'])) / np.std(frame['days'])
    # Series.clip replaces the old write through `.values`, which is only
    # guaranteed to reach the frame when `.values` happens to be a writable view.
    frame['time'] = frame['time'].clip(upper=100)
    frame['time'] = (frame['time'] - np.mean(frame['time'])) / np.std(frame['time'])
    return frame


# NOTE(review): read_pickle executes arbitrary code from the file - only load trusted pickles.
# NOTE(review): each split is normalised with its OWN statistics (as before); consider
# reusing the training-set mean/std for dev and test.
trn = _prepare_split(pd.read_pickle('trn.pkl'))
dev = _prepare_split(pd.read_pickle('dev.pkl'))
test = _prepare_split(pd.read_pickle('test.pkl'))

In [0]:
class SentenceGetter(object):
    """Group a token-level DataFrame into one list of feature tuples per exercise.

    After construction ``self.sentences`` is a list with one entry per
    ``exercise_id`` group; each entry is a list of 9-tuples laid out as
    ``FIELDS`` (token, label, user, format, days, countries, client, session, time).
    """

    # Column order of every per-token tuple; other cells index the tuples by position.
    FIELDS = ("token", "label", "user", "format", "days", "countries", "client", "session", "time")

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One tuple per row of the group, columns taken in FIELDS order.
            return list(zip(*(s[column].values.tolist() for column in self.FIELDS)))

        self.grouped = self.data.groupby("exercise_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the group keyed ``"Sentence: <n_sent>"`` and advance, or None if absent.

        NOTE(review): groups are keyed by ``exercise_id``, so this lookup only
        succeeds if exercise ids literally look like ``"Sentence: 1"``.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; other errors should surface.
            return None
        self.n_sent += 1
        return s

In [0]:
# One SentenceGetter per split, in the order train / dev / test.
getter_trn, getter_dev, getter_test = [SentenceGetter(split) for split in (trn, dev, test)]

In [0]:
def _tokens_with_markers(sentences):
    """Token strings of every sentence, wrapped in "[CLS]" ... "[SEP]"."""
    return [["[CLS]"] + [s[0] for s in sent] + ["[SEP]"] for sent in sentences]


def _text_with_markers(sentences):
    """Space-joined token string of every sentence, wrapped in "[CLS] " ... " [SEP]"."""
    return ["[CLS] " + " ".join([s[0] for s in sent]) + " [SEP]" for sent in sentences]


def _labels_with_markers(sentences):
    """Labels of every sentence with a 0.0 placeholder for the two marker positions."""
    return [[0.0] + [s[1] for s in sent] + [0.0] for sent in sentences]


def _field_with_markers(sentences, pos):
    """Tuple field ``pos`` of every token; the marker positions repeat the first token's value."""
    return [[sent[0][pos]] + [s[pos] for s in sent] + [sent[0][pos]] for sent in sentences]


# Tuple layout produced by SentenceGetter:
# 0 token, 1 label, 2 user, 3 format, 4 days, 5 countries, 6 client, 7 session, 8 time
tokens_trn = _tokens_with_markers(getter_trn.sentences)
sentences_trn = _text_with_markers(getter_trn.sentences)
labels_trn = _labels_with_markers(getter_trn.sentences)
userIDs_trn = _field_with_markers(getter_trn.sentences, 2)
format_trn = _field_with_markers(getter_trn.sentences, 3)
days_trn = _field_with_markers(getter_trn.sentences, 4)
country_trn = _field_with_markers(getter_trn.sentences, 5)
client_trn = _field_with_markers(getter_trn.sentences, 6)
session_trn = _field_with_markers(getter_trn.sentences, 7)
time_trn = _field_with_markers(getter_trn.sentences, 8)


# Dev and test: same construction, each followed by a spot check of dev sentence 155.
tokens_dev = _tokens_with_markers(getter_dev.sentences)
tokens_test = _tokens_with_markers(getter_test.sentences)
print(tokens_dev[155])

sentences_dev = _text_with_markers(getter_dev.sentences)
sentences_test = _text_with_markers(getter_test.sentences)
print(sentences_dev[155])

labels_dev = _labels_with_markers(getter_dev.sentences)
labels_test = _labels_with_markers(getter_test.sentences)
print(labels_dev[155])

userIDs_dev = _field_with_markers(getter_dev.sentences, 2)
userIDs_test = _field_with_markers(getter_test.sentences, 2)
print(userIDs_dev[155])

format_dev = _field_with_markers(getter_dev.sentences, 3)
format_test = _field_with_markers(getter_test.sentences, 3)
print(format_dev[155])

days_dev = _field_with_markers(getter_dev.sentences, 4)
days_test = _field_with_markers(getter_test.sentences, 4)
print(days_dev[155])

country_dev = _field_with_markers(getter_dev.sentences, 5)
country_test = _field_with_markers(getter_test.sentences, 5)
print(country_dev[155])

client_dev = _field_with_markers(getter_dev.sentences, 6)
client_test = _field_with_markers(getter_test.sentences, 6)
print(client_dev[155])

session_dev = _field_with_markers(getter_dev.sentences, 7)
session_test = _field_with_markers(getter_test.sentences, 7)
print(session_dev[155])

time_dev = _field_with_markers(getter_dev.sentences, 8)
time_test = _field_with_markers(getter_test.sentences, 8)
print(time_dev[155])


In [0]:
all_sentences = sentences_trn + sentences_dev + sentences_test

#assigning an index to each unique sentence
# factorize returns (codes, uniques) in one pass; it used to be run twice over the whole corpus.
sentence_corresponding_to_exercise, _unique_sentences = pd.factorize(all_sentences)
sentence_dictionary = list(_unique_sentences)

#healthcheck
print(all_sentences[34573])
print(sentence_dictionary[sentence_corresponding_to_exercise[34573]])


def _exercise_id_rows(sentences, offset):
    """One row per sentence: its unique-sentence index repeated len(sentence) + 1 times.

    ``offset`` is the position of this split's first sentence inside ``all_sentences``.
    """
    return [[sentence_corresponding_to_exercise[offset + i]] * (len(sent) + 1)
            for i, sent in enumerate(sentences)]


unique_ex_id_trn = _exercise_id_rows(getter_trn.sentences, 0)
unique_ex_id_dev = _exercise_id_rows(getter_dev.sentences, len(unique_ex_id_trn))
unique_ex_id_test = _exercise_id_rows(getter_test.sentences, len(unique_ex_id_trn) + len(unique_ex_id_dev))

In [0]:
#healthcheck

x, y, z = 3457, 4258, 435

# (sentence strings, exercise-id rows, sample position) for train, dev and test
_samples = [
    (sentences_trn, unique_ex_id_trn, x),
    (sentences_dev, unique_ex_id_dev, y),
    (sentences_test, unique_ex_id_test, z),
]

# First the raw id rows ...
for _, _id_rows, _pos in _samples:
    print(_id_rows[_pos])

# ... then each sentence next to the dictionary entry its id points to (the two should match).
for _texts, _id_rows, _pos in _samples:
    print(_texts[_pos])
    print(sentence_dictionary[_id_rows[_pos][0]])


In [0]:
## Get unique userIDs in order to calculate the embedding layer size

# The user id is field 2 of every token tuple; the first token of each training sentence is enough.
unique_ids = set([sent[0][2] for sent in getter_trn.sentences])


def _index_map(values):
    """Map every distinct value to an integer index, ordered by string form.

    Sorting makes the mapping reproducible: iterating a bare ``set`` of strings
    can yield a different order in every Python process.
    """
    return {v: i for i, v in enumerate(sorted(set(values), key=str))}


## Creating dictionaries for the variables

#tokens
# The old flag USE_TRAINING_DATA was never defined (NameError on a fresh kernel).
# Both supported modes train on `trn`, so derive the switch from the real config flags.
use_training_data = USE_TRAIN_AND_DEV or USE_TRAIN_AND_DEV_AND_TEST
if use_training_data:
    token_pool = list(trn["token"].values) + list(dev["token"].values) + list(test["token"].values)
else:
    token_pool = list(test["token"].values) + list(dev["token"].values)
# "[CLS]" and "[SEP]" must stay at indices 0 and 1: the model masks rely on it.
tokens_vals = ["[CLS]"] + ["[SEP]"] + sorted(set(token_pool), key=str)
token2idx = {t: i for i, t in enumerate(tokens_vals)}
assert len(tokens_vals) <= NUM_TOKENS, "vocabulary (%d) exceeds NUM_TOKENS (%d)" % (len(tokens_vals), NUM_TOKENS)

#labels
# Sorted so that the smaller label always gets index 0 (indices are used directly as targets).
tags_vals = sorted(set(dev["label"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

#userID
userid2idx = _index_map(unique_ids)

# NOTE(review): the categorical dictionaries below are built from dev only (as before);
# a value that occurs only in trn/test maps to None in later lookups.

#format
format2idx = _index_map(dev["format"].values)
format_vals = list(format2idx)

#country
country2idx = _index_map(dev["countries"].values)
country_vals = list(country2idx)

#client
client2idx = _index_map(dev["client"].values)
client_vals = list(client2idx)

#session
session2idx = _index_map(dev["session"].values)
session_vals = list(session2idx)

In [0]:
# Report the vocabulary size; the bare expression on the last line is rendered as the cell output.
print("Vocabulary size:")
len(tokens_vals)

In [0]:
# Load the pretrained 'distilbert-base-uncased' tokenizer with lower-casing enabled.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [0]:
def _pad_ids(id_rows):
    """Right-pad or right-truncate every row of integer ids to MAX_LEN columns."""
    return pad_sequences(id_rows, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


def _tokenize_all(sentences):
    """Run the DistilBERT tokenizer over every sentence string."""
    return [tokenizer.tokenize(sent) for sent in sentences]


def _tokenizer_ids(tokenized):
    """Tokenizer vocabulary ids of already-tokenized sentences, padded to MAX_LEN."""
    return _pad_ids([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized])


def _vocab_ids(token_rows):
    """token2idx indices of every token row, padded to MAX_LEN."""
    return _pad_ids([[token2idx.get(t) for t in row] for row in token_rows])


def _repeat_first(rows, lookup=None):
    """Repeat the first element of each row MAX_LEN times, optionally mapped through ``lookup``.

    The features handled here are read once per exercise (only the first
    element of each row is used), so that single value fills the whole row.
    """
    if lookup is None:
        return [[row[0]] * MAX_LEN for row in rows]
    return [[lookup.get(row[0])] * MAX_LEN for row in rows]


# DistilBERT tokenizer ids
tokenized_texts_trn = _tokenize_all(sentences_trn)
input_ids_trn = _tokenizer_ids(tokenized_texts_trn)

tokenized_texts_dev = _tokenize_all(sentences_dev)
input_ids_dev = _tokenizer_ids(tokenized_texts_dev)

tokenized_texts_test = _tokenize_all(sentences_test)
input_ids_test = _tokenizer_ids(tokenized_texts_test)

print(tokenized_texts_dev[0])
print(input_ids_dev[0])

# Indices into the notebook's own token vocabulary
input_tokens_trn = _vocab_ids(tokens_trn)
input_tokens_dev = _vocab_ids(tokens_dev)
input_tokens_test = _vocab_ids(tokens_test)

print(input_tokens_dev[0])

# User ids
input_userids_trn = _repeat_first(userIDs_trn, userid2idx)
input_userids_dev = _repeat_first(userIDs_dev, userid2idx)
input_userids_test = _repeat_first(userIDs_test, userid2idx)

print(input_userids_dev[0])

# Exercise format
input_format_trn = _repeat_first(format_trn, format2idx)
input_format_dev = _repeat_first(format_dev, format2idx)
input_format_test = _repeat_first(format_test, format2idx)

print(input_format_dev[0])

# Days (already numeric, no lookup)
input_days_trn = _repeat_first(days_trn)
input_days_dev = _repeat_first(days_dev)
input_days_test = _repeat_first(days_test)

print(input_days_dev[0])

# Country
input_country_trn = _repeat_first(country_trn, country2idx)
input_country_dev = _repeat_first(country_dev, country2idx)
input_country_test = _repeat_first(country_test, country2idx)

print(input_country_dev[0])

# Client
input_client_trn = _repeat_first(client_trn, client2idx)
input_client_dev = _repeat_first(client_dev, client2idx)
input_client_test = _repeat_first(client_test, client2idx)

print(input_client_dev[0])

# Session
input_session_trn = _repeat_first(session_trn, session2idx)
input_session_dev = _repeat_first(session_dev, session2idx)
input_session_test = _repeat_first(session_test, session2idx)

print(input_session_dev[0])

# Time (already numeric, no lookup)
input_time_trn = _repeat_first(time_trn)
input_time_dev = _repeat_first(time_dev)
input_time_test = _repeat_first(time_test)

print(input_time_dev[0])

# Unique-sentence index of the exercise
input_unique_exid_trn = _repeat_first(unique_ex_id_trn)
input_unique_exid_dev = _repeat_first(unique_ex_id_dev)
input_unique_exid_test = _repeat_first(unique_ex_id_test)

print(input_unique_exid_dev[0])

In [0]:
# unique sentences prep

tokenized_unique_sentences = [tokenizer.tokenize(sent) for sent in sentence_dictionary]
_unique_sentence_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_unique_sentences]
input_ids_unique_sentences = pad_sequences(_unique_sentence_ids, maxlen=MAX_LEN,
                                           dtype="long", truncating="post", padding="post")
# 1.0 only where the id is neither padding (0) nor 101 / 102
# (presumably the tokenizer's [CLS] / [SEP] ids - TODO confirm).
attention_masks_unique_sentences = [
    [float(i > 0 and i != 101 and i != 102) for i in ii]
    for ii in input_ids_unique_sentences
]

unique_sent_inputs = torch.tensor(input_ids_unique_sentences)
unique_sent_masks = torch.tensor(attention_masks_unique_sentences)

# Sequential sampling keeps the batches in dictionary order.
unique_sent_data = TensorDataset(unique_sent_inputs, unique_sent_masks)
unique_sent_sampler = SequentialSampler(unique_sent_data)
unique_sent_dataloader = DataLoader(unique_sent_data, sampler=unique_sent_sampler, batch_size=bs)



In [0]:
def _encode_tags(label_rows):
    """Map labels through tag2idx and pad / truncate each row to MAX_LEN (pad value 0)."""
    tag_rows = [[tag2idx.get(l) for l in lab] for lab in label_rows]
    return pad_sequences(tag_rows, maxlen=MAX_LEN, value=0.0, padding="post",
                         dtype="long", truncating="post")


tags_trn = _encode_tags(labels_trn)
tags_dev = _encode_tags(labels_dev)
tags_test = _encode_tags(labels_test)

In [0]:
def _mask_above(id_rows, threshold):
    """Float mask per position: 1.0 where the id is greater than ``threshold``, else 0.0."""
    return [[float(i > threshold) for i in ii] for ii in id_rows]


# Model mask: positions whose vocabulary index is above 1 (0 and 1 are CLS and SEP).
model_masks_trn = _mask_above(input_tokens_trn, 1)
model_masks_dev = _mask_above(input_tokens_dev, 1)
model_masks_test = _mask_above(input_tokens_test, 1)

# Attention mask: positions with a non-zero tokenizer id.
attention_masks_trn = _mask_above(input_ids_trn, 0)
attention_masks_dev = _mask_above(input_ids_dev, 0)
attention_masks_test = _mask_above(input_ids_test, 0)

In [0]:
# # MERGE TRAIN AND DEV SETS

def _train_plus_dev(train_part, dev_part):
    """Stack the dev rows underneath the train rows."""
    return np.concatenate((train_part, dev_part))


input_ids_trndev = _train_plus_dev(input_ids_trn, input_ids_dev)
input_unique_exid_trndev = _train_plus_dev(input_unique_exid_trn, input_unique_exid_dev)
input_tokens_trndev = _train_plus_dev(input_tokens_trn, input_tokens_dev)
input_userids_trndev = _train_plus_dev(input_userids_trn, input_userids_dev)
input_format_trndev = _train_plus_dev(input_format_trn, input_format_dev)
input_days_trndev = _train_plus_dev(input_days_trn, input_days_dev)
input_country_trndev = _train_plus_dev(input_country_trn, input_country_dev)
input_client_trndev = _train_plus_dev(input_client_trn, input_client_dev)
input_session_trndev = _train_plus_dev(input_session_trn, input_session_dev)
input_time_trndev = _train_plus_dev(input_time_trn, input_time_dev)
tags_trndev = _train_plus_dev(tags_trn, tags_dev)
attention_masks_trndev = _train_plus_dev(attention_masks_trn, attention_masks_dev)
model_masks_trndev = _train_plus_dev(model_masks_trn, model_masks_dev)

In [0]:
# Pick the source arrays for the selected mode. Both tuples use the column order
# expected by the training loop:
# ids, exercise ids, tokens, users, format, days, country, client, session, time,
# attention mask, model mask, tags
if USE_TRAIN_AND_DEV:
    _train_arrays = (input_ids_trn, input_unique_exid_trn, input_tokens_trn, input_userids_trn,
                     input_format_trn, input_days_trn, input_country_trn, input_client_trn,
                     input_session_trn, input_time_trn, attention_masks_trn, model_masks_trn, tags_trn)
    _valid_arrays = (input_ids_dev, input_unique_exid_dev, input_tokens_dev, input_userids_dev,
                     input_format_dev, input_days_dev, input_country_dev, input_client_dev,
                     input_session_dev, input_time_dev, attention_masks_dev, model_masks_dev, tags_dev)

elif USE_TRAIN_AND_DEV_AND_TEST:
    _train_arrays = (input_ids_trndev, input_unique_exid_trndev, input_tokens_trndev, input_userids_trndev,
                     input_format_trndev, input_days_trndev, input_country_trndev, input_client_trndev,
                     input_session_trndev, input_time_trndev, attention_masks_trndev, model_masks_trndev,
                     tags_trndev)
    _valid_arrays = (input_ids_test, input_unique_exid_test, input_tokens_test, input_userids_test,
                     input_format_test, input_days_test, input_country_test, input_client_test,
                     input_session_test, input_time_test, attention_masks_test, model_masks_test, tags_test)

else:
    # Previously this fell through and failed below with a NameError on tr_inputs.
    raise ValueError("Set USE_TRAIN_AND_DEV or USE_TRAIN_AND_DEV_AND_TEST to True.")

(tr_inputs, tr_unique_exid_inputs, tr_tokens, tr_userids, tr_format, tr_days, tr_country,
 tr_client, tr_session, tr_time, tr_masks, tr_model_masks, tr_tags) = [torch.tensor(a) for a in _train_arrays]

(val_inputs, val_unique_exid_inputs, val_tokens, val_userids, val_format, val_days, val_country,
 val_client, val_session, val_time, val_masks, val_model_masks, val_tags) = [torch.tensor(a) for a in _valid_arrays]

# Training batches are drawn in random order ...
train_data = TensorDataset(tr_inputs, tr_unique_exid_inputs, tr_tokens, tr_userids, tr_format, tr_days, tr_country, tr_client, tr_session, tr_time, tr_masks, tr_model_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# ... evaluation batches in their original order.
valid_data = TensorDataset(val_inputs, val_unique_exid_inputs, val_tokens, val_userids, val_format, val_days, val_country, val_client, val_session, val_time, val_masks, val_model_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

## Train model

In [0]:
# Instantiate the custom model from the "distilbert-base-uncased" checkpoint.
# num_labels comes from the label dictionary; the model's __init__ asserts that it equals 2.
model = MyCustomDistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(tag2idx), output_hidden_states=True)

In [0]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

In [0]:
# Healthcheck: compare tensors before & after training to check whether they have been updated

before_embedding_userid = model.embedding_userID.weight.clone()
before_embedding_format = model.embedding_format.weight.clone()
before_embedding_token = model.embedding_token.weight.clone()

# bert_gru is only created for the DistilBERT + BiGRU configuration; with any other
# encoder the attribute does not exist and the old unconditional access raised AttributeError.
# The weights are cloned: keeping a reference to the live parameters would make every
# later "before vs. after" comparison trivially equal.
if hasattr(model, 'bert_gru'):
    before_embedding_bert_gru = [[w.clone() for w in layer_weights]
                                 for layer_weights in model.bert_gru.all_weights]
else:
    before_embedding_bert_gru = None

In [0]:
# Healthcheck: print layer 3

_layer_3 = model.distilbert.transformer.layer[3]
for param in _layer_3.named_parameters():  # each item is a (name, parameter) pair
    print(param)

In [0]:
# Healthcheck: print layer 5

_layer_5 = model.distilbert.transformer.layer[5]
for param in _layer_5.named_parameters():  # each item is a (name, parameter) pair
    print(param)

In [0]:
# Finetuning all parameters

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# 'gamma' / 'beta' are kept for compatibility but match no PyTorch parameter name;
# the layer-norm entries cover the names presumably used by DistilBERT's blocks - TODO confirm.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    # torch.optim.Adam reads the per-group option 'weight_decay'. The former key
    # 'weight_decay_rate' was stored but never used, so no decay was applied at all.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
]

optimizer = Adam(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [0]:
# SELECT LAYERS WITHOUT FINETUNING

# Transformer blocks 0-5 plus the embedding module are excluded from gradient updates.
_frozen_modules = [model.distilbert.transformer.layer[idx] for idx in range(6)]
_frozen_modules.append(model.distilbert.embeddings)

for _module in _frozen_modules:
    for param in _module.parameters():
        param.requires_grad = False

In [0]:
# Use the device chosen earlier (GPU when available, else CPU) instead of
# unconditionally calling .cuda(), which fails on CPU-only machines.
model.to(device);

epochs = NUM_EPOCHS
max_grad_norm = 1.0

# Per-epoch history for later plotting
plot_train_loss = []
plot_train_acc = []
plot_val_loss = []
plot_val_acc = []
plot_val_auc = []
plot_val_f1 = []

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, b_input_mask, b_model_mask, b_labels = batch

        # forward pass
        # .float() to fix error message "float expected, got double"
        loss = model(b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days.float(), b_country, b_client, b_session, b_time.float(), token_type_ids=None,
                     attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None, labels=b_labels)

        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    plot_train_loss.append(tr_loss/nb_tr_steps)


    # VALIDATION
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels, actual_labels = [], [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, b_input_mask, b_model_mask, b_labels = batch

        # Same dtype handling as the training pass: days/time may arrive as double.
        features = (b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days.float(),
                    b_country, b_client, b_session, b_time.float())

        with torch.no_grad():
            # The model returns EITHER the loss (labels given) OR the scores, hence two passes.
            tmp_eval_loss = model(*features, token_type_ids=None,
                                  attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None, labels=b_labels)
            logits = model(*features, token_type_ids=None,
                           attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None)

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        model_mask = b_model_mask.to('cpu').numpy()

        true_labels.append(label_ids)

        flattened_masks = flatten(model_mask)
        flattened_labels = flatten(label_ids)
        flattened_logits = flatten(flatten(logits))  # scores carry a trailing axis of size 1

        # Keep only the positions the model mask marks as active.
        zipped_vecs = zip(flattened_masks, flattened_labels, flattened_logits)
        filtered_vecs = [(x, y, z) for x, y, z in zipped_vecs if x > 0]
        if not filtered_vecs:
            # A batch without any active position cannot be unpacked or scored; skip it.
            continue
        filtered_masks, filtered_labels, filtered_logits = zip(*filtered_vecs)

        tmp_eval_accuracy = flat_accuracy(filtered_logits, filtered_labels)

        actual_labels.extend(filtered_labels)
        predictions.extend(filtered_logits)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    # max(..., 1) avoids a division by zero if every batch was skipped.
    eval_loss = eval_loss/max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy/max(nb_eval_steps, 1)

    plot_val_loss.append(eval_loss)
    plot_val_acc.append(eval_accuracy)

    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy))

    # NOTE(review): `f1_score` below rebinds the name imported from seqeval.metrics;
    # the names are kept because later cells may read them.
    metrics = evaluate_metrics(actual_labels, predictions)
    loss_score = round(metrics['avglogloss'],4)
    accuracy_score = round(metrics['accuracy'],4)
    auc_score = round(metrics['auroc'], 4)
    f1_score = round(metrics['F1'], 4)

    print("Validation AvgLogLoss: {}".format(loss_score))
    print("Validation Accuracy: {}".format(accuracy_score))
    print("Validation AUC: {}".format(auc_score))
    print("Validation F1-Score: {}".format(f1_score))

    plot_val_auc.append(auc_score)
    plot_val_f1.append(f1_score)
   

In [0]:
# Healthcheck: print every (name, parameter) pair of transformer layer 3

for param_name, param_tensor in model.distilbert.transformer.layer[3].named_parameters():
    print((param_name, param_tensor))

In [0]:
# Healthcheck: print every (name, parameter) pair of transformer layer 5

for param_name, param_tensor in model.distilbert.transformer.layer[5].named_parameters():
    print((param_name, param_tensor))

In [0]:
# Per-epoch curves of the training run: loss (train vs. validation),
# validation accuracy, validation AUC and validation F1, one panel each.
fig, axes = plt.subplots(2, 2, figsize=(10, 7))
loss_ax, acc_ax, auc_ax, f1_ax = axes.ravel()

# "Loss"
loss_ax.plot(plot_train_loss)
loss_ax.plot(plot_val_loss)
loss_ax.set_title('model loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
loss_ax.legend(['train', 'validation'], loc='upper left')
# The tick range has to span BOTH curves: using only min(train) and max(validation)
# leaves part of a curve without ticks whenever the two losses cross.
all_losses = list(plot_train_loss) + list(plot_val_loss)
tick_start = math.floor(min(all_losses) * 10) / 10.0
tick_stop = math.ceil(max(all_losses) * 10) / 10.0
# np.arange excludes its stop value; half a step of slack keeps the top tick.
loss_ax.set_yticks(np.arange(tick_start, tick_stop + 0.025, 0.05))

#  "Accuracy"
acc_ax.plot(plot_val_acc)
acc_ax.set_title('model accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('epoch')
acc_ax.legend(['validation'], loc='upper left')

#  "AUC"
auc_ax.plot(plot_val_auc)
auc_ax.set_title('AUC score')
auc_ax.set_ylabel('score')
auc_ax.set_xlabel('epoch')
auc_ax.legend(['AUC'], loc='upper left')

#  "F1"
f1_ax.plot(plot_val_f1)
f1_ax.set_title('F1-score')
f1_ax.set_ylabel('score')
f1_ax.set_xlabel('epoch')
f1_ax.legend(['F1-score'], loc='upper left')

fig.tight_layout()
plt.show()

In [0]:
# Training model again (stack and finetune)

# Second-stage hyperparameters; these deliberately override the values from the
# configuration cell for the fine-tuning run.
LEARNING_RATE = 1e-6
NUM_EPOCHS = 30

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings are excluded from weight decay.
# NOTE(review): 'gamma'/'beta' look like legacy LayerNorm parameter names; confirm they
# match anything in this model's named_parameters().
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    # torch.optim.Adam reads the per-group option `weight_decay`; an unknown key such as
    # `weight_decay_rate` is stored in the group but never used, i.e. no decay is applied.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
]

optimizer = Adam(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [0]:
# Second training stage: train `model` for NUM_EPOCHS epochs with the current `optimizer`
# and evaluate on `valid_dataloader` after every epoch.
# Each batch carries 13 tensors (including `b_model_mask`), exactly as in the first
# training loop that consumes the same dataloaders.
model.to(device)

epochs = NUM_EPOCHS
max_grad_norm = 1.0

# Per-epoch histories, read by the plotting cell that follows.
plot_train_loss = []
plot_train_acc = []
plot_val_loss = []
plot_val_acc = []
plot_val_auc = []
plot_val_f1 = []

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, b_input_mask, b_model_mask, b_labels = batch

        # forward pass (with labels the model returns the loss)
        loss = model(b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days.float(), b_country, b_client, b_session, b_time.float(), token_type_ids=None, # .float() to fix error message "float expected, got double"
                     attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None, labels=b_labels)

        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    plot_train_loss = plot_train_loss + [tr_loss/nb_tr_steps]


    # VALIDATION on validation set
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Reset every epoch, so after the loop these hold the last epoch's evaluation only.
    predictions , true_labels, actual_labels = [], [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, b_input_mask, b_model_mask, b_labels = batch

        with torch.no_grad():
            # The model is called twice: with labels it yields the loss, without them the logits.
            tmp_eval_loss = model(b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, token_type_ids=None,
                                  attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None, labels=b_labels)
            logits = model(b_input_ids, b_unique_exids, b_token, b_userids, b_format, b_days, b_country, b_client, b_session, b_time, token_type_ids=None,
                           attention_mask=b_input_mask, model_mask=b_model_mask, head_mask=None)

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        model_mask = b_model_mask.to('cpu').numpy()

        true_labels.append(label_ids)

        flattened_masks = flatten(model_mask)
        flattened_labels = flatten(label_ids)
        flattened_logits = flatten(flatten(logits))

        # Keep only positions whose model mask is positive, as the first training loop does.
        zipped_vecs = zip(flattened_masks, flattened_labels, flattened_logits)
        filtered_vecs = [(x, y, z) for x, y, z in zipped_vecs if x > 0]
        filtered_masks, filtered_labels, filtered_logits = zip(*filtered_vecs)

        tmp_eval_accuracy = flat_accuracy(filtered_logits, filtered_labels)

        actual_labels.extend(filtered_labels)
        predictions.extend(filtered_logits)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    # Averages over batches (not over tokens).
    eval_loss = eval_loss/nb_eval_steps
    eval_accuracy = eval_accuracy/nb_eval_steps

    plot_val_loss = plot_val_loss + [eval_loss]
    plot_val_acc = plot_val_acc + [eval_accuracy]

    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy))


    # Token-level metrics over the whole validation set.
    # NOTE(review): `f1_score` below rebinds the name imported from seqeval.metrics earlier
    # in the notebook; rename if that function is still needed after this cell.
    metrics = evaluate_metrics(actual_labels, predictions)
    loss_score = round(metrics['avglogloss'],4)
    accuracy_score = round(metrics['accuracy'],4)
    auc_score = round(metrics['auroc'], 4)
    f1_score = round(metrics['F1'], 4)

    print("Validation AvgLogLoss: {}".format(loss_score))
    print("Validation Accuracy: {}".format(accuracy_score))
    print("Validation AUC: {}".format(auc_score))
    print("Validation F1-Score: {}".format(f1_score))

    plot_val_auc = plot_val_auc + [auc_score]
    plot_val_f1 = plot_val_f1 + [f1_score]

In [0]:
# Per-epoch curves of the fine-tuning run: loss (train vs. validation),
# validation accuracy, validation AUC and validation F1, one panel each.
fig, axes = plt.subplots(2, 2, figsize=(10, 7))
loss_ax, acc_ax, auc_ax, f1_ax = axes.ravel()

# "Loss"
loss_ax.plot(plot_train_loss)
loss_ax.plot(plot_val_loss)
loss_ax.set_title('model loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
loss_ax.legend(['train', 'validation'], loc='upper left')
# The tick range has to span BOTH curves: using only min(train) and max(validation)
# leaves part of a curve without ticks whenever the two losses cross.
all_losses = list(plot_train_loss) + list(plot_val_loss)
tick_start = math.floor(min(all_losses) * 10) / 10.0
tick_stop = math.ceil(max(all_losses) * 10) / 10.0
# np.arange excludes its stop value; half a step of slack keeps the top tick.
loss_ax.set_yticks(np.arange(tick_start, tick_stop + 0.025, 0.05))

#  "Accuracy"
acc_ax.plot(plot_val_acc)
acc_ax.set_title('model accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('epoch')
acc_ax.legend(['validation'], loc='upper left')

#  "AUC"
auc_ax.plot(plot_val_auc)
auc_ax.set_title('AUC score')
auc_ax.set_ylabel('score')
auc_ax.set_xlabel('epoch')
auc_ax.legend(['AUC'], loc='upper left')

#  "F1"
f1_ax.plot(plot_val_f1)
f1_ax.set_title('F1-score')
f1_ax.set_ylabel('score')
f1_ax.set_xlabel('epoch')
f1_ax.legend(['F1-score'], loc='upper left')

fig.tight_layout()
plt.show()

In [0]:
def compute_avg_log_loss(actual, predicted):
    """
    Computes the average log loss of your predictions.

    Args:
        actual: sequence of ground-truth labels; a value > 0.5 counts as the
            positive class, anything else as the negative class.
        predicted: sequence of predicted positive-class probabilities, aligned
            index-by-index with `actual`.

    Returns:
        The mean of -log(p) over all items, where p is the probability the
        prediction assigned to the true class.

    Raises:
        ValueError: if `actual` is empty (the average is undefined).
    """
    num = len(actual)
    if num == 0:
        raise ValueError("compute_avg_log_loss() needs at least one label/prediction pair")

    # Floor for the true-class probability: a fully confident wrong prediction
    # (p == 0) would otherwise make math.log raise instead of yielding a large loss.
    eps = 1e-15
    loss = 0.

    for i in range(num):
        p = predicted[i] if actual[i] > .5 else 1. - predicted[i]
        loss -= math.log(max(p, eps))
    loss /= num
    return loss

In [0]:
# Average log loss over the mask-filtered labels and predictions collected by the
# validation loop (both lists are reset every epoch, so this scores the last epoch).
compute_avg_log_loss(actual_labels, predictions)

## Analysis of results

In [0]:
# Print each evaluated token's gold label followed by the model's prediction.
# `actual_labels` and `predictions` are extended together, entry by entry, in the
# validation loop, so they are aligned. `true_labels` is a list of per-batch label
# arrays and cannot be indexed with a flat token position.
for gold_label, prediction in zip(actual_labels, predictions):
  print(gold_label)
  print(prediction)

In [0]:
# Inspect the batch left in `label_ids` / `logits` by the validation loop: print every
# row whose predicted classes contain a 2 before the first 0 in the gold labels.
# Rows without any 0 label are scanned over their full length.
predicted_classes = np.argmax(logits, axis=2)  # computed once instead of per use

for i in range(len(label_ids)):
  zero_positions = np.where(label_ids[i] == 0)[0]
  # Without this fallback the no-zero case would read a `first_zero` left over from an
  # earlier row (or not defined at all).
  scan_end = zero_positions[0] if len(zero_positions) > 0 else len(label_ids[i])
  if 2 in predicted_classes[i][:scan_end]:
      print(predicted_classes[i])
      print(label_ids[i])
      print(" ")

In [0]:
# Move the model to the CPU before reading its weights.
model.to('cpu')

# Copies of the three embedding tables; .clone() makes each one a separate tensor,
# so later changes to the model's weights do not alter these snapshots.
after_embedding_userid = model.embedding_userID.weight.clone()
after_embedding_format = model.embedding_format.weight.clone()
after_embedding_token = model.embedding_token.weight.clone()
# NOTE(review): unlike the three lines above this is NOT cloned, so it keeps
# referencing the live weights of `model.bert_gru` -- confirm that is intended.
after_embedding_bert_gru = model.bert_gru.all_weights

In [0]:
# Distribution of actual errors

# Column-wise sum of the label rows in `labels_test`, each row zero-padded to n_words.
n_words = 11
results = [0 for _ in range(n_words)]

for row in labels_test:
  row = row + [0]*(n_words-len(row))
  results = [running_total + label for running_total, label in zip(results, row)]

print(results)
print(sum(results))

# Drop the first position before plotting.
results = results[1:]

def plot_bar_x():
    """Bar chart of the per-position sums in `results`, positions numbered from 1."""
    positions = np.arange(1, len(results) + 1)
    plt.bar(positions, results)
    plt.xticks(positions, fontsize=10, rotation=30)
    plt.xlabel('Position in sentence', fontsize=10)
    plt.ylabel('# of actual mistakes', fontsize=10)
    plt.show()

plot_bar_x()

In [0]:
# Number of sentences vs. Number of tokens

# n_sentences_per_length[k] counts the rows of `labels_test` whose length is k + 3.
n_sentences_per_length = [0 for _ in range(20)]

for row in labels_test:
  # -2 because of the start and end tags, -1 because of zero-indexing in python
  bucket = len(row) - 2 - 1
  n_sentences_per_length[bucket] += 1

# Only the first ten buckets are kept for the chart.
n_sentences_per_length = n_sentences_per_length[:10]

print(n_sentences_per_length)

def plot_bar_x():
    """Bar chart of `n_sentences_per_length`, sentence lengths numbered from 1."""
    lengths = np.arange(1, len(n_sentences_per_length) + 1)
    plt.bar(lengths, n_sentences_per_length)
    plt.xticks(lengths, fontsize=10, rotation=30)
    plt.xlabel('Sentence length', fontsize=10)
    plt.ylabel('# of sentences', fontsize=10)
    plt.show()

plot_bar_x()

In [0]:
# A sentence of length k contributes one token to each of the positions 1..k, so the
# token count at a position is the number of sentences at least that long.
n_tokens_per_position = [0 for _ in range(10)]

for position in range(len(n_sentences_per_length)):
  n_tokens_per_position[position] = sum(n_sentences_per_length[position:])

print(n_tokens_per_position)
print(results)

# Share of tokens at each position counted in `results`.
perc_actual_mistakes = [
    np.round(mistakes/n_tokens_per_position[position], 3)
    for position, mistakes in enumerate(results)
]

print(perc_actual_mistakes)

In [0]:
# Distribution of actual errors (%)

def plot_bar_x():
    """Bar chart of `perc_actual_mistakes`, positions numbered from 1."""
    positions = np.arange(1, len(perc_actual_mistakes) + 1)
    plt.bar(positions, perc_actual_mistakes)
    plt.xticks(positions, fontsize=10, rotation=30)
    plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7], fontsize=10)
    plt.xlabel('Position in sentence', fontsize=10)
    plt.ylabel('% of actual mistakes', fontsize=10)
    plt.show()

plot_bar_x()

In [0]:
# Threshold the predicted scores at 0.5 to get hard 0/1 decisions.
predictions_one_zero = [int(score >= 0.5) for score in predictions]
# Second name for the same list object (no copy is made).
predictions_backup = predictions

In [0]:
# Transform the flat 0/1 prediction vector into one list per row of `labels_test`

predictions_vec = predictions_one_zero

predictions_list = []
previous_idx = 0

for row in labels_test:
  # each row consumes len(row) - 2 consecutive entries of the flat vector
  idx = previous_idx + len(row) - 2
  predictions_list.append(predictions_vec[previous_idx:idx])
  previous_idx = idx


In [0]:
# Distribution of predicted errors

# Column-wise sum of the rows in `predictions_list`, each row zero-padded to n_words.
n_words = 10
results = [0 for _ in range(n_words)]

for row in predictions_list:
  row = row + [0]*(n_words-len(row))
  results = [running_total + flag for running_total, flag in zip(results, row)]

print(results)
print(sum(results))

def plot_bar_x():
    """Bar chart of the per-position sums in `results`, positions numbered from 1."""
    positions = np.arange(1, len(results) + 1)
    plt.bar(positions, results)
    plt.xticks(positions, fontsize=10, rotation=30)
    plt.xlabel('Position in sentence', fontsize=10)
    plt.ylabel('# of predicted mistakes', fontsize=10)
    plt.show()

plot_bar_x()

In [0]:
# Share of tokens at each position counted in `results`.
perc_predicted_mistakes = [
    np.round(flagged/n_tokens_per_position[position], 3)
    for position, flagged in enumerate(results)
]

In [0]:
# Distribution of predicted errors (%)

def plot_bar_x():
    """Bar chart of `perc_predicted_mistakes`, positions numbered from 1."""
    positions = np.arange(1, len(perc_predicted_mistakes) + 1)
    plt.bar(positions, perc_predicted_mistakes)
    plt.xticks(positions, fontsize=10, rotation=30)
    plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7], fontsize=10)
    plt.xlabel('Position in sentence', fontsize=10)
    plt.ylabel('% of predicted mistakes', fontsize=10)
    plt.show()

plot_bar_x()

### Error analysis

In [0]:
# Every row of `labels_test` with its first and last entry removed.
labels_test_no_first_last = list(map(lambda label_row: label_row[1:-1], labels_test))

In [0]:
# Per-position tally of how many predictions match / do not match the gold label.
# At least 15 slots as before, but grown to the longest prediction row so that long
# sentences cannot index past the end of the counters.
n_positions = max([15] + [len(row) for row in predictions_list])
correct_predictions = [0]*n_positions
wrong_predictions = [0]*n_positions
number_of_predictions = [0]*n_positions

for i, row in enumerate(predictions_list):
  for j, prediction in enumerate(row):
    number_of_predictions[j] = number_of_predictions[j] + 1
    # A prediction either equals its label or it does not; there is no third case.
    if prediction == labels_test_no_first_last[i][j]:
      correct_predictions[j] = correct_predictions[j] + 1
    else:
      wrong_predictions[j] = wrong_predictions[j] + 1

print(correct_predictions[0:10])
print(number_of_predictions[0:10])
print(wrong_predictions[0:10])
# Sanity check: every prediction was counted exactly once.
print(sum(correct_predictions) + sum(wrong_predictions) == sum(number_of_predictions))

# Error rate for the first ten positions; a position without any prediction gets 0.0
# instead of dividing by zero.
model_errors = [
    np.round(wrong/number_of_predictions[i], 3) if number_of_predictions[i] else 0.0
    for i, wrong in enumerate(wrong_predictions[0:10])
]
print(model_errors)

In [0]:
# Distribution of model prediction errors (%)

def plot_bar_x():
    """Bar chart of `model_errors`, positions numbered from 1."""
    positions = np.arange(1, len(model_errors) + 1)
    plt.bar(positions, model_errors)
    plt.xticks(positions, fontsize=10, rotation=30)
    plt.yticks([0.0,0.1,0.2,0.3], fontsize=10)
    plt.xlabel('Position in sentence', fontsize=10)
    plt.ylabel('% of predictions wrong', fontsize=10)
    plt.show()

plot_bar_x()

In [0]:
# Per-position confusion counts (prediction vs. gold label, both expected to be 0 or 1).
true_positives = [0 for _ in range(15)]
false_positives = [0 for _ in range(15)]
true_negatives = [0 for _ in range(15)]
false_negatives = [0 for _ in range(15)]
number_of_predictions = [0 for _ in range(15)]

for i, row in enumerate(predictions_list):
  for j, prediction in enumerate(row):
    number_of_predictions[j] += 1

    if prediction == 1 and labels_test_no_first_last[i][j] == 1:
      true_positives[j] += 1
    elif prediction == 1 and labels_test_no_first_last[i][j] == 0:
      false_positives[j] += 1
    elif prediction == 0 and labels_test_no_first_last[i][j] == 1:
      false_negatives[j] += 1
    elif prediction == 0 and labels_test_no_first_last[i][j] == 0:
      true_negatives[j] += 1
    else:
      # any value other than 0/1 ends the processing of this row
      break

for counts in (true_positives, false_positives, true_negatives, false_negatives):
  print(counts[0:10])

In [0]:
# Distribution of model prediction errors

def plot_bar_x():
    """Plot the absolute confusion counts for the first ten sentence positions.

    Left panel: true positives, false positives and false negatives side by side.
    Right panel: true negatives on their own axis.
    """
    fig, ax = plt.subplots(ncols=2, figsize=(15,4),)

    index = np.arange(len(true_positives[0:10]))+1
    ax[0].bar(index - 0.25, true_positives[0:10], width = 0.2, color = "green", label = "True Positives")
    ax[0].bar(index + 0.00, false_positives[0:10], width = 0.2, label = "False Positives")
    ax[0].bar(index + 0.25, false_negatives[0:10], width = 0.2, label = "False Negatives")
    ax[0].legend()
    ax[0].set_xlabel('Position in sentence', fontsize=10)
    # These bars are raw counts, so the axis is labelled '#', not '%'.
    ax[0].set_ylabel('# of occurrences', fontsize=10)
    ax[0].set_xticks(index)

    ax[1].bar(index, true_negatives[0:10], width = 0.2, color = "red", label = "True Negatives")
    ax[1].legend()
    ax[1].set_xlabel('Position in sentence', fontsize=10)
    ax[1].set_ylabel('# of occurrences', fontsize=10)
    ax[1].set_xticks(index)

    # plt.show() rather than fig.show(): Figure.show() needs a GUI backend and only
    # warns under the inline backend this notebook selects.
    plt.show()


plot_bar_x()

In [0]:
# Confusion counts of the first ten positions as a share of the predictions made there.
perc_true_positives = [np.round(count/total, 3) for count, total in zip(true_positives[:10], number_of_predictions)]
perc_false_positives = [np.round(count/total, 3) for count, total in zip(false_positives[:10], number_of_predictions)]
perc_true_negatives = [np.round(count/total, 3) for count, total in zip(true_negatives[:10], number_of_predictions)]
perc_false_negatives = [np.round(count/total, 3) for count, total in zip(false_negatives[:10], number_of_predictions)]

In [0]:
# Distribution of model prediction errors (%)

def plot_bar_x():
    """Plot the confusion shares for the first ten sentence positions.

    Left panel: shares of true positives, false positives and false negatives.
    Right panel: share of true negatives on its own axis.
    """
    fig, ax = plt.subplots(ncols=2, figsize=(15,4),)

    index = np.arange(len(true_positives[0:10]))+1
    ax[0].bar(index - 0.25, perc_true_positives[0:10], width = 0.2, color = "green", label = "True Positives")
    ax[0].bar(index + 0.00, perc_false_positives[0:10], width = 0.2, label = "False Positives")
    ax[0].bar(index + 0.25, perc_false_negatives[0:10], width = 0.2, label = "False Negatives")
    ax[0].legend()
    ax[0].set_xlabel('Position in sentence', fontsize=10)
    ax[0].set_ylabel('% of occurrences', fontsize=10)
    ax[0].set_xticks(index)

    ax[1].bar(index, perc_true_negatives[0:10], width = 0.2, color = "red", label = "True Negatives")
    ax[1].legend()
    ax[1].set_xlabel('Position in sentence', fontsize=10)
    ax[1].set_ylabel('% of occurrences', fontsize=10)
    ax[1].set_xticks(index)

    # plt.show() rather than fig.show(): Figure.show() needs a GUI backend and only
    # warns under the inline backend this notebook selects.
    plt.show()


plot_bar_x()