In this notebook, I propose a more sophisticated model that achieves better validation F1 score (see end of the notebook)

Improvements include:

- advanced text processing and cleaning
- a custom tokenizer based on Spacy and its english language model
- a FastText binary model to initialize an embedding matrix from character-ngrams (i.e. no out-of-vocabulary words)
- use of a stacked bi-LSTM architecture

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi=False

In [7]:
import re

# pandas and numpy for dataframes and array manipulations
# tqdm as a progress
# matplotlib for plotting

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

from matplotlib import pyplot as plt

# usual PyTorch imports for tensor manipulations, neural networks and data processings
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# import some sklearn utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# import keras tokenizing utilities 
from keras.preprocessing import text, sequence

# import tensorboardX in case we want to log metrics to tensorboard (requires tensorflow installed - optional)
from tensorboardX import SummaryWriter

from graphviz import Digraph
from torchviz import make_dot


# import spacy for tokenization
import spacy

# fastText is a library for efficient learning of word representations and sentence classification
# https://github.com/facebookresearch/fastText/tree/master/python
# I use it with a pre-trained english embedding that you can fetch from the official website
import fastText

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
# load english spacy model and disable ner, parser, tagger to make it faster

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])

In [26]:
# we load train and test data and separate the target in another variable

X_train = pd.read_csv('./data/train.csv')
X_test = pd.read_csv('./data/test.csv')
Y = X_train['target'].values

we start by preprocessing the text: 

In [27]:
# decontract

def decontract(text):
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)
    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)isn(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)
    return text

X_train['question_text'] = X_train['question_text'].progress_map(lambda q: decontract(q))
X_test['question_text'] = X_test['question_text'].progress_map(lambda q: decontract(q)) 

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [29]:
# clean apostrophes

def clean_apostrophes(x):
    apostrophes = ["’", "‘", "´", "`"]
    for s in apostrophes:
        x = re.sub(s, "'", x)
    return x


X_train['question_text'] = X_train['question_text'].progress_map(lambda q: clean_apostrophes(q))
X_test['question_text'] = X_test['question_text'].progress_map(lambda q: clean_apostrophes(q)) 

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [32]:
# clean weird / special characters

letter_mapping = {'\u200b':' ', 'ũ': "u", 'ẽ': 'e', 'é': "e", 'á': "a", 'ķ': 'k', 'ï': 'i', 'Ź': 'Z', 'Ż': 'Z', 'Š': 'S', 'Π': ' pi ', 'Ö': 'O', 'É': 'E', 'Ñ': 'N', 'Ž': 'Z', 'ệ': 'e', '²': '2', 'Å': 'A', 'Ā': 'A', 'ế': 'e', 'ễ': 'e', 'ộ': 'o', '⧼': '<', '⧽': '>', 'Ü': 'U', 'Δ': 'delta', 'ợ': 'o', 'İ': 'I', 'Я': 'R', 'О': 'O', 'Č': 'C', 'П': 'pi', 'В': 'B', 'Φ': 'phi', 'ỵ': 'y', 'օ': 'o', 'Ľ': 'L', 'ả': 'a', 'Γ': 'theta', 'Ó': 'O', 'Í': 'I', 'ấ': 'a', 'ụ': 'u', 'Ō': 'O', 'Ο': 'O', 'Σ': 'sigma', 'Â': 'A', 'Ã': 'A', 'ᗯ': 'w', 'ᕼ': "h", "ᗩ": "a", "ᖇ": "r", "ᗯ": "w", "O": "o", "ᗰ": "m", "ᑎ": "n", "ᐯ": "v", "н": "h", "м": "m", "o": "o", "т": "t", "в": "b", "υ": "u",  "ι": "i","н": "h", "č": "c", "š": "s", "ḥ": "h", "ā": "a", "ī": "i", "à": "a", "ý": "y", "ò": "o", "è": "e", "ù": "u", "â": "a", "ğ": "g", "ó": "o", "ê": "e", "ạ": "a", "ü": "u", "ä": "a", "í": "i", "ō": "o", "ñ": "n", "ç": "c", "ã": "a", "ć": "c", "ô": "o", "с": "c", "ě": "e", "æ": "ae", "î": "i", "ő": "o", "å": "a", "Ä": "A", } 

def clean_special_chars(text):
    new_text = ''
    for i in range(len(text)):
        if i in letter_mapping:
            c = letter_mapping[i]
        else:
            c = text[i]
        new_text += c
    return new_text


X_train['question_text'] = X_train['question_text'].progress_map(lambda q: clean_special_chars(q))
X_test['question_text'] = X_test['question_text'].progress_map(lambda q: clean_special_chars(q)) 

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [41]:
# remove useless punctuations

useless_punct = ['च', '不', 'ঢ়', '平', 'ᠠ', '錯', '判', '∙', '言', 'ς', 'ل', '្', 'ジ', 'あ', '得', '水', 'ь', '◦', '创', '康', '華', 'ḵ', '☺', '支', '就', '„', '」', '어', '谈', '陈', '团', '腻', '权', '年', '业', 'マ', 'य', 'ا', '売', '甲', '拼', '˂', 'ὤ', '贯', '亚', 'ि', '放', 'ʻ', 'ទ', 'ʖ', '點', '્', '発', '青', '能', '木', 'д', '微', '藤', '̃', '僕', '妒', '͜', 'ន', 'ध', '이', '希', '特', 'ड', '¢', '滢', 'ส', '나', '女', 'క', '没', '什', 'з', '天', '南', 'ʿ', 'ค', 'も', '凰', '步', '籍', '西', 'ำ', '−', 'л', 'ڤ', 'ៃ', '號', 'ص', 'स', '®', 'ʋ', '批', 'រ', '치', '谢', '生', '道', '═', '下', '俄', 'ɖ', '觀', 'வ', '—', 'ی', '您', '♥', '一', 'や', '⊆', 'ʌ', '語', 'ี', '兴', '惶', '瀛', '狐', '⁴', 'प', '臣', 'ద', '―', 'ì', 'ऌ', 'ీ', '自', '信', '健', '受', 'ɨ', '시', 'י', 'ছ', '嬛', '湾', '吃', 'ち', 'ड़', '反', '红', '有', '配', 'ে', 'ឯ', '宮', 'つ', 'μ', '記', '口', '℅ι', 'ो', '狸', '奇', 'о', 'ट', '聖', '蘭', '読', 'ū', '標', '要', 'ត', '识', 'で', '汤', 'ま', 'ʀ', '局', 'リ', '्', 'ไ', '呢', '工', 'ल', '沒', 'τ', 'ិ', 'ö', 'せ', '你', 'ん', 'ュ', '枚', '部', '大', '罗', 'হ', 'て', '表', '报', '攻', 'ĺ', 'ฉ', '∩', '宝', '对', '字', '文', '这', '∑', '髪', 'り', '่', '능', '罢', '내', '阻', '为', '菲', 'ي', 'न', 'ί', 'ɦ', '開', '†', '茹', '做', '東', 'ত', 'に', 'ت', '晓', '키', '悲', 'સ', '好', '›', '上', '存', '없', '하', '知', 'ធ', '斯', ' ', '授', 'ł', '傳', '兰', '封', 'ோ', 'و', 'х', 'だ', '人', '太', '品', '毒', 'ᡳ', '血', '席', '剔', 'п', '蛋', '王', '那', '梦', 'ី', '彩', '甄', 'и', '柏', 'ਨ', '和', '坊', '⌚', '广', '依', '∫', 'į', '故', 'ś', 'ऊ', '几', '日', 'ک', '音', '×', '”', '▾', 'ʊ', 'ज', 'ด', 'ठ', 'उ', 'る', '清', 'ग', 'ط', 'δ', 'ʏ', '官', '∛', '়', '้', '男', '骂', '复', '∂', 'ー', '过', 'য', '以', '短', '翻', 'র', '教', '儀', 'ɛ', '‹', 'へ', '¾', '合', '学', 'ٌ', '학', '挑', 'ष', '比', '体', 'م', 'س', 'អ', 'ת', '訓', '∀', '迎', 'វ', 'ɔ', '٨', '▒', '化', 'చ', '‛', 'প', 'º', 'น', '업', '说', 'ご', '¸', '₹', '儿', '︠', '게', '骨', 'ท', 'ऋ', 'ホ', '茶', '는', 'જ', 'ุ', '羡', '節', 'ਮ', 'উ', '番', 'ড়', '讲', 'ㅜ', '등', '伟', 'จ', '我', 'ล', 'す', 'い', 'ញ', '看', 'ċ', '∧', 'भ', 'ઘ', 'ั', 'ម', '街', 'ય', '还', '鰹', 'ខ', 'ు', '訊', 'म', 'ю', '復', '杨', 'ق', 'त', '金', '味', 'ব', '风', '意', '몇', '佬', '爾', '精', '¶', 'ం', '乱', 'χ', '교', 'ה', '始', 'ᠰ', '了', '个', '克', '্', 'ห', '已', 'ʃ', 'わ', '新', '译', '︡', '本', 'ง', 'б', 'け', 'ి', '明', '¯', '過', 'ك', 'ῥ', 'ف', 'ß', '서', '进', 'ដ', '样', '乐', '寧', '€', 'ณ', 'ル', '乡', '子', 'ﬁ', 'ج', '慕', '–', 'ᡵ', 'Ø', '͡', '제', 'Ω', 'ប', '絕', '눈', 'फ', 'ম', 'గ', '他', 'α', 'ξ', '§', 'ஜ', '黎', 'ね', '복', 'π', 'ú', '鸡', '话', '会', 'ক', '八', '之', '북', 'ن', '¦', '가', 'ו', '恋', '地', 'ῆ', '許', '产', 'ॡ', 'ش', '़', '野', 'ή', 'ɒ', '啧', 'យ', '᠌', 'ᠨ', 'ب', '皎', '老', '公', '☆', 'व', 'ি', 'ល', 'ر', 'គ', '행', 'ង', 'ο', '让', 'ំ', 'λ', 'خ', 'ἰ', '家', 'ট', 'ब', '理', '是', 'め', 'र', '√', '기', 'ν', '玉', '한', '入', 'ד', '别', 'د', 'ะ', '电', 'ા', '♫', 'ع', 'ં', '堵', '嫉', '伊', 'う', '千', '관', '篇', 'क', '非', '荣', '粵', '瑜', '英', '를', '美', '条', '`', '宋', '←', '수', '後', '•', '³', 'ी', '고', '肉', '℃', 'し', '漢', '싱', 'ϵ', '送', 'ه', '落', 'న', 'ក', 'க', 'ℇ', 'た', 'ះ', '中', '射', '♪', '符', 'ឃ', '谷', '分', '酱', 'び', 'থ', 'ة', 'г', 'σ', 'と', '楚', '胡', '饭', 'み', '禮', '主', '直', '÷', '夢', 'ɾ', 'চ', '⃗', '統', '高', '顺', '据', 'ら', '頭', 'よ', '最', 'ా', 'ੁ', '亲', 'ស', '花', '≡', '眼', '病', '…', 'の', '發', 'ா', '汝', '★', '氏', 'ร', '景', 'ᡠ', '读', '件', '仲', 'শ', 'お', 'っ', 'پ', 'ᡤ', 'ч', '♭', '悠', 'ं', '六', '也', 'ռ', 'য়', '恐', 'ह', '可', '啊', '莫', '书', '总', 'ষ', 'ք', '̂', '간', 'な', '此', '愛', 'ర', 'ใ', '陳', 'Ἀ', 'ण', '望', 'द', '请', '油', '露', '니', 'ş', '宗', 'ʍ', '鳳', 'अ', '邋', '的', 'ព', '火', 'ा', 'ก', '約', 'ட', '章', '長', '商', '台', '勢', 'さ', '국', 'Î', '簡', 'ई', '∈', 'ṭ', '經', '族', 'ु', '孫', '身', '坑', 'স', '么', 'ε', '失', '殺', 'ž', 'ર', 'が', '手', 'ា', '心', 'ਾ', '로', '朝', '们', '黒', '欢', '早', '️', 'া', 'आ', 'ɸ', '常', '快', '民', 'ﷺ', 'ូ', '遢', 'η', '国', '无', '江', 'ॠ', '「', 'ন', '™', 'ើ', 'ζ', '紫', 'ె', 'я', '“', '♨', '國', 'े', 'อ', '∞']
useless_punct.remove(' ')

def remove_useless_punct(text):
    return re.sub(f'{"|".join(useless_punct)}', '', text)


X_train['question_text'] = X_train['question_text'].progress_map(lambda q: remove_useless_punct(q))
X_test['question_text'] = X_test['question_text'].progress_map(lambda q: remove_useless_punct(q)) 

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [72]:
# define tokenization parameters 

MAX_WORDS = 100000
MAX_LEN = 70

In [44]:
# tokenize train and test sequences and build word index

all_questions = X_train['question_text'].tolist() + X_test['question_text'].tolist()

word_index = {}
x_train = []
x_test = []

c = 0
for doc in tqdm_notebook(nlp.pipe(all_questions, n_threads=10), total=len(all_questions)):
    
    tokens = [token.text.lower() for token in doc]

    for token in tokens:
        if token not in word_index:
            word_index[token] = 1
        else:
            word_index[token] += 1

    if c < X_train.shape[0]:
        x_train.append(tokens)
    else:
        x_test.append(tokens)
        
    c += 1

word_index = dict(sorted(zip(word_index.keys(), word_index.values()), key=lambda c: c[1], reverse=True))
for i, word in enumerate(word_index):
    word_index[word] = i + 1

In [73]:
# convert train and test word sequences to sequences of indexes while ignoring unfrequent words

def convert_to_indexes(sequence, word_index, max_words=MAX_WORDS):
    output = []
    for token in sequence:
        if (token in word_index) and (word_index[token] < max_words): 
            output.append(word_index[token])
    return output


x_train_idx = [convert_to_indexes(seq, word_index) for seq in tqdm_notebook(x_train, leave=False)]
x_test_idx = [convert_to_indexes(seq, word_index) for seq in tqdm_notebook(x_test, leave=False)]

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [74]:
# pad train and test sequences

x_train_padded = sequence.pad_sequences(tqdm_notebook(x_train_idx, leave=False), maxlen=MAX_LEN)
x_test_padded = sequence.pad_sequences(tqdm_notebook(x_test_idx, leave=False), maxlen=MAX_LEN)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375806), HTML(value='')))

In [56]:
embedding_type = 'fasttext'

if embedding_type == 'glove':

    def load_embedding(path):
        embedding = {}
        with open(path, 'r') as f:
            for line in tqdm_notebook(f.readlines()):
                k = line.split(' ')[0]
                v = line.split(' ')[1:]
                embedding[k] = np.array(v, dtype=float)
        return embedding

    glove = load_embedding('./embeddings/glove.840B.300d/glove.840B.300d.txt')

    mean_emb = -0.005838493338505765
    std_emb = 0.48782081729236354
    
else:
    fasttext_model = fastText.load_model('/data_science/nlp/embeddings/fasttext/wiki.en.bin')

In [58]:
def build_embedding_matrix_glove(word_index, embedding, embed_size=300):
    matrix = np.zeros((min(MAX_WORDS, len(word_index) + 1), embed_size))
    oov = []
    for word, i in tqdm_notebook(word_index.items(), leave=False):
        if word in embedding:
            try:
                matrix[i, :] = embedding[word]
            except IndexError as e:
                break
        else:
            try:
                matrix[i, :] = np.random.normal(mean_emb, std_emb, (embed_size))
            except IndexError as e:
                break
            oov.append(word)
    return matrix, oov

def build_embedding_matrix_with_fasttext(word_index, model, embed_size=300):
    matrix = np.zeros((min(MAX_WORDS, len(word_index) + 1), embed_size))
    for word, i in tqdm_notebook(word_index.items(), leave=False):
        try:
            matrix[i, :] = fasttext_model.get_word_vector(word)
        except IndexError as e:
            break
            
    return matrix

In [75]:
if embedding_type == 'glove':
    embedding_matrix, oov = build_embedding_matrix_glove(word_index, glove)
    
elif embedding_type == 'fasttext':
    embedding_matrix = build_embedding_matrix_with_fasttext(word_index, fasttext_model)

HBox(children=(IntProgress(value=0, max=250842), HTML(value='')))

In [77]:
embedding_matrix.shape

(100000, 300)

Now we build our Neural Net

In [87]:
class Model(nn.Module):
    def __init__(self, embedding_matrix):
        super(Model, self).__init__()
        vocab_size = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]
        
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_layer.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding_layer.weight.requires_grad = True
        
        self.hidden_units = 64
        
        self.lstm1 = nn.LSTM(embedding_dim, self.hidden_units, bidirectional=True)
        self.lstm2 = nn.LSTM(self.hidden_units * 2, self.hidden_units, bidirectional=True)
        
        self.fc1 = nn.Linear(self.hidden_units * 2 * 2, 1)
    
    def forward(self, x):
        x = self.embedding_layer(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
    
        # global average pooling
        h_avg = torch.mean(x, 1)
        
        # global max pooling
        h_max = torch.max(x, 1)[0]
        
        # concat the two pooling results
        h_concat = torch.cat((h_avg, h_max), 1)
        
        # pass to a fully conncected layer
        output = self.fc1(h_concat)
        return output

We define a dataloader

In [88]:
class MyDataset(Dataset):
    # it takes two parameters in the constructor
    # padded_sequencs: the sequences previously processed with padding
    # labels: the corresponding targets
    def __init__(self, padded_sequences, labels):
        self.padded_sequences = padded_sequences
        self.labels = labels
    
    
    # special method to return the length of the dataset
    def __len__(self):
        return len(self.padded_sequences)
    
    # special method to get any given item at a given index while applying some preprocessings on it
    def __getitem__(self, index):
        # get a padded sequence
        x = self.padded_sequences[index, :]
        
        # convert it to long tensor (because the values are actually indexes of mapping)
        x = torch.LongTensor(x)  
        
        # convert the target to a float tensor
        y = self.labels[index]
        y = torch.Tensor([y])
        return x, y

I define two functions I usually use in the main training loop:

- compute_f1_score: takes a input y: the batch of labels, preds: the corresponding outputs of the models and computes the f1_score

- print_metrics: prints at any given iteration, the current average loss and f1 score

In [89]:
def compute_f1_score(y, preds, th=0.5):
    y_preds_proba = torch.sigmoid(preds)
    y_preds_proba = y_preds_proba.cpu().detach().numpy()
    y_pred = (y_preds_proba > th).astype(int)
    
    y_true = y.detach().cpu().numpy()
    f1 = f1_score(y_true=y_true, y_pred=y_pred)
    return f1

def print_metrics(iteration, total_iterations, epoch, total_epochs, loss_list, f1_list, print_every, train=True):
    if train:
        msg = f'Epoch: {epoch+1} / {total_epochs} |Iteration: {iteration} / {total_iterations} \n'
        msg += f'Average train loss: {np.mean(loss_list)} | Average train f1: {np.mean(f1_list)}'
        
    else:
        msg = f'Epoch: {epoch+1} / {total_epochs} |Iteration: {iteration} / {total_iterations} \n'
        msg += f'Average val loss: {np.mean(loss_list)} | Average val f1: {np.mean(f1_list)}'
        
    if iteration % print_every == 0:
        print(msg)

In [90]:
x_TRAIN, x_VAL, y_TRAIN, y_VAL = train_test_split(x_train_padded, Y, test_size=0.2, random_state=42)

# We create a train and validation datasets

In [91]:
train_dataset = MyDataset(x_TRAIN, y_TRAIN)
val_dataset = MyDataset(x_VAL, y_VAL)

In [92]:
batch_size = 256

train_loader = DataLoader(train_dataset, num_workers=10, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, num_workers=5, batch_size=batch_size)

In [93]:
model = Model(embedding_matrix)

model = model.cuda()

optimizer = torch.optim.Adam(lr=0.003, params=model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [94]:
num_epochs = 5
print_every_train = 500
print_every_val = 150


for epoch in tqdm_notebook(range(num_epochs)):
    
    # train model
    
    f1_scores_train = []
    losses_train = []

    model.train()
    for i, (x, y) in enumerate(tqdm_notebook(train_loader, leave=False, total=len(train_loader))):
        
        # pass batches to GPU
        x = x.cuda()
        y = y.cuda()
        
        # clear gradients
        optimizer.zero_grad()
        
        # compute predictions: forward pass on the batch
        preds = model.forward(x)
        
        # compute the loss on the batch
        loss = criterion(preds, y)
        
        # compute the gardients by derivating the loss
        loss.backward()
        
        # update the weights
        optimizer.step()
        
        # keep track of losses and f1 scores for each epoch
        losses_train.append(loss.item())
        
        f1_score_train = compute_f1_score(y, preds)
        f1_scores_train.append(f1_score_train)
        
        # print metrics
        print_metrics(i, len(train_loader), epoch, num_epochs, losses_train, f1_scores_train, print_every_train)
        
    # evaluation mode
        
    f1_scores_val = []
    losses_val = []
    
    model.eval()
    for i, (x, y) in enumerate(tqdm_notebook(val_loader, leave=False, total=len(val_loader))):
        
        # pass batches to GPU
        x = x.cuda()
        y = y.cuda()
        
        # make predictions
        preds = model.forward(x)
        
        # computes the losses and f1 scores
        loss = criterion(y, preds)
        losses_val.append(loss.item())
        
        f1_score_val = compute_f1_score(y, preds)
        f1_scores_val.append(f1_score_val)
        
        # print metrics
        print_metrics(i, len(train_loader), epoch, num_epochs, losses_val, f1_scores_val, print_every_val, train=False)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4082), HTML(value='')))

  'precision', 'predicted', average, warn_for)


Epoch: 1 / 5 |Iteration: 0 / 4082 
Average train loss: 0.6757609248161316 | Average train f1: 0.0
Epoch: 1 / 5 |Iteration: 500 / 4082 
Average train loss: 0.15787995373923144 | Average train f1: 0.3066316496186525
Epoch: 1 / 5 |Iteration: 1000 / 4082 
Average train loss: 0.1416004403614319 | Average train f1: 0.3924587154070072
Epoch: 1 / 5 |Iteration: 1500 / 4082 
Average train loss: 0.1333219802787628 | Average train f1: 0.43986280368012537
Epoch: 1 / 5 |Iteration: 2000 / 4082 
Average train loss: 0.1292116065053032 | Average train f1: 0.4688155586934848
Epoch: 1 / 5 |Iteration: 2500 / 4082 
Average train loss: 0.1264233723646257 | Average train f1: 0.48741091205883674
Epoch: 1 / 5 |Iteration: 3000 / 4082 
Average train loss: 0.12423817004905427 | Average train f1: 0.5007370534041804
Epoch: 1 / 5 |Iteration: 3500 / 4082 
Average train loss: 0.12221043556183789 | Average train f1: 0.5128028685193958
Epoch: 1 / 5 |Iteration: 4000 / 4082 
Average train loss: 0.12103053184318799 | Averag

HBox(children=(IntProgress(value=0, max=1021), HTML(value='')))

Epoch: 1 / 5 |Iteration: 0 / 4082 
Average val loss: 0.7146368026733398 | Average val f1: 0.7027027027027027
Epoch: 1 / 5 |Iteration: 150 / 4082 
Average val loss: 0.7370871309413026 | Average val f1: 0.5879052335906325
Epoch: 1 / 5 |Iteration: 300 / 4082 
Average val loss: 0.737037448788006 | Average val f1: 0.5946041844971859
Epoch: 1 / 5 |Iteration: 450 / 4082 
Average val loss: 0.7381745005922677 | Average val f1: 0.5954202538687579
Epoch: 1 / 5 |Iteration: 600 / 4082 
Average val loss: 0.7381020564207816 | Average val f1: 0.5970251876450733
Epoch: 1 / 5 |Iteration: 750 / 4082 
Average val loss: 0.7386753028147072 | Average val f1: 0.5984694933433252
Epoch: 1 / 5 |Iteration: 900 / 4082 
Average val loss: 0.739078609382405 | Average val f1: 0.5982427027903153


HBox(children=(IntProgress(value=0, max=4082), HTML(value='')))

Epoch: 2 / 5 |Iteration: 0 / 4082 
Average train loss: 0.10370098054409027 | Average train f1: 0.6896551724137931
Epoch: 2 / 5 |Iteration: 500 / 4082 
Average train loss: 0.09964284382596938 | Average train f1: 0.6304664008967932
Epoch: 2 / 5 |Iteration: 1000 / 4082 
Average train loss: 0.10104360485395471 | Average train f1: 0.6277714753125372
Epoch: 2 / 5 |Iteration: 1500 / 4082 
Average train loss: 0.10142815114060376 | Average train f1: 0.6252338245778297
Epoch: 2 / 5 |Iteration: 2000 / 4082 
Average train loss: 0.10140279483521121 | Average train f1: 0.62594545393653
Epoch: 2 / 5 |Iteration: 2500 / 4082 
Average train loss: 0.10137675670696135 | Average train f1: 0.6240606004091878
Epoch: 2 / 5 |Iteration: 3000 / 4082 
Average train loss: 0.101204032355827 | Average train f1: 0.6251050549437019
Epoch: 2 / 5 |Iteration: 3500 / 4082 
Average train loss: 0.10134746252605147 | Average train f1: 0.6255418975043548
Epoch: 2 / 5 |Iteration: 4000 / 4082 
Average train loss: 0.101107475638

HBox(children=(IntProgress(value=0, max=1021), HTML(value='')))

Epoch: 2 / 5 |Iteration: 0 / 4082 
Average val loss: 0.6757248044013977 | Average val f1: 0.7804878048780488
Epoch: 2 / 5 |Iteration: 150 / 4082 
Average val loss: 0.7119232881937595 | Average val f1: 0.6255444906716922
Epoch: 2 / 5 |Iteration: 300 / 4082 
Average val loss: 0.7118506506827978 | Average val f1: 0.6292593317854295
Epoch: 2 / 5 |Iteration: 450 / 4082 
Average val loss: 0.7131118125503185 | Average val f1: 0.6317255507908925
Epoch: 2 / 5 |Iteration: 600 / 4082 
Average val loss: 0.7130371877635379 | Average val f1: 0.6325364574564193
Epoch: 2 / 5 |Iteration: 750 / 4082 
Average val loss: 0.7135759479355082 | Average val f1: 0.631953271102875
Epoch: 2 / 5 |Iteration: 900 / 4082 
Average val loss: 0.7138687058638257 | Average val f1: 0.6312170755667168


HBox(children=(IntProgress(value=0, max=4082), HTML(value='')))

Epoch: 3 / 5 |Iteration: 0 / 4082 
Average train loss: 0.08977300673723221 | Average train f1: 0.75
Epoch: 3 / 5 |Iteration: 500 / 4082 
Average train loss: 0.09029924408523385 | Average train f1: 0.6769945379463668
Epoch: 3 / 5 |Iteration: 1000 / 4082 
Average train loss: 0.09073308476811641 | Average train f1: 0.6782732664774398
Epoch: 3 / 5 |Iteration: 1500 / 4082 
Average train loss: 0.09136935360456688 | Average train f1: 0.6759080023645861
Epoch: 3 / 5 |Iteration: 2000 / 4082 
Average train loss: 0.09189793464729096 | Average train f1: 0.6729967617068586
Epoch: 3 / 5 |Iteration: 2500 / 4082 
Average train loss: 0.09221101431001048 | Average train f1: 0.6697464731490493
Epoch: 3 / 5 |Iteration: 3000 / 4082 
Average train loss: 0.09275128697353933 | Average train f1: 0.6665690210367503
Epoch: 3 / 5 |Iteration: 3500 / 4082 
Average train loss: 0.09298093200381127 | Average train f1: 0.665323816469845
Epoch: 3 / 5 |Iteration: 4000 / 4082 
Average train loss: 0.09329256323282345 | Ave

HBox(children=(IntProgress(value=0, max=1021), HTML(value='')))

Epoch: 3 / 5 |Iteration: 0 / 4082 
Average val loss: 0.7284998297691345 | Average val f1: 0.6250000000000001
Epoch: 3 / 5 |Iteration: 150 / 4082 
Average val loss: 0.7495720110192204 | Average val f1: 0.559860164170099
Epoch: 3 / 5 |Iteration: 300 / 4082 
Average val loss: 0.7483534383219342 | Average val f1: 0.5633200636543532
Epoch: 3 / 5 |Iteration: 450 / 4082 
Average val loss: 0.7500426896131224 | Average val f1: 0.565583650260447
Epoch: 3 / 5 |Iteration: 600 / 4082 
Average val loss: 0.7496208790534744 | Average val f1: 0.5673715692800083
Epoch: 3 / 5 |Iteration: 750 / 4082 
Average val loss: 0.7499592608999158 | Average val f1: 0.5698167434269689
Epoch: 3 / 5 |Iteration: 900 / 4082 
Average val loss: 0.7503141975826216 | Average val f1: 0.5706872815571655


HBox(children=(IntProgress(value=0, max=4082), HTML(value='')))

Epoch: 4 / 5 |Iteration: 0 / 4082 
Average train loss: 0.08603644371032715 | Average train f1: 0.5
Epoch: 4 / 5 |Iteration: 500 / 4082 
Average train loss: 0.08593049907532638 | Average train f1: 0.6979684627607862
Epoch: 4 / 5 |Iteration: 1000 / 4082 
Average train loss: 0.08599962853453495 | Average train f1: 0.6996540570957894
Epoch: 4 / 5 |Iteration: 1500 / 4082 
Average train loss: 0.086021215210034 | Average train f1: 0.6984576175536356
Epoch: 4 / 5 |Iteration: 2000 / 4082 
Average train loss: 0.08605017085307869 | Average train f1: 0.6948429054506239
Epoch: 4 / 5 |Iteration: 2500 / 4082 
Average train loss: 0.08624610503162088 | Average train f1: 0.6932624751650064
Epoch: 4 / 5 |Iteration: 3000 / 4082 
Average train loss: 0.08671071162457884 | Average train f1: 0.691433510708195
Epoch: 4 / 5 |Iteration: 3500 / 4082 
Average train loss: 0.08748055732059108 | Average train f1: 0.6887314710816316
Epoch: 4 / 5 |Iteration: 4000 / 4082 
Average train loss: 0.08760272579952437 | Averag

HBox(children=(IntProgress(value=0, max=1021), HTML(value='')))

Epoch: 4 / 5 |Iteration: 0 / 4082 
Average val loss: 0.7127043604850769 | Average val f1: 0.7368421052631577
Epoch: 4 / 5 |Iteration: 150 / 4082 
Average val loss: 0.7430671094269152 | Average val f1: 0.5633952284581831
Epoch: 4 / 5 |Iteration: 300 / 4082 
Average val loss: 0.7425184988500272 | Average val f1: 0.57068513921754
Epoch: 4 / 5 |Iteration: 450 / 4082 
Average val loss: 0.7442796235602606 | Average val f1: 0.5753951705845506
Epoch: 4 / 5 |Iteration: 600 / 4082 
Average val loss: 0.7439240895571209 | Average val f1: 0.576377032711644
Epoch: 4 / 5 |Iteration: 750 / 4082 
Average val loss: 0.7442886838106595 | Average val f1: 0.578875160532157
Epoch: 4 / 5 |Iteration: 900 / 4082 
Average val loss: 0.7446724934530311 | Average val f1: 0.5806434352219765


HBox(children=(IntProgress(value=0, max=4082), HTML(value='')))

Epoch: 5 / 5 |Iteration: 0 / 4082 
Average train loss: 0.07943730056285858 | Average train f1: 0.6206896551724137
Epoch: 5 / 5 |Iteration: 500 / 4082 
Average train loss: 0.08043031132744696 | Average train f1: 0.7197106377008222
Epoch: 5 / 5 |Iteration: 1000 / 4082 
Average train loss: 0.08132212616495796 | Average train f1: 0.720088445027162
Epoch: 5 / 5 |Iteration: 1500 / 4082 
Average train loss: 0.08131135880033784 | Average train f1: 0.7192394434395784
Epoch: 5 / 5 |Iteration: 2000 / 4082 
Average train loss: 0.08184458019452981 | Average train f1: 0.7160404314533206
Epoch: 5 / 5 |Iteration: 2500 / 4082 
Average train loss: 0.0820724282991655 | Average train f1: 0.7147455709779477
Epoch: 5 / 5 |Iteration: 3000 / 4082 
Average train loss: 0.08240269584490016 | Average train f1: 0.7102570564699858
Epoch: 5 / 5 |Iteration: 3500 / 4082 
Average train loss: 0.08282423328785973 | Average train f1: 0.7087626647349291
Epoch: 5 / 5 |Iteration: 4000 / 4082 
Average train loss: 0.0832158415

HBox(children=(IntProgress(value=0, max=1021), HTML(value='')))

Epoch: 5 / 5 |Iteration: 0 / 4082 
Average val loss: 0.6966636180877686 | Average val f1: 0.7567567567567567
Epoch: 5 / 5 |Iteration: 150 / 4082 
Average val loss: 0.7320115929407789 | Average val f1: 0.5843491030975664
Epoch: 5 / 5 |Iteration: 300 / 4082 
Average val loss: 0.7314662466017511 | Average val f1: 0.5901286834300182
Epoch: 5 / 5 |Iteration: 450 / 4082 
Average val loss: 0.7335175245405565 | Average val f1: 0.5904170187713422
Epoch: 5 / 5 |Iteration: 600 / 4082 
Average val loss: 0.7334974830638549 | Average val f1: 0.5884888052357228
Epoch: 5 / 5 |Iteration: 750 / 4082 
Average val loss: 0.7336311599862242 | Average val f1: 0.5898495219256594
Epoch: 5 / 5 |Iteration: 900 / 4082 
Average val loss: 0.7340013038834244 | Average val f1: 0.5895182676345565
