In [1]:
# basics
import os
import time
import re
import regex
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import random
import matplotlib.pyplot as plt
from multiprocessing import Pool

# machine learning
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve
from sklearn.preprocessing import StandardScaler

# nn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data

# use only for tokenizer and padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Index of the CUDA device that `training` moves tensors and the model to
# via `.cuda(cuda_idx)`.
cuda_idx = 1

In [3]:
# Wall-clock timestamp taken when the notebook starts running
# (not read again in the visible cells).
all_start = time.time()

In [4]:
def seed_torch(seed=1019):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child interpreters; it does not change hashing in the
    # interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The notebook trains on a non-default GPU (`cuda_idx`), so seed every
    # device instead of only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# SEED = 1019
# seed_torch(SEED)

In [5]:
# model parameters
class Config:
    """Static hyper-parameters and paths for the notebook (used through the `c` instance)."""

    # -- optimisation / cross-validation --
    num_epochs = 15
    batch_size = 512
    test_batch_size = 512
    learning_rate = 0.001
    clip_grad = 5.0
    num_cv_splits = 5

    # -- text handling --
    vocab_size = 120000
    max_length = 72

    # -- embedding layer --
    embedding_size = 300
    embedding_noise_var = 0.1
    embedding_dropout = 0.3
    embeddings = ['glove', 'paragram', 'fasttext']

    # -- recurrent and dense stack --
    hidden_size = 64
    num_layers = 1
    layer_dropout = 0.1
    # depend on concat num
    dense_size = [hidden_size * 2 * 4, hidden_size // 4]
    output_size = 1

    # -- filesystem --
    datadir = Path('./data/')
    # datadir = Path('../input') # for kernel


c = Config()

In [6]:
# Symbols that `_clean_puncts` surrounds with a space on each side so they
# become separate whitespace-delimited tokens.
# NOTE(review): `clean` lower-cases the text before this list is applied, so
# the upper-case entries ('Â', 'Ã', 'Ø') can only match if their lower-case
# form is also listed.
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '¬', '░', '¶', '↑', '±',  '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '₹', '´'
]

In [7]:
# Contraction / run-together-word -> expansion table consumed by
# `clean_abbreviation`, which joins the keys into one alternation regex and
# replaces each match with the mapped value.
# NOTE(review): `clean` lower-cases the text before this table is applied, so
# keys containing upper-case letters ("I'd", 'doI', 'theBest', ...) cannot
# match text that went through `clean`.
abbreviations = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "who'd": "who would",
    "who're": "who are",
    "'re": " are",
    "tryin'": "trying",
    "doesn'": "does not",
    'howdo': 'how do',
    'whatare': 'what are',
    'howcan': 'how can',
    'howmuch': 'how much',
    'howmany': 'how many',
    'whydo': 'why do',
    'doI': 'do I',
    'theBest': 'the best',
    'howdoes': 'how does',
}

In [8]:
# Misspelling / spelling-variant -> normalised form table consumed by
# `clean_spells`, which joins the keys into one alternation regex and
# replaces each match with the mapped value.
# NOTE(review): `clean` lower-cases the text before this table is applied, so
# keys with upper-case letters ('Whta', 'Etherium') cannot match text that
# went through `clean`.
# NOTE(review): 'citicise' looks like a typo for 'criticise' — confirm.
spells = {
    'colour': 'color',
    'centre': 'center',
    'favourite': 'favorite',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'labour': 'labor',
    'organisation': 'organization',
    'wwii': 'world war 2',
    'citicise': 'criticize',
    'youtu.be': 'youtube',
    'youtu ': 'youtube ',
    'qoura': 'quora',
    'sallary': 'salary',
    'Whta': 'what',
    'whta': 'what',
    'narcisist': 'narcissist',
    'mastrubation': 'masturbation',
    'mastrubate': 'masturbate',
    "mastrubating": 'masturbating',
    'pennis': 'penis',
    'Etherium': 'ethereum',
    'etherium': 'ethereum',
    'narcissit': 'narcissist',
    'bigdata': 'big data',
    '2k17': '2017',
    '2k18': '2018',
    'qouta': 'quota',
    'exboyfriend': 'ex boyfriend',
    'exgirlfriend': 'ex girlfriend',
    'airhostess': 'air hostess',
    "whst": 'what',
    'watsapp': 'whatsapp',
    'demonitisation': 'demonetization',
    'demonitization': 'demonetization',
    'demonetisation': 'demonetization',
    'quorans': 'quora user',
    'quoran': 'quora user',
    'pokémon': 'pokemon',
}

In [9]:
def load_data(datadir):
    """Read ``train_local.csv`` and ``test_local.csv`` from ``datadir``.

    Prints the shape of each frame and returns ``(train_df, test_df)``.
    """
    train_df, test_df = [
        pd.read_csv(datadir / filename)
        for filename in ('train_local.csv', 'test_local.csv')
    ]
    for label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
        print(label, frame.shape)
    return train_df, test_df

def clean(df):
    """Run the full text-cleaning pipeline over ``df`` and return the result.

    Steps run in this fixed order: lower-casing, invisible-character removal,
    abbreviation expansion, spelling normalisation, foreign-script masking,
    punctuation spacing, whitespace collapsing.
    """
    pipeline = (
        clean_lower,
        clean_unicode,
        lambda frame: clean_abbreviation(frame, abbreviations),
        lambda frame: clean_spells(frame, spells),
        clean_language,
        lambda frame: clean_puncts(frame, puncts),
        clean_space,
    )
    for step in pipeline:
        df = step(df)
    return df

def clean_unicode(df):
    codes = ['\x7f', '\u200b', '\xa0', '\ufeff', '\u200e', '\u202a', '\u202c', '\u2060', '\uf0d8', '\ue019', '\uf02d', '\u200f', '\u2061', '\ue01b']
    df["question_text"] = df["question_text"].apply(lambda x: _clean_unicode(x, codes))
    return df

def _clean_unicode(x, codes):
    for u in codes:
        if u in x:
            x = x.replace(u, '')
    return x

def clean_language(df):
    """Replace non-Latin script characters in ``df['question_text']`` with `` <lang> ``.

    Two passes are made over the column: the first uses the third-party
    ``regex`` module with Unicode script properties (Katakana, Hiragana, Han);
    the second uses the standard ``re`` module with an explicit character
    class of individual characters. Each matched character is replaced
    separately by `_clean_language`. Mutates and returns ``df``.
    """
    langs1 = r'[\p{Katakana}\p{Hiragana}\p{Han}]' # regex
    # Explicit character class: every character between the brackets is matched individually.
    langs2 = r'[ஆய்தஎழுத்துஆயுதஎழுத்துशुषछछशुषدوउसशुष북한내제តើបងប្អូនមានមធ្យបាយអ្វីខ្លះដើម្បីរកឃើញឯកសារអំពីប្រវត្តិស្ត្រនៃប្រាសាទអង្គរវट्टरौरआदસંઘરાજ્યपीतऊनअहএকটিবাড়িএকটিখামারএরঅধীনেপদেরবাছাইপরীক্ষাএরপ্রশ্নওউত্তরসহকোথায়পেতেপারিص、。Емелядуракلكلمقاممقال수능ί서로가를행복하게기乡국고등학교는몇시간업니《》싱관없어나이रचा키کپڤ」मिलगईकलेजेकोठंडकऋॠऌॡर]'
    compiled_langs1 = regex.compile(langs1)
    compiled_langs2 = re.compile(langs2)
    df['question_text'] = df['question_text'].apply(lambda x: _clean_language(x, compiled_langs1))
    df['question_text'] = df['question_text'].apply(lambda x: _clean_language(x, compiled_langs2))
    return df

def _clean_language(x, compiled_re):
    return compiled_re.sub(' <lang> ', x)

def clean_lower(df):
    """Lower-case every entry of ``df["question_text"]`` in place and return ``df``."""
    lowered = df["question_text"].apply(lambda text: text.lower())
    df["question_text"] = lowered
    return df

def clean_puncts(df, puncts):
    df['question_text'] = df['question_text'].apply(lambda x: _clean_puncts(x, puncts))
    return df
    
def _clean_puncts(x, puncts):
    x = str(x)
    # added space around puncts after replace
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def clean_spells(df, spells):
    """Replace every occurrence of a key of ``spells`` in
    ``df['question_text']`` with its mapped value.

    Keys are matched literally (regex metacharacters such as the ``.`` in
    ``'youtu.be'`` are escaped), and longer keys take precedence over shorter
    ones that share a prefix. Matching is by substring; no word boundaries
    are applied.

    Args:
        df: Frame with a string column ``question_text``; mutated in place.
        spells: Mapping of literal search string -> replacement.

    Returns:
        The same ``df`` object.
    """
    if not spells:
        # An empty alternation would match the empty string at every position.
        return df
    # Longest keys first so alternation cannot stop at a shorter prefix;
    # `sorted` is stable, so equal-length keys keep their dict order.
    ordered_keys = sorted(spells.keys(), key=len, reverse=True)
    compiled_spells = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))
    def replace(match):
        # Keys are escaped, so the matched text is always a key of `spells`.
        return spells[match.group(0)]
    df['question_text'] = df["question_text"].apply(
        lambda x: _clean_spells(x, compiled_spells, replace)
    )
    return df

def _clean_spells(x, compiled_re, replace):
    """Apply ``compiled_re.sub(replace, x)`` and return the result."""
    return compiled_re.sub(replace, x)

def clean_abbreviation(df, abbreviations):
    """Expand every occurrence of a key of ``abbreviations`` in
    ``df['question_text']`` into its mapped value.

    Keys are matched literally, and longer keys take precedence over shorter
    ones that share a prefix (so ``"i'd've"`` is expanded as a whole instead of
    being cut short by ``"i'd"``). Matching is by substring; no word boundaries
    are applied.

    Args:
        df: Frame with a string column ``question_text``; mutated in place.
        abbreviations: Mapping of literal search string -> expansion.

    Returns:
        The same ``df`` object.
    """
    if not abbreviations:
        # An empty alternation would match the empty string at every position.
        return df
    # Longest keys first so alternation cannot stop at a shorter prefix;
    # `sorted` is stable, so equal-length keys keep their dict order.
    ordered_keys = sorted(abbreviations.keys(), key=len, reverse=True)
    compiled_abbreviation = re.compile(
        '(%s)' % '|'.join(re.escape(key) for key in ordered_keys)
    )
    def replace(match):
        # Keys are escaped, so the matched text is always a key of the table.
        return abbreviations[match.group(0)]
    df['question_text'] = df["question_text"].apply(
        lambda x: _clean_abreviation(x, compiled_abbreviation, replace)
    )
    return df

def _clean_abreviation(x, compiled_re, replace):
    """Apply ``compiled_re.sub(replace, x)`` and return the result."""
    return compiled_re.sub(replace, x)

def clean_space(df):
    compiled_re = re.compile(r"\s+")
    df['question_text'] = df["question_text"].apply(lambda x: _clean_space(x, compiled_re))
    return df

def _clean_space(x, compiled_re):
    return compiled_re.sub(" ", x)
        
def prepare_tokenizer(texts, max_words):
    """Build a Keras ``Tokenizer`` fitted on ``texts``.

    The tokenizer is created with ``num_words=max_words``, an empty filter
    string and ``'<unk>'`` as the out-of-vocabulary token.
    """
    corpus = list(texts)
    fitted = Tokenizer(num_words=max_words, filters='', oov_token='<unk>')
    fitted.fit_on_texts(corpus)
    return fitted

def tokenize_and_padding(texts, tokenizer, max_length):
    """Convert ``texts`` to integer sequences with ``tokenizer`` and pad them
    to ``max_length`` with ``pad_sequences`` (default padding settings)."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length)

def get_all_vocabs(texts):
    """Count whitespace-separated tokens over a Series of strings.

    Returns a dict mapping each token to its number of occurrences, with keys
    in order of first appearance.
    """
    counts = {}
    for tokens in texts.apply(lambda x: x.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [10]:
class Embeddings(nn.Module):
    """Frozen embedding layer built from averaged pretrained word vectors.

    When no weight tensor is supplied, one matrix is built per name in
    ``config.embeddings`` (in a process pool) and the matrices are averaged
    element-wise. The resulting table is not trained. In training mode
    ``forward`` adds Gaussian noise to the looked-up vectors.
    """

    def __init__(self, config: 'Config', tokenizer, all_vocabs, embedding_weights = None):
        """
        Args:
            config: Object exposing ``vocab_size``, ``embedding_size``,
                ``embedding_noise_var``, ``embedding_dropout``, ``embeddings``
                and ``datadir``.
            tokenizer: Fitted tokenizer; only its ``word_index`` mapping is read.
            all_vocabs: Stored on the instance; not read by any method here.
            embedding_weights: Optional precomputed float tensor. When given,
                no embedding file is read.
        """
        super(Embeddings, self).__init__()

        # Maps the names accepted in `config.embeddings` to their loaders.
        self.embedding_map = {
            'fasttext': self._load_fasttext,
            'glove': self._load_glove,
            'paragram': self._load_paragram
        }
        self.c = config
        self.tokenizer = tokenizer
        self.all_vocabs = all_vocabs

        if embedding_weights is None:
            embedding_weights = self._load_embeddings(self.c.embeddings)

        self.original_embedding_weights = embedding_weights
        self.embeddings = nn.Embedding(self.c.vocab_size + 1, self.c.embedding_size, padding_idx=0)
        # Swap in the pretrained table and freeze it.
        self.embeddings.weight = nn.Parameter(embedding_weights)
        self.embeddings.weight.requires_grad = False
        # Registered for compatibility; `forward` does not apply it.
        self.embedding_dropout = nn.Dropout2d(self.c.embedding_dropout)

    def forward(self, x):
        """Look up the embeddings for ``x``; in training mode add Gaussian noise.

        The noise is a standard-normal sample multiplied by
        ``config.embedding_noise_var`` (so, despite its name, that value acts
        as a standard deviation).
        """
        embedding = self.embeddings(x)
        if self.training:
            embedding += torch.randn_like(embedding) * self.c.embedding_noise_var
        return embedding

    def reset_weights(self):
        """Restore the embedding table to the weights given at construction and re-freeze it."""
        self.embeddings.weight = nn.Parameter(self.original_embedding_weights)
        self.embeddings.weight.requires_grad = False

    def _load_embeddings(self, embedding_list: list):
        """Load every embedding named in ``embedding_list`` in a process pool
        and return their element-wise mean as a float32 tensor."""
        pool = Pool(num_cores)
        try:
            matrices = pool.map(self._load_an_embedding, embedding_list)
        finally:
            # Release the workers even when a loader raises.
            pool.close()
            pool.join()
        return torch.tensor(np.mean(matrices, 0), dtype=torch.float32)

    def _load_an_embedding(self, emb):
        """Dispatch to the loader registered for ``emb`` in ``embedding_map``."""
        return self.embedding_map[emb](self.tokenizer.word_index)

    def _get_embeddings_pair(self, word, *arr):
        """Return ``(word, vector)`` with the remaining fields parsed as float32."""
        return word, np.asarray(arr, dtype='float32')

    def _read_embeddings_index(self, filepath, word_index, min_line_length=0, **open_kwargs):
        """Parse a space-separated embedding file into ``{word: vector}``.

        Only words present in ``word_index`` are kept. Lines whose length does
        not exceed ``min_line_length`` are skipped. ``open_kwargs`` are
        forwarded to ``open``. The file is closed when parsing finishes.
        """
        embeddings_index = {}
        with open(filepath, **open_kwargs) as lines:
            for line in lines:
                if len(line) <= min_line_length:
                    continue
                word, *values = line.split(" ")
                if word in word_index:
                    # A later duplicate line overwrites an earlier one.
                    embeddings_index[word] = self._get_embeddings_pair(word, *values)[1]
        return embeddings_index

    def _make_embeddings(self, embeddings_index, word_index, emb_mean, emb_std):
        """Build the embedding matrix for the tokenizer vocabulary.

        Rows are initialised from ``N(emb_mean, emb_std)``; row 0 (padding) is
        zeroed and rows of words found in ``embeddings_index`` are overwritten
        with their pretrained vector.
        """
        # Word indices start at 1 (row 0 is padding), so a vocabulary of n
        # words needs n + 1 rows; cap at `vocab_size`.
        nb_words = min(self.c.vocab_size, len(word_index) + 1)
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.c.embedding_size))
        embedding_matrix[0] = np.zeros(self.c.embedding_size)
        for word, i in word_index.items():
            if i >= nb_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return embedding_matrix

    def _load_glove(self, word_index):
        """Build the embedding matrix from the GloVe 840B.300d text file."""
        print('loading glove')
        filepath = self.c.datadir / 'embeddings/glove.840B.300d/glove.840B.300d.txt'
        embeddings_index = self._read_embeddings_index(filepath, word_index)
        emb_mean, emb_std = -0.005838499, 0.48782197
        return self._make_embeddings(embeddings_index, word_index, emb_mean, emb_std)

    def _load_fasttext(self, word_index):
        """Build the embedding matrix from the wiki-news-300d-1M ``.vec`` file."""
        print('loading fasttext')
        filepath = self.c.datadir / 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        # Lines of 100 characters or fewer are skipped.
        embeddings_index = self._read_embeddings_index(filepath, word_index, min_line_length=100)
        emb_mean, emb_std = -0.0033469985, 0.109855495
        return self._make_embeddings(embeddings_index, word_index, emb_mean, emb_std)

    def _load_paragram(self, word_index):
        """Build the embedding matrix from the paragram_300_sl999 text file."""
        print('loading paragram')
        filepath = self.c.datadir / 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
        # Undecodable bytes in this file are ignored.
        embeddings_index = self._read_embeddings_index(
            filepath, word_index, min_line_length=100, encoding="utf8", errors='ignore'
        )
        emb_mean, emb_std = -0.0053247833, 0.49346462
        return self._make_embeddings(embeddings_index, word_index, emb_mean, emb_std)

In [11]:
# Default number of worker processes for the process pools in this notebook.
num_cores = 2
def df_parallelize_run(df, func, num_cores=2):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    ``df`` is cut into ``num_cores`` consecutive chunks of near-equal size,
    ``func`` is mapped over them in ``num_cores`` worker processes and the
    returned frames are concatenated in the original order.

    Args:
        df: DataFrame to process.
        func: Picklable callable taking and returning a DataFrame.
        num_cores: Number of chunks and of worker processes.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks.
    """
    # Split row positions rather than the frame itself: calling
    # `np.array_split` directly on a DataFrame goes through a code path that
    # recent pandas releases deprecate.
    position_chunks = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[positions] for positions in position_chunks]
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        # Release the workers even when `func` raises in a worker.
        pool.close()
        pool.join()
    return df

In [12]:
# Load the raw frames, clean their text in worker processes, then convert the
# questions into fixed-length integer sequences.
train_df, test_df = load_data(c.datadir)
train_df = df_parallelize_run(train_df, clean)
test_df = df_parallelize_run(test_df, clean)
train_x, train_y = train_df['question_text'].values, train_df['target'].values
test_x = test_df['question_text'].values
# The tokenizer is fitted on the training text only and reused for the test text.
tokenizer = prepare_tokenizer(train_x, c.vocab_size)
train_x = tokenize_and_padding(train_x, tokenizer, c.max_length)
test_x = tokenize_and_padding(test_x, tokenizer, c.max_length)

Train shape :  (1175509, 3)
Test shape :  (130613, 3)


In [13]:
# Count the distinct tokens of the cleaned training text and build the
# embedding layer (which loads the pretrained vectors); both steps are timed together.
start = time.time()
all_vocabs = get_all_vocabs(train_df['question_text'])
print('all_vocabs: ', len(all_vocabs))
embeddings = Embeddings(c, tokenizer, all_vocabs)
# Elapsed seconds for the two steps above.
print(time.time() - start)

all_vocabs:  184279
loading glove
loading paragram
loading fasttext
48.77651238441467


In [14]:
class GRULayer(nn.Module):
    """Bidirectional, bias-free, batch-first GRU followed by dropout on its outputs."""

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super().__init__()

        recurrent_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=False,
            bidirectional=True,
            batch_first=True,
        )
        self.gru = nn.GRU(**recurrent_options)
        self.dropout = nn.Dropout(dropout_rate)

        self.init_weights()

    def init_weights(self):
        """Xavier-uniform for input-hidden weights, orthogonal for
        hidden-hidden weights, zeros for any bias parameters."""
        initialisers = (
            ('weight_ih', nn.init.xavier_uniform_),
            ('weight_hh', nn.init.orthogonal_),
            ('bias', lambda tensor: nn.init.constant_(tensor, 0)),
        )
        # One full pass over the parameters per group, in the order listed above.
        for tag, initialise in initialisers:
            for name, param in self.named_parameters():
                if tag in name:
                    initialise(param.data)

    def forward(self, x):
        """Return ``(dropout(outputs), final_hidden_state)`` of the GRU for ``x``."""
        outputs, final_state = self.gru(x)
        return self.dropout(outputs), final_state

In [15]:
class LSTMLayer(nn.Module):
    """Bidirectional, bias-free, batch-first LSTM followed by dropout on its outputs."""

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super().__init__()

        recurrent_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=False,
            bidirectional=True,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**recurrent_options)
        self.dropout = nn.Dropout(dropout_rate)

        self.init_weights()

    def init_weights(self):
        """Xavier-uniform for input-hidden weights, orthogonal for
        hidden-hidden weights, zeros for any bias parameters."""
        initialisers = (
            ('weight_ih', nn.init.xavier_uniform_),
            ('weight_hh', nn.init.orthogonal_),
            ('bias', lambda tensor: nn.init.constant_(tensor, 0)),
        )
        # One full pass over the parameters per group, in the order listed above.
        for tag, initialise in initialisers:
            for name, param in self.named_parameters():
                if tag in name:
                    initialise(param.data)

    def forward(self, x):
        """Return ``(dropout(outputs), final_hidden_state)``; the final cell
        state of the LSTM is discarded."""
        outputs, final = self.lstm(x)
        hidden_state = final[0]
        return self.dropout(outputs), hidden_state

In [16]:
class SimpleRNN(nn.Module):
    """Two stacked bidirectional LSTM layers with a pooled dense head.

    The final hidden states of both LSTM layers are concatenated with the mean
    and the max over time of the second layer's outputs, then passed through
    linear -> batch norm -> ReLU -> dropout -> output linear. The output is
    the raw value of the last linear layer (no sigmoid is applied here).
    """

    def __init__(self, config: Config, embeddings):
        super().__init__()
        self.c = config
        cfg = self.c

        self.embedding = embeddings
        self.lstm1 = LSTMLayer(
            input_size=cfg.embedding_size,
            hidden_size=cfg.hidden_size,
            num_layers=cfg.num_layers,
            dropout_rate=cfg.layer_dropout,
        )
        # The second layer consumes both directions of the first.
        self.lstm2 = LSTMLayer(
            input_size=cfg.hidden_size*2,
            hidden_size=cfg.hidden_size,
            num_layers=cfg.num_layers,
            dropout_rate=cfg.layer_dropout,
        )

        dense_in, dense_hidden = cfg.dense_size[0], cfg.dense_size[1]
        self.cell_dropout = nn.Dropout(cfg.layer_dropout)
        self.linear = nn.Linear(dense_in, dense_hidden)
        self.batch_norm = torch.nn.BatchNorm1d(dense_hidden)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(cfg.layer_dropout)
        self.out = nn.Linear(dense_hidden, cfg.output_size)

    def forward(self, x):
        """Map a batch of token-id sequences ``x`` to one raw score per row."""
        embedded = self.embedding(x)
        seq1, hidden1 = self.lstm1(embedded)
        seq2, hidden2 = self.lstm2(seq1)

        mean_pooled = torch.mean(seq2, 1)
        max_pooled = torch.max(seq2, 1)[0]

        def merge_directions(hidden):
            # Place the per-direction states side by side on the feature axis.
            return torch.cat(hidden.split(1, 0), -1).squeeze(0)

        hidden1 = self.cell_dropout(merge_directions(hidden1))
        hidden2 = self.cell_dropout(merge_directions(hidden2))

        features = torch.cat([hidden1, hidden2, mean_pooled, max_pooled], 1)
        for stage in (self.linear, self.batch_norm, self.relu, self.dropout, self.out):
            features = stage(features)

        return features

In [17]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` using NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def threshold_search(y_true, y_proba, plot=False):
    """Find the probability threshold that maximises F1.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores / probabilities for the positive class.
        plot: When true, plot F1 against threshold and mark the optimum.

    Returns:
        ``{'threshold': best_threshold, 'f1': best_f1}``.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    # Pad so `thresholds` has one entry per (precision, recall) point.
    thresholds = np.append(thresholds, 1.001)
    # A zero precision or recall makes its reciprocal infinite, which yields
    # the intended F1 of 0; silence the resulting divide-by-zero warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        f_scores = 2 / (1/precision + 1/recall)
    # An undefined score must never be selected as the maximum.
    f_scores = np.where(np.isnan(f_scores), 0.0, f_scores)
    best_idx = np.argmax(f_scores)
    best_score = f_scores[best_idx]
    best_th = thresholds[best_idx]
    if plot:
        plt.plot(thresholds, f_scores, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    search_result = {'threshold': best_th , 'f1': best_score}
    return search_result

In [18]:
def cut_length(data, mask):
    """Trim leading columns of ``data`` that equal ``mask`` in every row.

    Columns are scanned from the left; everything before the first column
    that differs from ``mask`` in at least one row is dropped. When every
    column matches, the last column is kept so the result is never narrower
    than one column (unless ``data`` itself has no columns).

    Args:
        data: 2-D tensor, indexed ``[row, column]``.
        mask: Tensor broadcastable against ``data`` transposed; the caller in
            this notebook passes a ``(max_length, 1)`` tensor of zeros.

    Returns:
        A view of ``data`` with the leading all-``mask`` columns removed.
    """
    max_length = data.shape[1]
    transposed = torch.transpose(data, 1, 0)
    is_pad_column = (transposed == mask).all(1)
    # Locate the first non-padding column in one tensor op instead of testing
    # elements one by one from Python.
    non_pad_columns = (is_pad_column == 0).nonzero()
    if non_pad_columns.numel() > 0:
        first_kept = int(non_pad_columns[0].item())
    else:
        # Every column is padding: keep only the last one (none if there are no columns).
        first_kept = max(max_length - 1, 0)
    return data[:, first_kept:]

In [19]:
def training(train_x, train_y, test_x, c, embeddings, trial=0):
    """Train one `SimpleRNN` per stratified CV fold and collect predictions.

    For every fold a new model is trained for ``c.num_epochs`` epochs. After
    each epoch the validation F1 (at the best threshold for that epoch) is
    computed, and the model is checkpointed whenever that F1 improves. The
    best checkpoint of the fold produces both the out-of-fold predictions for
    the validation rows and the fold's test predictions.

    Args:
        train_x: Integer array of padded training sequences, one row per sample.
        train_y: Binary training labels.
        test_x: Integer array of padded test sequences.
        c: Config object (batch sizes, epochs, learning rate, ...).
        embeddings: Embedding module shared by the models of all folds.
        trial: Identifier used in the checkpoint directory
            ``./ckpt/gauss/{trial}/``.

    Returns:
        ``(train_preds, test_preds)``: out-of-fold probabilities for every
        training row, and test probabilities averaged over the folds.
    """
    splits = list(StratifiedKFold(n_splits=c.num_cv_splits, shuffle=True).split(train_x, train_y))
    x_test_cuda = torch.tensor(test_x, dtype=torch.long).cuda(cuda_idx)
    test = torch.utils.data.TensorDataset(x_test_cuda)
    test_loader = torch.utils.data.DataLoader(test, batch_size=c.test_batch_size, shuffle=False)
    train_preds = np.zeros((len(train_x)))
    test_preds = np.zeros((len(test_x)))

    # All-zero column used by `cut_length` to detect leading padding.
    mask = torch.zeros((c.max_length, 1), dtype=torch.long).cuda(cuda_idx)
    ckpt_path = Path(f'./ckpt/gauss/{trial}/')

    for fold, (train_idx, valid_idx) in enumerate(splits):
        x_train_fold = torch.tensor(train_x[train_idx], dtype=torch.long).cuda(cuda_idx)
        y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda(cuda_idx)
        x_val_fold = torch.tensor(train_x[valid_idx], dtype=torch.long).cuda(cuda_idx)
        y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda(cuda_idx)
        y_val_numpy = y_val_fold.cpu().numpy()

        model = SimpleRNN(c, embeddings)
        model.cuda(cuda_idx)

        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=c.learning_rate)

        train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
        train_loader = torch.utils.data.DataLoader(train, batch_size=c.batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=c.test_batch_size, shuffle=False)

        # Checkpoint file is keyed by the fold index.
        ckpt_file = ckpt_path / f'{fold}_model.pt'
        best_f1 = 0.0
        best_valid_preds = None  # validation predictions of the checkpointed epoch
        valid_preds_fold = np.zeros((x_val_fold.size(0)))

        print(f'Fold {fold + 1}')

        for epoch in range(c.num_epochs):
            start_time = time.time()

            model.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = cut_length(x_batch, mask)
                y_pred = model(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), c.clip_grad)
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            model.eval()
            valid_preds_fold = np.zeros((x_val_fold.size(0)))
            avg_val_loss = 0.

            # validation prediction
            with torch.no_grad():
                for batch_idx, (x_batch, y_batch) in enumerate(valid_loader):
                    x_batch = cut_length(x_batch, mask)
                    y_pred = model(x_batch).detach()
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    lo = batch_idx * c.test_batch_size
                    hi = (batch_idx + 1) * c.test_batch_size
                    valid_preds_fold[lo:hi] = sigmoid(y_pred.cpu().numpy())[:, 0]
            search_result = threshold_search(y_val_numpy, valid_preds_fold)
            valid_pred_targets = valid_preds_fold > search_result['threshold']
            val_f1 = f1_score(y_val_numpy, valid_pred_targets)

            elapsed_time = time.time() - start_time
            print('Epoch {}/{}  loss={:.4f}  val_loss={:.4f}  f1={:.3f}  time={:.2f}s'.format(
                epoch + 1, c.num_epochs, avg_loss, avg_val_loss, val_f1, elapsed_time))
            if best_f1 < val_f1:
                print(f'model_saved at f1: {val_f1} from {best_f1}')
                ckpt_path.mkdir(parents=True, exist_ok=True)
                torch.save(model.state_dict(), ckpt_file)
                best_f1 = val_f1
                # Keep the predictions that belong to the checkpointed weights.
                best_valid_preds = valid_preds_fold

        # test prediction
        if best_valid_preds is not None:
            model.load_state_dict(torch.load(str(ckpt_file)))  # load best model
        else:
            # No epoch improved on F1 = 0, so nothing was checkpointed for this
            # fold; fall back to the weights and predictions of the last epoch.
            best_valid_preds = valid_preds_fold
        model.eval()
        test_preds_fold = np.zeros(len(test_x))
        with torch.no_grad():
            for batch_idx, (x_batch, ) in enumerate(test_loader):
                x_batch = cut_length(x_batch, mask)
                y_pred = model(x_batch).detach()
                lo = batch_idx * c.test_batch_size
                hi = (batch_idx + 1) * c.test_batch_size
                test_preds_fold[lo:hi] = sigmoid(y_pred.cpu().numpy())[:, 0]

        # Out-of-fold and test predictions now come from the same (best) weights.
        train_preds[valid_idx] = best_valid_preds
        test_preds += test_preds_fold / len(splits)
    return train_preds, test_preds

In [20]:
# Run the cross-validated training, tune the decision threshold on the
# out-of-fold training predictions, and score the fold-averaged test
# predictions with that threshold.
train_preds, test_preds = training(train_x, train_y, test_x, c, embeddings)
search_result = threshold_search(train_y, train_preds)
print(search_result)
test_pred_targets = test_preds > search_result['threshold']
# `test_df` carries a `target` column, so the test F1 can be computed locally.
f1 = f1_score(test_df['target'], test_pred_targets)
print('f1 score:', f1)

Fold 1


  import sys


Epoch 1/15  loss=0.1557  val_loss=0.1100  f1=0.641  time=65.89s
model_saved at f1: 0.640981626295378 from 0.0
Epoch 2/15  loss=0.1083  val_loss=0.1031  f1=0.665  time=124.11s
model_saved at f1: 0.6650526179869586 from 0.640981626295378
Epoch 3/15  loss=0.1027  val_loss=0.1025  f1=0.676  time=125.97s
model_saved at f1: 0.6756417498274022 from 0.6650526179869586
Epoch 4/15  loss=0.0987  val_loss=0.0999  f1=0.681  time=125.51s
model_saved at f1: 0.681150240689131 from 0.6756417498274022
Epoch 5/15  loss=0.0952  val_loss=0.1003  f1=0.686  time=126.21s
model_saved at f1: 0.6857984754339889 from 0.681150240689131
Epoch 6/15  loss=0.0921  val_loss=0.0994  f1=0.685  time=125.68s
Epoch 7/15  loss=0.0893  val_loss=0.0999  f1=0.687  time=125.59s
model_saved at f1: 0.6865080632284848 from 0.6857984754339889
Epoch 8/15  loss=0.0870  val_loss=0.1008  f1=0.685  time=125.79s
Epoch 9/15  loss=0.0841  val_loss=0.1016  f1=0.682  time=126.31s
Epoch 10/15  loss=0.0815  val_loss=0.1002  f1=0.685  time=125.8

In [21]:
# Repeat of the training/evaluation run with identical arguments. `trial` is
# left at its default, so checkpoints go to the same directory as other runs.
# NOTE(review): no seed is set in the visible cells, so this presumably
# measures run-to-run variance — confirm.
train_preds, test_preds = training(train_x, train_y, test_x, c, embeddings)
search_result = threshold_search(train_y, train_preds)
print(search_result)
test_pred_targets = test_preds > search_result['threshold']
f1 = f1_score(test_df['target'], test_pred_targets)
print('f1 score:', f1)

Fold 1


  import sys


Epoch 1/15  loss=0.1579  val_loss=0.1117  f1=0.647  time=123.33s
model_saved at f1: 0.6472036359045574 from 0.0
Epoch 2/15  loss=0.1073  val_loss=0.1077  f1=0.669  time=125.75s
model_saved at f1: 0.6691616287624348 from 0.6472036359045574
Epoch 3/15  loss=0.1017  val_loss=0.1039  f1=0.677  time=126.19s
model_saved at f1: 0.6767928758644647 from 0.6691616287624348
Epoch 4/15  loss=0.0980  val_loss=0.1003  f1=0.681  time=125.79s
model_saved at f1: 0.6808074991201971 from 0.6767928758644647
Epoch 5/15  loss=0.0946  val_loss=0.1035  f1=0.681  time=125.74s
Epoch 6/15  loss=0.0919  val_loss=0.1042  f1=0.687  time=126.27s
model_saved at f1: 0.687035560516664 from 0.6808074991201971
Epoch 7/15  loss=0.0890  val_loss=0.1024  f1=0.686  time=125.66s
Epoch 8/15  loss=0.0865  val_loss=0.1001  f1=0.686  time=126.06s
Epoch 9/15  loss=0.0839  val_loss=0.1052  f1=0.686  time=126.29s
Epoch 10/15  loss=0.0815  val_loss=0.1032  f1=0.684  time=125.88s
Epoch 11/15  loss=0.0793  val_loss=0.1095  f1=0.685  ti

In [22]:
# Repeat of the training/evaluation run with identical arguments. `trial` is
# left at its default, so checkpoints go to the same directory as other runs.
# NOTE(review): no seed is set in the visible cells, so this presumably
# measures run-to-run variance — confirm.
train_preds, test_preds = training(train_x, train_y, test_x, c, embeddings)
search_result = threshold_search(train_y, train_preds)
print(search_result)
test_pred_targets = test_preds > search_result['threshold']
f1 = f1_score(test_df['target'], test_pred_targets)
print('f1 score:', f1)

Fold 1


  import sys


Epoch 1/15  loss=0.1585  val_loss=0.1085  f1=0.647  time=123.42s
model_saved at f1: 0.6469420429074608 from 0.0
Epoch 2/15  loss=0.1081  val_loss=0.1040  f1=0.666  time=125.71s
model_saved at f1: 0.6656043850983141 from 0.6469420429074608
Epoch 3/15  loss=0.1022  val_loss=0.1040  f1=0.677  time=124.67s
model_saved at f1: 0.6772950016334531 from 0.6656043850983141
Epoch 4/15  loss=0.0978  val_loss=0.1035  f1=0.681  time=125.42s
model_saved at f1: 0.6814071636259067 from 0.6772950016334531
Epoch 5/15  loss=0.0946  val_loss=0.1041  f1=0.684  time=125.88s
model_saved at f1: 0.6842207288705092 from 0.6814071636259067
Epoch 6/15  loss=0.0913  val_loss=0.1009  f1=0.685  time=125.80s
model_saved at f1: 0.6847258485639687 from 0.6842207288705092
Epoch 7/15  loss=0.0885  val_loss=0.1003  f1=0.683  time=124.93s
Epoch 8/15  loss=0.0858  val_loss=0.1074  f1=0.685  time=125.72s
model_saved at f1: 0.6849191231072225 from 0.6847258485639687
Epoch 9/15  loss=0.0832  val_loss=0.1103  f1=0.685  time=126.

In [23]:
# Repeat of the training/evaluation run with identical arguments. `trial` is
# left at its default, so checkpoints go to the same directory as other runs.
# NOTE(review): no seed is set in the visible cells, so this presumably
# measures run-to-run variance — confirm.
train_preds, test_preds = training(train_x, train_y, test_x, c, embeddings)
search_result = threshold_search(train_y, train_preds)
print(search_result)
test_pred_targets = test_preds > search_result['threshold']
f1 = f1_score(test_df['target'], test_pred_targets)
print('f1 score:', f1)

Fold 1


  import sys


Epoch 1/15  loss=0.1589  val_loss=0.1076  f1=0.651  time=123.83s
model_saved at f1: 0.6507530364372469 from 0.0
Epoch 2/15  loss=0.1083  val_loss=0.1034  f1=0.667  time=125.64s
model_saved at f1: 0.6672749193522108 from 0.6507530364372469
Epoch 3/15  loss=0.1025  val_loss=0.1020  f1=0.675  time=125.71s
model_saved at f1: 0.67471712465532 from 0.6672749193522108
Epoch 4/15  loss=0.0985  val_loss=0.0997  f1=0.682  time=125.60s
model_saved at f1: 0.6824446345404428 from 0.67471712465532
Epoch 5/15  loss=0.0950  val_loss=0.0984  f1=0.685  time=125.72s
model_saved at f1: 0.6848344978027884 from 0.6824446345404428
Epoch 6/15  loss=0.0921  val_loss=0.0997  f1=0.687  time=126.04s
model_saved at f1: 0.6866495760205088 from 0.6848344978027884
Epoch 7/15  loss=0.0893  val_loss=0.0972  f1=0.688  time=125.55s
model_saved at f1: 0.6875427677669523 from 0.6866495760205088
Epoch 8/15  loss=0.0867  val_loss=0.0984  f1=0.688  time=125.90s
model_saved at f1: 0.6877034011869058 from 0.6875427677669523
Epo

In [24]:
# Repeat of the training/evaluation run with identical arguments. `trial` is
# left at its default, so checkpoints go to the same directory as other runs.
# NOTE(review): no seed is set in the visible cells, so this presumably
# measures run-to-run variance — confirm.
train_preds, test_preds = training(train_x, train_y, test_x, c, embeddings)
search_result = threshold_search(train_y, train_preds)
print(search_result)
test_pred_targets = test_preds > search_result['threshold']
f1 = f1_score(test_df['target'], test_pred_targets)
print('f1 score:', f1)

Fold 1


  import sys


Epoch 1/15  loss=0.1530  val_loss=0.1082  f1=0.645  time=67.82s
model_saved at f1: 0.6450403877221325 from 0.0
Epoch 2/15  loss=0.1079  val_loss=0.1059  f1=0.673  time=68.20s
model_saved at f1: 0.6725951424746606 from 0.6450403877221325
Epoch 3/15  loss=0.1020  val_loss=0.0994  f1=0.682  time=67.98s
model_saved at f1: 0.6818716798676718 from 0.6725951424746606
Epoch 4/15  loss=0.0983  val_loss=0.0982  f1=0.688  time=67.84s
model_saved at f1: 0.6882559752282038 from 0.6818716798676718
Epoch 5/15  loss=0.0948  val_loss=0.0991  f1=0.690  time=68.25s
model_saved at f1: 0.6899415963659962 from 0.6882559752282038
Epoch 6/15  loss=0.0918  val_loss=0.0981  f1=0.693  time=67.98s
model_saved at f1: 0.6933039047031712 from 0.6899415963659962
Epoch 7/15  loss=0.0892  val_loss=0.0976  f1=0.693  time=67.87s
Epoch 8/15  loss=0.0866  val_loss=0.1022  f1=0.693  time=68.28s
Epoch 9/15  loss=0.0840  val_loss=0.1011  f1=0.690  time=67.85s
Epoch 10/15  loss=0.0819  val_loss=0.1026  f1=0.689  time=67.75s
Ep