In [5]:
import logging
import nltk
from nltk import word_tokenize
import pandas as pd
import numpy as np
import gc
import os
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from torch import optim
import torchtext
import random
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bhargav/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
text = torchtext.data.Field(lower=True, batch_first=True, tokenize=word_tokenize)
qid = torchtext.data.Field()
target = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True)
train = torchtext.data.TabularDataset(path='data/train.csv', format='csv',
                                      fields={'question_text': ('text',text),
                                              'target': ('target',target)})
test = torchtext.data.TabularDataset(path='data/test.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', text)})
text.build_vocab(train, test, min_freq=3)
qid.build_vocab(test)
text.vocab.load_vectors(torchtext.vocab.Vectors('data/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'))
print(text.vocab.vectors.shape)

  0%|          | 0/999994 [00:00<?, ?it/s]Skipping token b'999994' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 999510/999994 [01:33<00:00, 10740.29it/s]

torch.Size([85065, 300])


In [8]:
random.seed(2018)
train, val = train.split(split_ratio=0.9, random_state=random.getstate())

In [9]:
class BiLSTM(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
    
    def forward(self, sents):
        x = self.embedding(sents)
        x = torch.transpose(x, dim0=1, dim1=0)
        lstm_out, (h_n, c_n) = self.lstm(x)
        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return y

100%|█████████▉| 999510/999994 [01:50<00:00, 10740.29it/s]

In [10]:


def training(epoch, model, eval_every, loss_func, optimizer, train_iter, val_iter, early_stop=1, warmup_epoch=2):
    
    step = 0
    max_loss = 1e5
    no_improve_epoch = 0
    no_improve_in_previous_epoch = False
    fine_tuning = False
    train_record = []
    val_record = []
    losses = []
    
    for e in range(epoch):
        if e >= warmup_epoch:
            if no_improve_in_previous_epoch:
                no_improve_epoch += 1
                if no_improve_epoch >= early_stop:
                    break
            else:
                no_improve_epoch = 0
            no_improve_in_previous_epoch = True
        if not fine_tuning and e >= warmup_epoch:
            model.embedding.weight.requires_grad = True
            fine_tuning = True
        train_iter.init_epoch()
        for train_batch in iter(train_iter):
            step += 1
            model.train()
            x = train_batch.text.cuda()
            y = train_batch.target.type(torch.Tensor).cuda()
            model.zero_grad()
            pred = model.forward(x).view(-1)
            loss = loss_function(pred, y)
            losses.append(loss.cpu().data.numpy())
            train_record.append(loss.cpu().data.numpy())
            loss.backward()
            optimizer.step()
            if step % eval_every == 0:
                model.eval()
                model.zero_grad()
                val_loss = []
                for val_batch in iter(val_iter):
                    val_x = val_batch.text.cuda()
                    val_y = val_batch.target.type(torch.Tensor).cuda()
                    val_pred = model.forward(val_x).view(-1)
                    val_loss.append(loss_function(val_pred, val_y).cpu().data.numpy())
                val_record.append({'step': step, 'loss': np.mean(val_loss)})
                print('epcoh {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} '.format(
                            e, step, np.mean(losses), val_record[-1]['loss']))
                if e >= warmup_epoch:
                    if val_record[-1]['loss'] <= max_loss:
                        save(m=model, info={'step': step, 'epoch': e, 'train_loss': np.mean(losses),
                                            'val_loss': val_record[-1]['loss']})
                        max_loss = val_record[-1]['loss']
                        no_improve_in_previous_epoch = False
    

def save(m, info):
    torch.save(info, 'best_model.info')
    torch.save(m, 'best_model.m')
    
def load():
    m = torch.load('best_model.m')
    info = torch.load('best_model.info')
    return m, info


In [11]:
batch_size = 128
train_iter = torchtext.data.BucketIterator(dataset=train,
                                               batch_size=batch_size,
                                               sort_key=lambda x: x.text.__len__(),
                                               shuffle=True,
                                               sort=False)
val_iter = torchtext.data.BucketIterator(dataset=val,
                                             batch_size=batch_size,
                                             sort_key=lambda x: x.text.__len__(),
                                             train=False,
                                             sort=False)
model = BiLSTM(text.vocab.vectors, lstm_layer=2, padding_idx=text.vocab.stoi[text.pad_token], hidden_dim=128).cuda()
# loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([pos_w]).cuda())
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)

In [12]:

training(model=model, epoch=20, eval_every=500,
         loss_func=loss_function, optimizer=optimizer, train_iter=train_iter,
        val_iter=val_iter, warmup_epoch=3, early_stop=2)

epcoh 00 - step 000500 - train_loss 0.1487 - val_loss 0.1268 
epcoh 00 - step 001000 - train_loss 0.1366 - val_loss 0.1267 
epcoh 00 - step 001500 - train_loss 0.1330 - val_loss 0.1217 
epcoh 00 - step 002000 - train_loss 0.1301 - val_loss 0.1222 
epcoh 00 - step 002500 - train_loss 0.1282 - val_loss 0.1181 
epcoh 00 - step 003000 - train_loss 0.1265 - val_loss 0.1154 
epcoh 00 - step 003500 - train_loss 0.1249 - val_loss 0.1139 
epcoh 00 - step 004000 - train_loss 0.1240 - val_loss 0.1124 
epcoh 00 - step 004500 - train_loss 0.1230 - val_loss 0.1116 
epcoh 00 - step 005000 - train_loss 0.1219 - val_loss 0.1115 
epcoh 00 - step 005500 - train_loss 0.1209 - val_loss 0.1108 
epcoh 00 - step 006000 - train_loss 0.1199 - val_loss 0.1113 
epcoh 00 - step 006500 - train_loss 0.1194 - val_loss 0.1093 
epcoh 00 - step 007000 - train_loss 0.1186 - val_loss 0.1096 
epcoh 00 - step 007500 - train_loss 0.1180 - val_loss 0.1080 
epcoh 00 - step 008000 - train_loss 0.1174 - val_loss 0.1088 
epcoh 00

  "type " + obj.__name__ + ". It won't be checked "


epcoh 03 - step 028500 - train_loss 0.1058 - val_loss 0.1015 
epcoh 03 - step 029000 - train_loss 0.1057 - val_loss 0.1009 
epcoh 03 - step 029500 - train_loss 0.1055 - val_loss 0.1025 
epcoh 03 - step 030000 - train_loss 0.1053 - val_loss 0.1012 
epcoh 03 - step 030500 - train_loss 0.1051 - val_loss 0.1023 
epcoh 03 - step 031000 - train_loss 0.1049 - val_loss 0.1012 
epcoh 03 - step 031500 - train_loss 0.1047 - val_loss 0.1005 
epcoh 03 - step 032000 - train_loss 0.1046 - val_loss 0.1032 
epcoh 03 - step 032500 - train_loss 0.1044 - val_loss 0.1003 
epcoh 03 - step 033000 - train_loss 0.1042 - val_loss 0.1015 
epcoh 03 - step 033500 - train_loss 0.1041 - val_loss 0.1003 
epcoh 03 - step 034000 - train_loss 0.1039 - val_loss 0.1016 
epcoh 03 - step 034500 - train_loss 0.1038 - val_loss 0.1005 
epcoh 03 - step 035000 - train_loss 0.1037 - val_loss 0.1003 
epcoh 03 - step 035500 - train_loss 0.1035 - val_loss 0.1021 
epcoh 03 - step 036000 - train_loss 0.1034 - val_loss 0.0993 
epcoh 03

In [13]:
model, m_info = load()
m_info

{'step': 44000, 'epoch': 4, 'train_loss': 0.10083235, 'val_loss': 0.09905402}

In [14]:
model.lstm.flatten_parameters()

In [16]:


model.eval()
val_pred = []
val_true = []
val_iter.init_epoch()
for val_batch in iter(val_iter):
    val_x = val_batch.text.cuda()
    val_true += val_batch.target.data.numpy().tolist()
    val_pred += torch.sigmoid(model.forward(val_x).view(-1)).cpu().data.numpy().tolist()



tmp = [0,0,0] # idx, cur, max
delta = 0
for tmp[0] in np.arange(0.1, 0.501, 0.01):
    tmp[1] = f1_score(val_true, np.array(val_pred)>tmp[0])
    if tmp[1] > tmp[2]:
        delta = tmp[0]
        tmp[2] = tmp[1]
print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))



best threshold is 0.3300 with F1 score: 0.6803


In [17]:


model.eval()
model.zero_grad()
test_iter = torchtext.data.BucketIterator(dataset=test,
                                    batch_size=batch_size,
                                    sort_key=lambda x: x.text.__len__(),
                                    sort=True)
test_pred = []
test_id = []

for test_batch in iter(test_iter):
    test_x = test_batch.text.cuda()
    test_pred += torch.sigmoid(model.forward(test_x).view(-1)).cpu().data.numpy().tolist()
    test_id += test_batch.qid.view(-1).data.numpy().tolist()
    



In [18]:
sub_df =pd.DataFrame()
sub_df['qid'] = [qid.vocab.itos[i] for i in test_id]
sub_df['prediction'] = (np.array(test_pred) >= delta).astype(int)

In [19]:
sub_df.to_csv("submission.csv", index=False)