In [1]:
import gc
import math
import os
import pytz
import random
import time
import warnings
from tqdm.autonotebook import tqdm
import codecs
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import wandb
from scipy import stats
from nltk import ngrams
from text_unidecode import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


import torch
import transformers

import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast
from torch.nn.functional import one_hot
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset,WeightedRandomSampler
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, log_loss
from sklearn.model_selection import GroupKFold, StratifiedKFold

%env TOKENIZERS_PARALLELISM=true
warnings.filterwarnings("ignore")

print(torch.cuda.get_device_name() if torch.cuda.is_available() else 'No Cuda')
print(f'torch.__version__: {torch.__version__}')
print(f'transformers.__version__: {transformers.__version__}')

In [2]:
# Authenticate with Weights & Biases using a key read from the WANDB_API_KEY
# environment variable instead of a literal stored in the notebook.
# When the variable is unset, key=None is passed and wandb.login() is left to
# locate credentials on its own.
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# notebook history and should be revoked and re-issued.
wandb.login(key=os.environ.get('WANDB_API_KEY'))
run = wandb.init(project='LSTM',
                 name='exp_9',
                 save_code=True)

In [3]:
def softmax(x):
    """Return the softmax of a collection of scores.

    The largest score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the ratio from becoming ``nan``) when scores are large.

    Args:
        x: array-like of scores (list, ``np.ndarray`` or ``pd.Series``). The
            normalisation runs over all elements of ``x``.

    Returns:
        Values of the same shape as ``x`` that sum to 1. A ``pd.Series`` input
        yields a ``pd.Series`` with the same index, so the function can be
        used with ``DataFrame.apply(softmax, axis=1)``.
    """
    values = np.asarray(x)
    # An empty input has no maximum; a zero shift keeps empty-in/empty-out.
    peak = values.max() if values.size else 0.0
    # np.subtract (rather than np.asarray(x) - peak) preserves the container
    # type of ``x``, e.g. a pandas Series stays a Series.
    exp_scores = np.exp(np.subtract(x, peak))
    return exp_scores / np.sum(exp_scores)

In [4]:
# Load the five numbered prediction tables (prediction0..prediction4).
f0, f1, f2, f3, f4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/preds-by-v5dbls42/prediction0.csv',
        '../input/preds-by-v5dbls42/prediction1.csv',
        '../input/preds-by-v5dbls42/prediction2.csv',
        '../input/preds-by-v5dbls42/prediction3.csv',
        '../input/preds-by-v5dbls42/prediction4.csv',
    )
)
# Predictions for the test set, merged into df_test further down.
preds = pd.read_csv('../input/predict-test-set/predicted_test_by_v5DBlS42.csv')

In [5]:
# Read the train and test feature tables.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/for-lstm2/TRAIN_CATBOOST.csv',
        '../input/for-lstm2/TEST_CATBOOST.csv',
    )
)

In [6]:
# Join the test-set predictions onto the test rows by discourse_id
# (merge's default join type applies).
df_test = df_test.merge(preds, on='discourse_id')

In [7]:
# Stack the five prediction tables and keep only the id plus the three
# class-score columns.
_score_cols = ['Ineffective', 'Adequate', 'Effective']
f_full = pd.concat([f0, f1, f2, f3, f4])[['discourse_id'] + _score_cols]
# Apply softmax row by row so the three scores of each row sum to 1.
f_full[_score_cols] = f_full[_score_cols].apply(softmax, axis=1)

In [8]:
# Join the stacked, softmaxed predictions onto the training rows by
# discourse_id (merge's default join type applies).
df_train = df_train.merge(f_full, on='discourse_id')

In [9]:
class CFG:
    """Training settings read by the training and validation functions."""
    # Number of full passes over the training loader.
    epochs = 100
    # Learning rate handed to the optimizer.
    lr = 1e-4
    # Number of batches whose gradients are accumulated per optimizer step.
    accomulate = 4
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
class DatasetLSTM(Dataset):
    """Dataset that yields one essay per item as an ordered block of rows.

    Each item gathers all rows of ``df`` that share one ``essay_id``, ordered
    by ``start_pos``, and returns their feature matrix together with a
    one-hot encoding of the ``label`` column.

    Args:
        df: DataFrame with the columns ``essay_id``, ``discourse_id``,
            ``label`` and every column listed in ``FEATURE_COLUMNS``.
    """

    # Feature columns, in the order they appear along the last tensor axis.
    FEATURE_COLUMNS = ['code_topic', 'code_dis_type', 'start_pos', 'end_pos', 'len_text',
                       'len_essay', 'size_essay', 'Ineffective', 'Adequate', 'Effective']
    # Width of the one-hot label rows.
    NUM_CLASSES = 3

    def __init__(self, df):
        self.df = df
        self.essay_id = self.df.essay_id.unique()
        # Record the row positions of every essay once, so __getitem__ does
        # not have to scan the whole frame with a boolean mask on each access.
        self._rows_by_essay = {}
        for position, essay in enumerate(self.df.essay_id.tolist()):
            self._rows_by_essay.setdefault(essay, []).append(position)

    def __len__(self):
        """Number of distinct essays."""
        return len(self.essay_id)

    def __getitem__(self, item):
        """Return ``(discourse_ids, {'inputs': ..., 'label': ...})`` for one essay.

        ``inputs`` is a float tensor of shape (rows, len(FEATURE_COLUMNS)) and
        ``label`` a float one-hot tensor of shape (rows, NUM_CLASSES); rows are
        ordered by ``start_pos``.
        """
        positions = self._rows_by_essay.get(self.essay_id[item], [])
        span_df = self.df.iloc[positions].sort_values(by='start_pos')
        disc_id = span_df.discourse_id.tolist()
        inputs = span_df.loc[:, self.FEATURE_COLUMNS].values
        # One-hot encode: put a 1.0 in the column given by each row's label.
        label = np.zeros((len(span_df), self.NUM_CLASSES))
        label[np.arange(len(span_df)), span_df.label.tolist()] = 1.0

        return disc_id, {
                        'inputs': torch.tensor(inputs, dtype=torch.float),
                        'label':  torch.tensor(label, dtype=torch.float)
                        }

In [12]:
class AverageMeter(object):
    """Tracks the most recent value and the weighted running mean of all values."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted as ``n`` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


In [13]:
def crit(y_preds, y_true):
    """Cross-entropy between predicted probabilities and target weights.

    Computes ``mean_over_rows(sum_over_dim1(-y_true * log(y_preds)))``.
    ``torch.xlogy`` is used so that an entry with ``y_true == 0`` contributes
    exactly 0 even when the matching prediction is 0; the plain product
    ``0 * log(0)`` would evaluate to NaN and poison the whole loss.

    Args:
        y_preds: tensor of predicted probabilities; classes along dim 1.
        y_true: tensor of target weights with the same shape as ``y_preds``.

    Returns:
        Scalar tensor holding the mean per-row cross-entropy.
    """
    return -torch.xlogy(y_true, y_preds).sum(dim=1).mean()

In [14]:
def val_step(model, data_loader, cfg=CFG):
    """Evaluate ``model`` on ``data_loader`` and return the log loss.

    The model is switched to eval mode and is left in that mode; callers that
    continue training must call ``model.train()`` themselves.

    Args:
        model: module called as ``model(inputs)`` on each batch's
            ``'inputs'`` tensor.
        data_loader: iterable yielding ``(disc_id, batch)`` pairs where
            ``batch`` is a dict with ``'inputs'`` and ``'label'`` tensors.
            Only element 0 of ``'label'`` is used from each batch.
        cfg: configuration object providing ``device``.

    Returns:
        ``sklearn.metrics.log_loss`` of the softmaxed model outputs against
        the collected labels.
    """
    model.eval()
    _out = []
    _lab = []
    with torch.no_grad():
        for disc_id, inputs in data_loader:
            out = model(inputs['inputs'].to(cfg.device))
            _out.append(out.to('cpu'))
            # Labels are only needed on the CPU for the metric, so they are
            # not sent to the device first.
            _lab.append(inputs['label'][0].to('cpu'))
    _out = torch.concat(_out).softmax(dim=1).numpy()
    _lab = torch.concat(_lab).numpy()
    return log_loss(_lab, _out)
        

In [25]:
def train_loop(model, train, test, cfg):
    """Train ``model`` with gradient accumulation and keep the best checkpoint.

    Every epoch iterates once over ``train`` (one essay per batch), then
    evaluates on ``test`` with ``val_step``, logs ``val_loss`` to wandb and,
    whenever the validation loss improves, saves the model's state dict to
    ``OUTPUT + 'LSTM_final.pth'``.

    Relies on the module-level names ``loss_func`` (training criterion) and
    ``OUTPUT`` (checkpoint directory prefix) being defined before the call.

    Args:
        model: module to train; called as ``model(batch['inputs'])``.
        train: DataFrame wrapped in ``DatasetLSTM`` for training.
        test: DataFrame wrapped in ``DatasetLSTM`` for validation.
        cfg: configuration object providing ``epochs``, ``lr``,
            ``accomulate`` and ``device``.
    """
    best_loss = float('inf')
    train_loader = DataLoader(DatasetLSTM(train), batch_size=1, shuffle=True)
    test_loader = DataLoader(DatasetLSTM(test), batch_size=1, shuffle=False)
    it = tqdm(range(cfg.epochs), desc='Training.', total=cfg.epochs)
    param_lrs = [{'params': param, 'lr': cfg.lr} for param in model.parameters()]
    optimizer = torch.optim.AdamW(param_lrs, lr=cfg.lr)

    # The scheduler advances once per optimizer update, not once per batch,
    # so its horizon must be counted in updates; otherwise the cosine decay
    # only ever covers the first 1/accomulate of its curve.
    n_batches = len(train_loader)
    updates_per_epoch = math.ceil(n_batches / cfg.accomulate)
    scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=updates_per_epoch * cfg.epochs,
                                                num_cycles=0.5)

    for epoch in it:
        loss_stat = AverageMeter()
        model.train()
        optimizer.zero_grad()
        for step, (disc_id, inputs) in enumerate(train_loader):
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            out = model(inputs['inputs'])
            loss = loss_func(out, inputs['label'][0])
            # Store a plain float: accumulating the tensor itself would keep
            # every batch's autograd graph alive for the whole epoch.
            loss_stat.update(loss.item(), 1)
            # Scale so that the accumulated gradient is an average.
            (loss / cfg.accomulate).backward()
            # Update after each full accumulation window and also on the last
            # batch, so a trailing partial window is applied instead of
            # leaking its gradients into the next epoch.
            if (step + 1) % cfg.accomulate == 0 or (step + 1) == n_batches:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            it.set_description(
                    f'Training | '
                    f'epoch: {epoch} | '
                    f'loss: {loss_stat.avg:.5f} | ' )
        avg_val_loss = val_step(model, test_loader, cfg)
        wandb.log({'val_loss': avg_val_loss})
        if best_loss > avg_val_loss:
            print(f'impruved best_loss form {best_loss} to {avg_val_loss}')
            best_loss = avg_val_loss
            path = OUTPUT + 'LSTM_final'
            torch.save({'model': model.state_dict()}, path+'.pth')

In [26]:
# Training criterion: cross-entropy on raw class scores. 'mean' is the
# default reduction of nn.CrossEntropyLoss, so it is not spelled out.
loss_func = nn.CrossEntropyLoss()

In [27]:
class Model(nn.Module):
    """Three-layer LSTM that emits a 3-value score vector per sequence step.

    Args:
        input_size: number of features per step. The default of 10 matches
            the ten feature columns produced by ``DatasetLSTM``; the previous
            hard-coded value of 11 made the forward pass fail with a
            feature-size mismatch on those inputs.
    """

    def __init__(self, input_size=10):
        super().__init__()
        # batch_first=True: a DataLoader with batch_size=1 delivers tensors of
        # shape (1, steps, features). Without it the LSTM would read that as a
        # sequence of length 1 with `steps` independent batch entries, and no
        # state would flow from one step to the next.
        self.lstm = nn.LSTM(input_size, 3, num_layers=3, batch_first=True)

    def forward(self, inputs):
        """Return the per-step outputs of the first sequence in the batch.

        Args:
            inputs: float tensor of shape (batch, steps, input_size).

        Returns:
            Tensor of shape (steps, 3) for batch element 0.
        """
        result = self.lstm(inputs)[0]
        return result[0]

In [None]:
if __name__ == '__main__':
    # Prefix for the checkpoint path; train_loop reads this module-level name.
    OUTPUT = './'
    # Build the network and place it on the configured device in one step.
    model = Model().to(CFG.device)
    train_loop(model, df_train, df_test, CFG)