In [1]:
from pathlib import Path
import os

DATA_PATH = Path("/kaggle/input/interiitahsg/")
CURRENT_PATH = Path("/kaggle/working/")

os.makedirs("models", exist_ok=True)

In [2]:
import numpy as np
import pandas as pd 

import transformers
import torch
import pytorch_lightning as pl

from argparse import ArgumentParser

class BinaryClassificationDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        args,
        main_text,
        mobile_tech_label,
    ):
        self.hparams = args
        self.main_text = main_text
        self.mobile_tech_label = mobile_tech_label
        
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(args.base_path)

    def __len__(self):
        return len(self.mobile_tech_label)

    def __getitem__(self, idx):
        # print("idx: ", idx)
        main_text = self.main_text[idx]
        mobile_tech_label = self.mobile_tech_label[idx]

        inputs = self.tokenizer(
            main_text,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.hparams.maxlen,
        )
        return {
            "ids_seq": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attn_masks": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "target": torch.tensor(mobile_tech_label, dtype=torch.float),
        }
        
class BinaryClassificationDataModule(pl.LightningDataModule):
    
    def __init__(self, args):
        super().__init__()
        self.hparams = args
        
    def train_dataloader(self):
        article = pd.read_pickle(DATA_PATH/"data/article_train_cleaned.pkl")
        tweet = pd.read_pickle(DATA_PATH/"data/tweet_train_cleaned.pkl")
        
        article = article.loc[:,["Text", "Mobile_Tech_Flag"]]
        tweet = tweet.loc[:,["Tweet_with_emoji_desc", "Mobile_Tech_Tag"]].rename(columns={"Tweet_with_emoji_desc":"Text", "Mobile_Tech_Tag":"Mobile_Tech_Flag"})

        combined = pd.concat([article, tweet]).sample(frac=1.0)

        ds = BinaryClassificationDataset(self.hparams, combined.Text.to_list(), combined.Mobile_Tech_Flag.to_list())

        return torch.utils.data.DataLoader(
            ds,
            batch_size=self.hparams.batch_size,
            shuffle=True,
            num_workers=8,
            drop_last=True,
        )
    
    def val_dataloader(self):
        article = pd.read_pickle(DATA_PATH/"data/article_dev_cleaned.pkl")
        tweet = pd.read_pickle(DATA_PATH/"data/tweet_dev_cleaned.pkl")
        
        article = article.loc[:,["Text", "Mobile_Tech_Flag"]]
        tweet = tweet.loc[:,["Tweet_with_emoji_desc", "Mobile_Tech_Tag"]].rename(columns={"Tweet_with_emoji_desc":"Text", "Mobile_Tech_Tag":"Mobile_Tech_Flag"})

        combined = pd.concat([article, tweet])

        ds = BinaryClassificationDataset(self.hparams, combined.Text.to_list(), combined.Mobile_Tech_Flag.to_list())

        return torch.utils.data.DataLoader(
            ds,
            batch_size=self.hparams.batch_size*2,
            shuffle=False,
            num_workers=8,
            drop_last=False,
        )

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
import torch.nn as nn
import pytorch_lightning as pl

import transformers
from transformers import AdamW, get_cosine_schedule_with_warmup


class MainModel(nn.Module):
    def __init__(self, args=None, **kwargs):
        super().__init__()
        self.base = transformers.AutoModel.from_pretrained(args.base_path)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768,1)

    def forward(self, ids_seq, attn_masks, token_type_ids=None):
        base_out = self.base(
            ids_seq, attention_mask=attn_masks, token_type_ids=token_type_ids
        )
        # using maxpooled output
        max_out = self.dropout(base_out[1])
        return self.linear(max_out)


class SequenceClassicationLightningModule(pl.LightningModule):
    def __init__(self, args, **kwargs):
        super().__init__()

        self.save_hyperparameters(args)
        self.model = MainModel(self.hparams)

    @staticmethod
    def loss(logits, targets):
        return nn.BCEWithLogitsLoss()(logits, targets)

    def shared_step(self, batch):
        ids_seq, attn_masks, target = (
            batch["ids_seq"],
            batch["attn_masks"],
            batch["target"],
        )
        logits = self.model(ids_seq, attn_masks).squeeze()
        loss = self.loss(logits, target)
        return logits, loss

    def training_step(self, batch, batch_idx):
        logits, loss = self.shared_step(batch)

        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        return {"loss": loss, "logits": logits, "true_preds": batch["target"]}

    def validation_step(self, batch, batch_idx):
        logits, loss = self.shared_step(batch)

        self.log(
            "valid_loss",
            loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return {"logits": logits, "true_preds": batch["target"]}

    def configure_optimizers(self):
        grouped_parameters = [
            {"params": self.model.base.parameters(), "lr": self.hparams.base_lr},
            {"params": self.model.linear.parameters(), "lr": self.hparams.linear_lr},
        ]
        optim = AdamW(grouped_parameters, lr=self.hparams.base_lr)

        # num_training_steps = (
        #     4863 // (self.hparams.batch_size * self.hparams.accumulate_grad_batches)
        # ) * self.hparams.max_epochs
        # sched = get_cosine_schedule_with_warmup(
        #     optim, num_warmup_steps=0, num_training_steps=num_training_steps
        # )

        # return [optim], [sched]
        return optim

    def training_epoch_end(self, training_step_outputs):
        y_pred = torch.sigmoid(torch.cat([out["logits"] for out in training_step_outputs])).to("cpu").detach().numpy() >= 0.5
        y_true = torch.cat([out["true_preds"] for out in training_step_outputs]).to("cpu", dtype=int).detach().numpy()
        
        acc = metrics.accuracy_score(y_pred, y_true)
        f1 = metrics.f1_score(y_pred, y_true)
        
        self.log("train_acc", acc)
        self.log("train_f1", f1)
    
    def validation_epoch_end(self, validation_step_outputs):
        y_pred = torch.sigmoid(torch.cat([out["logits"] for out in validation_step_outputs])).to("cpu").detach().numpy() >= 0.5
        y_true = torch.cat([out["true_preds"] for out in validation_step_outputs]).to("cpu", dtype=int).detach().numpy()
        
        acc = metrics.accuracy_score(y_pred, y_true)
        f1 = metrics.f1_score(y_pred, y_true)
        
        self.log("val_acc", acc)
        self.log("val_f1", f1)

In [4]:
# import numpy as np
# from sklearn import metrics

# ypred = np.random.uniform(0,1,size=(128,))
# ytrue = np.random.randint(0,2,size=(128,))

# ypred = ypred >=0.5
# print(metrics.accuracy_score(ypred,ytrue))

In [5]:
import os
from argparse import ArgumentParser

import torch
import pytorch_lightning as pl

import wandb
from pytorch_lightning.loggers import WandbLogger

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# from dataloader import BinaryClassificationDataModule
# from model import SequenceClassicationLightningModule

class ToggleBaseTraining(pl.Callback):
    def on_train_epoch_start(self, trainer, pl_module):
        print("-" * 100)
        print("ToggleBaseTraining Callback working.............")
        if trainer.current_epoch == 0:
            print(
                f"current_epoch is: {trainer.current_epoch} and freezing BERT layer's parameters"
            )
            for p in pl_module.model.base.parameters():
                p.requires_grad = False
        else:
            print(
                f"current_epoch is: {trainer.current_epoch} and unfreezing BERT layer's parameters for training"
            )
            for p in pl_module.model.base.parameters():
                p.requires_grad = True
        print("-" * 100)

class SaveModelWeights(pl.Callback):
    def __init__(self, save_from_epoch=1):
        self.save_from_epoch =save_from_epoch

    def on_validation_end(self, trainer, pl_module):
        os.makedirs("../models/", exist_ok=True)
        print("-" * 100)
        print("SaveModelWeight Callback working.............")
        print(f"trainer.current_epoch: {trainer.current_epoch}")
        if trainer.current_epoch >= self.save_from_epoch:
            m_filepath = f"../models/{pl_module.hparams.model_name}-epoch-{trainer.current_epoch}.pt"
            torch.save(pl_module.model.state_dict(), m_filepath)
            print(f"saved current model weights in file: {m_filepath}")
        print("-" * 100)

In [6]:
pl.seed_everything(420)

parser = ArgumentParser()

# trainer related arguments
parser.add_argument(
    "--gpus",
    default=1,
    help="if value is 0 cpu will be used, if string then that gpu device will be used",
)
parser.add_argument("--checkpoint_callback", action="store_true")
parser.add_argument("--logger", action="store_true")
parser.add_argument("--max_epochs", default=5, type=int)
parser.add_argument("--progress_bar_refresh_rate", default=0, type=int)
parser.add_argument("--accumulate_grad_batches", default=2, type=int)
parser.add_argument("--model_name", default="ahsg", type=str)

# data related arguments
parser.add_argument("--batch_size", default=8, type=int)
parser.add_argument("--maxlen", default=512, type=int)

# model related arguments
parser.add_argument("--base_path", type=str, default="xlm-roberta-base")
parser.add_argument("--base_lr", default=1e-5, type=int)
parser.add_argument("--linear_lr", default=5e-3, type=int)
parser.add_argument("--base_dropout", default=0.3, type=float)
parser.add_argument(
    "--bert_output_used",
    default="maxpooled",
    type=str,
    choices=["maxpooled", "weighted_sum"],
)
parser.add_argument("--run_name", default=None)
# parser = pl.Trainer.add_argparse_args(parser)

args = parser.parse_known_args()
args = args[0]

args.effective_batch_size = args.batch_size * args.accumulate_grad_batches
args.log_every_n_steps = args.accumulate_grad_batches * 5

if not torch.cuda.is_available():
    args.gpus = 0

Global seed set to 420


In [7]:
args.gpus = 1
args.accumulate_grad_batches = 1
args.batch_size = 8
args.run_name = "test6"
args.logger = True

In [8]:
pl_model = SequenceClassicationLightningModule(args)
data = BinaryClassificationDataModule(args)

In [9]:
# wandb.login()

In [10]:
if args.logger:
    args.logger = WandbLogger(
        project="ahsg", entity='professor',
        name=args.run_name if (args.run_name is not None) else None,
    )

trainer = pl.Trainer.from_argparse_args(
    args,
    callbacks=[
        ToggleBaseTraining(),
        SaveModelWeights(save_from_epoch=0),
    ],
)

print(
f"Training model_name={args.model_name} for epochs={args.max_epochs} with an effective_batch_size={args.effective_batch_size}"
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


Training model_name=ahsg for epochs=5 with an effective_batch_size=16


In [11]:
trainer.fit(pl_model, data)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: wandb version 0.10.21 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



  | Name  | Type      | Params
------------------------------------
0 | model | MainModel | 278 M 
------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
1,112.178 Total estimated model params size (MB)


----------------------------------------------------------------------------------------------------
SaveModelWeight Callback working.............
trainer.current_epoch: 0
saved current model weights in file: ../models/ahsg-epoch-0.pt
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
ToggleBaseTraining Callback working.............
current_epoch is: 0 and freezing BERT layer's parameters
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
SaveModelWeight Callback working.............
trainer.current_epoch: 0
saved current model weights in file: ../models/ahsg-epoch-0.pt
----------------------------------------------------------------------------------------------------
--------------------

1

In [13]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss_step,0.00045
epoch,4.0
_runtime,1737.0
_timestamp,1615246902.0
_step,3864.0
train_loss_epoch,0.06461
train_acc,0.9806
train_f1,0.96068
valid_loss,0.0838
val_acc,0.97822


0,1
train_loss_step,▇▆▇▅▆▆█▆▅▁▁▁▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
_runtime,▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss_epoch,█▂▁▁▁
train_acc,▁▇███
train_f1,▁████
valid_loss,█▁▁▁▁
val_acc,▁████
