In [1]:
# Input/output locations used by the rest of the notebook.
# Training CSV read by QuestData; it must provide a "fold" column for the train/validation split.
TRAIN_PATH = '/kaggle/input/quest-with-5gkf/train_with_GKF.csv'
# Test CSV that is scored to produce the submission.
TEST_PATH = '/kaggle/input/google-quest-challenge/test.csv'
# Submission template whose target columns are overwritten with the predictions.
SAMPSUB_PATH = '/kaggle/input/google-quest-challenge/sample_submission.csv'
# Destination of the final predictions file.
SUB_PATH = '/kaggle/working/submission.csv'
# Lightning checkpoint restored before running the test loop.
CKPT_PATH = '/kaggle/input/questnb-outputs/kaggle-infer2-epoch-04-val_spearman-0.38628863974944133-fold-0.ckpt'
# Directory passed to BertTokenizer.from_pretrained / BertModel.from_pretrained.
BERT_PATH = '/kaggle/input/questnb-outputs/'

In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from argparse import ArgumentParser

import torch
import torch.nn as nn
import transformers
import pytorch_lightning as pl
from transformers import AdamW, get_cosine_schedule_with_warmup

from scipy.stats import spearmanr

from tqdm.notebook import tqdm

In [3]:
import wandb
from pytorch_lightning.loggers import WandbLogger

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

In [4]:
# ROOT_DIR = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

class qDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (question title + body, answer) pairs.

    Args:
        args: hyper-parameter namespace; only ``args.maxlen`` is read here.
        qtitle: indexable collection of question titles.
        qbody: indexable collection of question bodies.
        answer: indexable collection of answers.
        target: optional indexable collection of per-row labels. When omitted,
            a placeholder ``0`` is used for every row.
    """

    # (key in the returned sample, key in the tokenizer output)
    _ENCODING_KEYS = (
        ("ids_seq", "input_ids"),
        ("attn_masks", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    def __init__(self, args, qtitle, qbody, answer, target=None):
        self.hparams = args
        self.qtitle, self.qbody, self.answer = qtitle, qbody, answer
        # Unlabelled data still needs a "target" entry so every batch has the same keys.
        self.target = [0] * len(self.qtitle) if target is None else target
        self.tokenizer = transformers.BertTokenizer.from_pretrained(BERT_PATH)
        self.maxlen = self.hparams.maxlen

    def __len__(self):
        """Number of rows, taken from the title collection."""
        return len(self.qtitle)

    @staticmethod
    def _squash(text):
        """Collapse every run of whitespace into one space and trim both ends."""
        return " ".join(text.split())

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of tensors.

        The first segment is "<title> <body>", the second segment is the answer;
        both are padded/truncated to ``self.maxlen`` tokens by the tokenizer.
        """
        question = self._squash(self.qtitle[idx]) + " " + self._squash(self.qbody[idx])
        reply = self._squash(self.answer[idx])

        encoded = self.tokenizer(
            question,
            reply,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.maxlen,
        )

        sample = {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in self._ENCODING_KEYS
        }
        sample["target"] = torch.tensor(self.target[idx], dtype=torch.float)
        return sample


# Names of the 30 label columns (21 question-level followed by 9 answer-level).
# The order matters: QuestData reads the training targets in this order and
# QuestModel.test_epoch_end writes the predictions back into these columns in the same order.
TARGET_COLUMNS= [
    # --- question-level targets ---
    "question_asker_intent_understanding",
    "question_body_critical",
    "question_conversational",
    "question_expect_short_answer",
    "question_fact_seeking",
    "question_has_commonly_accepted_answer",
    "question_interestingness_others",
    "question_interestingness_self",
    "question_multi_intent",
    "question_not_really_a_question",
    "question_opinion_seeking",
    "question_type_choice",
    "question_type_compare",
    "question_type_consequence",
    "question_type_definition",
    "question_type_entity",
    "question_type_instructions",
    "question_type_procedure",
    "question_type_reason_explanation",
    "question_type_spelling",
    "question_well_written",
    # --- answer-level targets ---
    "answer_helpful",
    "answer_level_of_information",
    "answer_plausible",
    "answer_relevance",
    "answer_satisfaction",
    "answer_type_instructions",
    "answer_type_procedure",
    "answer_type_reason_explanation",
    "answer_well_written",
]


class QuestData(pl.LightningDataModule):
    """Lightning data module that serves the train / validation / test loaders.

    Labelled rows are read from ``TRAIN_PATH``: rows whose ``fold`` column equals
    ``args.fold`` form the validation split, all other rows the training split.
    Unlabelled rows are read from ``TEST_PATH``. Missing values in either CSV are
    replaced with the string ``"none"``.

    Args:
        args: hyper-parameter namespace; ``fold`` and ``batch_size`` are read here
            and the whole namespace is forwarded to ``qDataset``.
        target_cols: names of the label columns to read from the training CSV.
    """

    def __init__(self, args, target_cols=TARGET_COLUMNS):
        super().__init__()
        # NOTE(review): plain attribute assignment; some pytorch_lightning releases may
        # expose ``hparams`` as a read-only property - confirm against the pinned version.
        self.hparams = args
        self.target_cols = target_cols

    def _read_labelled(self):
        """Load the labelled CSV with missing values replaced by "none"."""
        return pd.read_csv(TRAIN_PATH).fillna("none")

    def _build_loader(self, df, batch_size, shuffle, drop_last, with_targets=True):
        """Wrap the text columns (and optionally the targets) of ``df`` in a DataLoader."""
        qtitle = df.loc[:, "question_title"].values
        qbody = df.loc[:, "question_body"].values
        answer = df.loc[:, "answer"].values
        target = df.loc[:, self.target_cols].values if with_targets else None

        ds = qDataset(self.hparams, qtitle, qbody, answer, target)

        return torch.utils.data.DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=8,
            drop_last=drop_last,
        )

    def train_dataloader(self):
        """Shuffled loader over every fold except ``args.fold``."""
        df = self._read_labelled()
        df = df.loc[df["fold"] != self.hparams.fold]
        # drop_last keeps every training batch at full size.
        return self._build_loader(
            df, batch_size=self.hparams.batch_size, shuffle=True, drop_last=True
        )

    def val_dataloader(self):
        """Ordered loader over the held-out fold ``args.fold`` (double batch size)."""
        df = self._read_labelled()
        df = df.loc[df["fold"] == self.hparams.fold]
        # drop_last must be False here: dropping the final partial batch would silently
        # exclude validation rows from valid_loss / val_spearman.
        return self._build_loader(
            df, batch_size=self.hparams.batch_size * 2, shuffle=False, drop_last=False
        )

    def test_dataloader(self):
        """Ordered loader over the unlabelled test CSV; every row is kept."""
        df = pd.read_csv(TEST_PATH).fillna("none")
        return self._build_loader(
            df,
            batch_size=self.hparams.batch_size,
            shuffle=False,
            drop_last=False,
            with_targets=False,
        )

In [5]:
class QuestModel(pl.LightningModule):
    """BERT encoder followed by dropout and a linear head emitting 30 logits.

    Args:
        args: hyper-parameter namespace stored through ``save_hyperparameters``.
            This class reads ``bert_dropout``, ``bert_lr``, ``linear_lr``,
            ``batch_size``, ``accumulate_grad_batches`` and ``max_epochs`` from it.
        **kwargs: accepted but not used.
    """

    def __init__(self, args, **kwargs):
        super().__init__()
        self.save_hyperparameters(args)

        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_dropout = nn.Dropout(self.hparams.bert_dropout)
        # 768 input features -> 30 outputs (one per entry of TARGET_COLUMNS).
        self.linear = nn.Linear(768, 30)

    @staticmethod
    def loss(logits, targets):
        """Binary cross-entropy computed on raw logits."""
        return nn.BCEWithLogitsLoss()(logits, targets)

    def forward(self, ids_seq, attn_masks, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of token id sequences."""
        bert_out = self.bert(
            ids_seq, attention_mask=attn_masks, token_type_ids=token_type_ids
        )
        # bert_out[1] is BertModel's pooled sequence representation (pooler output);
        # despite the "maxpooled" naming used elsewhere, no max-pooling is applied here.
        pooled = self.bert_dropout(bert_out[1])
        return self.linear(pooled)

    def shared_step(self, batch):
        """Run the forward pass on a labelled batch and return ``(logits, loss)``."""
        ids_seq, attn_masks, token_type_ids, target = (
            batch["ids_seq"],
            batch["attn_masks"],
            batch["token_type_ids"],
            batch["target"],
        )
        logits = self(ids_seq, attn_masks, token_type_ids)
        loss = self.loss(logits, target)
        return logits, loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` (per step and per epoch) and return the loss."""
        logits, loss = self.shared_step(batch)
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=False, logger=True
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss`` per epoch and hand logits/targets to ``validation_epoch_end``."""
        logits, loss = self.shared_step(batch)
        self.log(
            "valid_loss", loss, on_step=False, on_epoch=True, prog_bar=False, logger=True
        )
        return {"valid_loss": loss, "logits": logits, "true_preds": batch["target"]}

    def test_step(self, batch, batch_idx):
        """Return the logits for an unlabelled batch (the "target" entry is ignored)."""
        ids_seq, attn_masks, token_type_ids = (
            batch["ids_seq"],
            batch["attn_masks"],
            batch["token_type_ids"],
        )

        logits = self(ids_seq, attn_masks, token_type_ids)
        return logits

    def configure_optimizers(self):
        """Build AdamW with separate BERT / head learning rates and a cosine schedule.

        Returns:
            ``([optimizer], [scheduler_config])`` where the scheduler is configured to
            be stepped once per optimizer step.
        """
        grouped_parameters = [
            {"params": self.bert.parameters(), "lr": self.hparams.bert_lr},
            {"params": self.linear.parameters(), "lr": self.hparams.linear_lr},
        ]
        optim = AdamW(grouped_parameters, lr=self.hparams.bert_lr)

        # 4863 is total number of samples in train split
        n_train_samples = 4863
        steps_per_epoch = n_train_samples // (
            self.hparams.batch_size * self.hparams.accumulate_grad_batches
        )
        num_training_steps = steps_per_epoch * self.hparams.max_epochs
        sched = get_cosine_schedule_with_warmup(
            optim, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        # num_training_steps is counted in optimizer steps, so the scheduler has to
        # advance on every optimizer step. A bare scheduler would be stepped only once
        # per epoch by Lightning, leaving the cosine decay almost untouched.
        return [optim], [{"scheduler": sched, "interval": "step", "frequency": 1}]

    def validation_epoch_end(self, validation_step_outputs):
        """Compute, print and log the mean column-wise Spearman correlation."""
        y_pred = (
            torch.sigmoid(torch.cat([out["logits"] for out in validation_step_outputs]))
            .to("cpu")
            .detach()
            .numpy()
        )
        y_true = (
            torch.cat([out["true_preds"] for out in validation_step_outputs])
            .to("cpu")
            .detach()
            .numpy()
        )

        spearman_corr = self.spearman_metric(y_true, y_pred)
        print("-"*50, f"\nval_spearman: {spearman_corr}\n", "-"*50)
        self.log("val_spearman", spearman_corr, logger=True)

    def test_epoch_end(self, test_outputs):
        """Write sigmoid-activated test predictions into the submission CSV."""
        test_outputs = torch.sigmoid(torch.cat(test_outputs)).to("cpu").detach().numpy()

        # The sample submission supplies the non-target columns and the file layout.
        submission_df = pd.read_csv(SAMPSUB_PATH)
        submission_df.loc[:, TARGET_COLUMNS] = test_outputs

        submission_df.to_csv(
            SUB_PATH,
            index=False,
        )
        print(f"predictions saved in file {SUB_PATH}")

    @staticmethod
    def spearman_metric(y_true, y_pred, return_scores=False):
        """Column-wise Spearman correlation between predictions and targets.

        Args:
            y_true: 2-D array of targets, one column per label.
            y_pred: 2-D array of predictions with the same layout.
            return_scores: when true, return the list of per-column correlations.

        Returns:
            The per-column list, or their mean with NaN entries ignored.
        """
        corr = [
            spearmanr(pred_col, target_col).correlation
            for pred_col, target_col in zip(y_pred.T, y_true.T)
        ]
        if return_scores:
            return corr
        else:
            return np.nanmean(corr)

In [6]:
# Fixed seed, set before the model and data objects are created further down.
pl.seed_everything(420)

# Hyper-parameters are declared argparse-style; in the notebook the defaults (plus the
# overrides applied in a later cell) act as the configuration.
parser = ArgumentParser()
parser.add_argument("--ckpt_folder", default="models", type=str)
parser.add_argument(
    "--output_filename", default="none", type=str, help="regex pattern or filename"
)
parser.add_argument("--fold", default=0, type=int, choices=[0, 1, 2, 3, 4])
parser.add_argument(
    "--gpus",
    default=1,
    help="if value is 0 cpu will be used, if string then that gpu device will be used",
)
parser.add_argument("--maxlen", default=512, type=int)
# Learning rates are fractional: with type=int any value supplied on the command
# line (e.g. "3e-5") was rejected by argparse, so they are parsed as floats.
parser.add_argument("--bert_lr", default=1e-5, type=float)
parser.add_argument("--linear_lr", default=5e-3, type=float)
parser.add_argument("--bert_dropout", default=0.3, type=float)
# NOTE(review): this option is not read anywhere in the visible code - confirm it is still needed.
parser.add_argument(
    "--bert_output_used",
    default="maxpooled",
    type=str,
    choices=["maxpooled", "weighted_sum"],
)
parser.add_argument("--batch_size", default=8, type=int)
parser.add_argument("--max_epochs", default=5, type=int)
parser.add_argument("--accumulate_grad_batches", default=2, type=int)
parser.add_argument("--model_name", default="quest", type=str)

# parser = pl.Trainer.add_argparse_args(parser)
# parse_known_args() ignores arguments this parser does not define (presumably the
# ones the notebook kernel is launched with), where parse_args() would abort on them.
args = parser.parse_known_args()[0]

In [7]:
# Notebook-level overrides of the parsed defaults (values being experimented with).
# On Kaggle, batch_size 16 with accumulate_grad_batches 1 uses the complete
# resources (gpu memory = 16gb).
vars(args).update(
    fold=0,
    maxlen=512,
    batch_size=16,
    max_epochs=5,
    accumulate_grad_batches=1,
    model_name="kaggle-infer3",
)

# Values derived from the overrides above.
vars(args).update(
    effective_batch_size=args.batch_size * args.accumulate_grad_batches,
    log_every_n_steps=5 * args.accumulate_grad_batches,
)

In [8]:
class ToggleBertBaseTraining(pl.Callback):
    """Callback that freezes ``pl_module.bert`` during epoch 0 and unfreezes it afterwards."""

    def on_train_epoch_start(self, trainer, pl_module):
        """Announce and apply the ``requires_grad`` state for the starting epoch."""
        bert_trainable = trainer.current_epoch != 0

        if bert_trainable:
            message = f"current_epoch is: {trainer.current_epoch} and unfreezing BERT layer's parameters for training"
        else:
            message = f"current_epoch is: {trainer.current_epoch} and freezing BERT layer's parameters"
        print(message)

        # The flag is re-applied on every epoch start, not only when it changes.
        for weight in pl_module.bert.parameters():
            weight.requires_grad = bert_trainable

In [9]:
# Build the LightningModule and the data module from the same hyper-parameter namespace.
model = QuestModel(args)
data = QuestData(args)


# The Trainer is configured from `args` as well; presumably it picks up the attributes
# that match Trainer options (e.g. gpus, max_epochs, accumulate_grad_batches) - TODO confirm.
trainer = pl.Trainer.from_argparse_args(args)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [10]:
# Training is disabled in this notebook; uncomment the lines below to fine-tune first.
# print(
#     f"Training model_name={args.model_name} on fold={args.fold} for max_epochs={args.max_epochs} with an effective batch_size of effective_batch_size={args.effective_batch_size}"
# )
# trainer.fit(model, data)

print(f"Loading model weights using pretrained weights from checkpoint file: {CKPT_PATH}")
# load_from_checkpoint is a classmethod that builds a new module from the checkpoint, so
# it is called on the class; calling it on an instance only works by accident and is
# rejected by newer pytorch_lightning releases.
model = QuestModel.load_from_checkpoint(CKPT_PATH)

print("Starting the test loop.........")
# test_epoch_end of the model writes the submission file as a side effect of this call.
trainer.test(model, data.test_dataloader())

Loading model weights using pretrained weights from checkpoint file: /kaggle/input/questnb-outputs/kaggle-infer2-epoch-04-val_spearman-0.38628863974944133-fold-0.ckpt
Starting the test loop.........


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

predictions saved in file /kaggle/working/submission.csv
--------------------------------------------------------------------------------



1

In [11]:
# Finish the Weights & Biases run, if any.
# NOTE(review): no wandb.init() / WandbLogger usage is visible in this notebook - confirm this call is still needed.
wandb.finish()

In [12]:
# Delete the wandb, lightning_logs and models directories (if present) from the current working directory.
!rm -rf wandb lightning_logs models

In [13]:
# device = torch.device("cpu")
# model.to(device)
# model.eval()

In [14]:
# outputs = []
# for batch in tqdm(data.test_dataloader()):
#     ids_seq, attn_masks, token_type_ids, target = (
#         batch["ids_seq"].to(device),
#         batch["attn_masks"].to(device),
#         batch["token_type_ids"].to(device),
#         batch["target"],
#     )

#     logits = model(ids_seq, attn_masks, token_type_ids)
#     outputs.append(logits)

# outputs = torch.sigmoid(torch.cat(outputs)).to("cpu").detach().numpy()

# submission_df = pd.read_csv(SAMPSUB_PATH)
# submission_df.loc[:, TARGET_COLUMNS] = outputs

# submission_df.to_csv(
#     SUB_PATH,
#     index=False,
# )
# print(f"predictions saved in file {SUB_PATH}")

In [15]:
# import os
# os.listdir('models')

In [16]:
# ! git clone https://github.com/arch-raven/google-quest-challenge.git

In [17]:
# import os
# os.chdir("google-quest-challenge")
# !git checkout 
# !pwd
# !ls

In [18]:
# os.listdir('src')

In [19]:
# ! python src/predict

In [20]:
# ! python src/main.py --fold 1

In [21]:
# ! python src/main.py --fold 2

In [22]:
# ! python src/main.py --fold 3

In [23]:
# ! python src/main.py --fold 4