In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show text columns in full instead of truncating them when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [2]:
import wandb

from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key through the kaggle_secrets client so the key itself is never written in the notebook.
user_secrets = UserSecretsClient()
# NOTE(review): the secret label is spelled "wanda-api" (not "wandb-api") — presumably it matches the label stored in the Kaggle account; confirm.
my_secret = user_secrets.get_secret("wanda-api")

# Log in to W&B with that key; as the last expression of the cell, the call's return value is displayed.
wandb.login(key=my_secret)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Run configuration shared by the cells below.
MODEL = 't5-base'  # checkpoint name; loaded as f"google-t5/{MODEL}"
BATCH_SIZE = 3  # per-device batch size, used for both training and evaluation
EPOCHS = 6  # passed to TrainingArguments as num_train_epochs
OUT_DIR = 't5_base_incomplete_questions_with_keywords'  # passed to TrainingArguments as output_dir
MAX_SOURCE_LENGTH = 600  # tokenizer max_length (pad/truncate) for the input prompts
MAX_TARGET_LENGTH = 128  # tokenizer max_length (pad/truncate) for the target questions
LEARNING_RATE = 2e-4  # passed to TrainingArguments as learning_rate

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Build the checkpoint id once and load the matching tokenizer and model from it.
checkpoint = f"google-t5/{MODEL}"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# NOTE(review): the device is hard-coded, so this cell needs a CUDA-capable runtime.
model.to('cuda')

Script for Dataset Preparation: https://colab.research.google.com/drive/15tOGetOxq3dfOuLP1A9ys4DBfLBy84uq?usp=sharing

In [88]:
# Read the training and validation splits; each frame gets a fresh 0..n-1 index.
train_df = pd.read_csv(
    '/kaggle/input/incomplete-questions/final_train_set.csv'
).reset_index(drop=True)
val_df = pd.read_csv(
    '/kaggle/input/incomplete-questions/final_validation_set.csv'
).reset_index(drop=True)

# Display (rows, columns) of both splits.
train_df.shape, val_df.shape

  train_df = pd.read_csv('/kaggle/input/incomplete-questions/final_train_set.csv')


((168646, 5), (16381, 5))

In [89]:
# Keep only rows with a long answer: at least `ans_threshold` pieces when the
# answer (coerced to str) is split on single spaces.
ans_threshold = 6


def _answer_is_long(answer):
    """Return True when `answer` has at least `ans_threshold` space-separated pieces."""
    return len(str(answer).split(" ")) >= ans_threshold


train_condition = list(map(_answer_is_long, train_df['answer']))
train_df = train_df[train_condition]

val_condition = list(map(_answer_is_long, val_df['answer']))
val_df = val_df[val_condition]

# Display (rows, columns) of both splits after filtering.
train_df.shape, val_df.shape

((47698, 5), (4227, 5))

In [87]:
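# # (disabled) filter only short contexts: keep rows where word count / 0.75 (presumably a rough token estimate) is at most con_threshold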
# con_threshold = 550

# train_con_condition = [len(str(con).split(" ")) / 0.75 <= con_threshold for con in train_df['context']]
# train_df = train_df[train_con_condition]

# val_con_condition = [len(str(con).split(" ")) / 0.75 <= con_threshold for con in val_df['context']]
# val_df = val_df[val_con_condition]

# train_df.shape, val_df.shape

((37363, 5), (3238, 5))

In [90]:
# Display (rows, columns) of the training frame.
train_df.shape

(47698, 5)

In [75]:
# Shuffle the rows of both splits. frac=1 keeps every row and ignore_index=True
# renumbers them 0..n-1. The fixed seed makes the shuffled order identical on
# every run; without it each execution of this cell produced a different order.
SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, ignore_index=True, random_state=SHUFFLE_SEED)
val_df = val_df.sample(frac=1, ignore_index=True, random_state=SHUFFLE_SEED)

In [None]:
prefix = "make question:"


def preprocess_data(dataset, tokenizer, max_source_length=None, max_target_length=None):
    """Tokenize one split into the tensors used for training and evaluation.

    Every row is turned into the prompt
    ``"{prefix} answer: <answer>, key: <incomplete_question>, context: <context>"``
    and the row's ``question`` is used as the target sequence.

    Args:
        dataset: Object indexable by column name that provides the columns
            ``answer``, ``incomplete_question``, ``context`` and ``question``.
        tokenizer: Callable tokenizer that accepts ``text=`` / ``text_target=``
            and exposes ``pad_token_id``.
        max_source_length: Pad/truncate length of the prompts. ``None`` (default)
            uses the notebook-level ``MAX_SOURCE_LENGTH``.
        max_target_length: Pad/truncate length of the targets. ``None`` (default)
            uses the notebook-level ``MAX_TARGET_LENGTH``.

    Returns:
        dict with the tensors ``input_ids``, ``attention_mask``, ``labels`` and
        ``decoder_attention_mask``. Positions of ``labels`` that hold the pad
        token are set to -100.
    """
    if max_source_length is None:
        max_source_length = MAX_SOURCE_LENGTH
    if max_target_length is None:
        max_target_length = MAX_TARGET_LENGTH

    prompts = [
        f"{prefix} answer: {answer}, key: {key}, context: {context}"
        for answer, key, context in zip(
            dataset['answer'], dataset['incomplete_question'], dataset['context']
        )
    ]
    desired_output = list(dataset['question'])

    inputs = tokenizer(
        text=prompts,
        max_length=max_source_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    targets = tokenizer(
        text_target=desired_output,
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Targets are padded to a fixed length. Left as pad-token ids, those padding
    # positions would be scored by the loss like real tokens; -100 is the value
    # the loss ignores. Work on a copy so the tokenizer output stays untouched.
    label_ids = targets['input_ids'].clone()
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': label_ids,
        'decoder_attention_mask': targets['attention_mask']
    }

In [None]:
# Tokenize both splits up front; each call returns a dict of tensors (input_ids, attention_mask, labels, decoder_attention_mask).
train_set = preprocess_data(train_df, tokenizer)
val_set = preprocess_data(val_df, tokenizer)

In [None]:
import datasets
# Wrap the tokenized dicts as `datasets.Dataset` objects; these are what the Trainer receives below.
train_dataset = datasets.Dataset.from_dict(train_set)
val_dataset = datasets.Dataset.from_dict(val_set)

In [None]:
# Display the shape of each tokenized dataset as a sanity check.
train_dataset.shape, val_dataset.shape

# Set up Training

In [None]:
import torch

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids before they are accumulated.

    Keeping only the argmax ids instead of the full logits is a workaround to
    avoid storing large tensors that the metric does not need.

    Args:
        logits: Either the logits tensor itself, or a tuple/list of model
            outputs whose first element is the logits tensor.
        labels: Unused; present because the Trainer hook passes it.

    Returns:
        Tensor of token ids: the argmax over the last dimension of the logits.
    """
    # Unwrap only real containers. Indexing a bare tensor with [0] would silently
    # keep just the first example of the batch.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    return torch.argmax(logits, dim=-1)

In [None]:
from nltk.translate import bleu_score

def compute_metrics(eval_pred): # eval_preds: tuple(preds, labels)
    """Decode predictions and labels and score them with corpus-level BLEU.

    Args:
        eval_pred: Pair ``(pred_ids, labels_ids)`` of token-id arrays. The
            predictions are already token ids because
            ``preprocess_logits_for_metrics`` is registered on the Trainer.

    Returns:
        dict with the single key ``"bleu"``.

    NOTE(review): the predicted ids look like an argmax over logits from a
    forward pass that also received the gold labels, so this BLEU is probably
    not comparable to BLEU of free-running generation — confirm before
    reporting it.
    """
    pred_ids, labels_ids = eval_pred

    # -100 marks ignored positions and is not a vocabulary id, so it has to become
    # the pad token before decoding. Copy first so the caller's arrays are not
    # modified in place.
    pad_id = tokenizer.pad_token_id
    pred_ids = pred_ids.copy()
    pred_ids[pred_ids == -100] = pad_id
    labels_ids = labels_ids.copy()
    labels_ids[labels_ids == -100] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # corpus_bleu works on token sequences. Given raw strings it iterates over
    # characters and reports character n-gram overlap, so split into words first.
    references = [[label.split()] for label in label_str]
    hypotheses = [pred.split() for pred in pred_str]

    bleu = bleu_score.corpus_bleu(list_of_references=references, hypotheses=hypotheses)
    return {"bleu": bleu}

In [None]:
from transformers import TrainingArguments, Trainer

# Optimisation, evaluation and checkpointing settings handed to the Trainer.
args = TrainingArguments(
    # Output location.
    output_dir=OUT_DIR,
    # Optimisation schedule.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes (same value for training and evaluation).
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint on the same 1000-step cadence, keeping at most 3 checkpoints.
    # NOTE(review): load_best_model_at_end presumably needs these two schedules to stay aligned — confirm before changing either.
    evaluation_strategy='steps',
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=3,
    # Best checkpoint = lowest eval_loss; it is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# Assemble the Trainer from the model, its settings, the two datasets and the metric hooks.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Logits are reduced to token ids by the first hook, then scored by the second.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the settings held in `args`.
trainer.train()

# Eval