In [None]:
!pip install rouge-score

In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from rouge_score import rouge_scorer
# NOTE(review): `im` is not referenced in the visible cells - likely an accidental auto-import; confirm and remove.
from sympy import im
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
)




In [2]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
MODEL_NAME = "NTUYG/SOTitle-Gen-T5"

# Fall back to CPU when no GPU is visible instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, legacy=False)
SOTitle_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Last expression of the cell: moves the model and displays its module structure.
SOTitle_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [3]:
# Read the training and validation CSV files and wrap each one in a `datasets.Dataset`.
input_path = './Dataset/'
train_csv = input_path + "augmented_train_data.csv"
valid_csv = input_path + "processed_valid_data.csv"

# Training split
print("Loading data from augmented train file")
df_train = pd.read_csv(train_csv)
dataset_train = Dataset.from_pandas(df_train)

# Validation split
print("Loading data from: valid file")
df_valid = pd.read_csv(valid_csv)
dataset_valid = Dataset.from_pandas(df_valid)

Loading data from augmented train file
Loading data from: valid file


In [4]:
# Run configuration shared by the cells below.

# Directory handed to the training arguments as `output_dir`.
output_dir = "/kaggle/working/"

# Number of passes over the training set.
num_epochs = 3

# Maximum token length used when tokenizing target titles.
target_max_len = 64

In [11]:
def compute_metrics(eval_preds):
    """Compute mean ROUGE-1 / ROUGE-2 / ROUGE-L F1 between predictions and labels.

    Args:
        eval_preds: An object that unpacks into ``(predictions, label_ids)``.
            ``predictions`` may be an array of token ids, an array of logits
            (3-D, reduced here with ``argmax`` over the last axis), or a tuple
            whose first element is one of those.

    Returns:
        dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``, each the
        F-measure averaged over all examples (``0.0`` for every key when there
        are no examples).
    """
    metric_names = ("rouge1", "rouge2", "rougeL")

    preds, labels = eval_preds
    # Some trainer configurations hand back a tuple of model outputs; the
    # first element carries the predictions.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Logits of shape (batch, seq, vocab) -> token ids.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # Negative ids (e.g. the -100 ignore index) are not valid vocabulary
    # entries and cannot be decoded; map them to the pad id, which
    # skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds < 0, pad_id, preds)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # Guard against an empty evaluation set (would otherwise divide by zero).
    if not decoded_preds or not decoded_labels:
        return {name: 0.0 for name in metric_names}

    # No sentence re-splitting is done: only rougeLsum treats newlines as
    # sentence boundaries, and it is not among the requested metrics.
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    # RougeScorer.score takes (target, prediction) in that order.
    scores = [
        scorer.score(reference, prediction)
        for prediction, reference in zip(decoded_preds, decoded_labels)
    ]

    return {
        name: sum(score[name].fmeasure for score in scores) / len(scores)
        for name in metric_names
    }

In [5]:
def tokenize_valid(example, input_max_len=512):
    """Tokenize a validation example (or batch) into model inputs and labels.

    Reads the ``"input"`` column as the source text and the ``"title"`` column
    as the target. Both are truncated and padded to a fixed length.

    Padding positions in the labels are replaced with ``-100`` so that they
    are ignored by the loss. Any code that decodes these labels must first map
    ``-100`` back to the pad token id.

    Args:
        example: Mapping with ``"input"`` and ``"title"`` entries; each is
            either a single string or a list of strings (batched ``map``).
        input_max_len: Fixed token length for the source text.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        entry holding the masked target token ids.
    """
    model_input = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=input_max_len
    )
    labels = tokenizer(
        example["title"],
        truncation=True,
        padding="max_length",
        max_length=target_max_len
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # Replace pad ids with the ignore index used by the loss.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_input["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        model_input["labels"] = _mask_padding(label_ids)
    return model_input


In [8]:
def tokenize_train(example, input_max_len=512):
    """Tokenize a training example (or batch) into model inputs and labels.

    Reads the ``"input"`` column as the source text and the ``"good_title"``
    column as the target. Both are truncated and padded to a fixed length.

    Padding positions in the labels are replaced with ``-100`` so that they
    are ignored by the loss. Any code that decodes these labels must first map
    ``-100`` back to the pad token id.

    Args:
        example: Mapping with ``"input"`` and ``"good_title"`` entries; each
            is either a single string or a list of strings (batched ``map``).
        input_max_len: Fixed token length for the source text.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        entry holding the masked target token ids.
    """
    model_input = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=input_max_len
    )
    labels = tokenizer(
        example["good_title"],
        truncation=True,
        padding="max_length",
        max_length=target_max_len
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # Replace pad ids with the ignore index used by the loss.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_input["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        model_input["labels"] = _mask_padding(label_ids)
    return model_input


In [9]:
# Tokenize both splits and remove the original columns, keeping only the
# tokenizer outputs and labels. Batched mapping passes lists of examples to
# the tokenize functions instead of calling them once per row.
tokenized_dataset_train = dataset_train.map(
    tokenize_train,
    batched=True,
    remove_columns=dataset_train.column_names,
)

tokenized_dataset_valid = dataset_valid.map(
    tokenize_valid,
    batched=True,
    remove_columns=dataset_valid.column_names,
)

Map:   0%|          | 0/239996 [00:00<?, ? examples/s]

Map:   0%|          | 0/24294 [00:00<?, ? examples/s]

In [12]:
# Training configuration. Checkpoints are written every `save_steps` steps and
# at most `save_total_limit` are kept. Evaluation during training is disabled
# (eval_strategy="no"), so compute_metrics only runs on an explicit
# trainer.evaluate() / trainer.predict() call.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=14,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    # Keep generated titles to the same length budget used for the labels.
    generation_max_length=target_max_len,
    save_strategy="steps",
    eval_strategy="no",
    save_total_limit=2,
    save_steps=3500,
    logging_steps=100,
    predict_with_generate=True,
    # Mixed precision only when a GPU is available.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    report_to="none",
)

# Seq2SeqTrainer is the trainer that acts on `predict_with_generate` and
# `generation_max_length`; the base Trainer does not generate during
# evaluation. `processing_class` replaces the deprecated `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=SOTitle_model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_valid,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



  trainer = Trainer(


In [13]:
# Run fine-tuning with the trainer configured above.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# so the saved notebook does not show a completed training run.
trainer.train()

KeyboardInterrupt: 

In [None]:
# Save the final model in a "final_model" folder under the configured output
# directory instead of repeating the absolute path.
final_model_dir = output_dir.rstrip("/") + "/final_model"
trainer.save_model(final_model_dir)