# 1 set up basics

In [None]:
!pip install torch
!pip install transformers datasets accelerate peft sentencepiece

In [2]:
import pandas as pd

df=pd.read_csv('All_capped_keywords.csv')  
# len(df) there are 91k rows

In [3]:
subset = df.loc[91000:91900, ["abstract", "title"]] # take an untouched subset for experimentation
subset.to_csv("abstract_title.csv", index=False)

# Fine-Tune

In [4]:
import os, random
import pandas as pd
import torch
from datasets import Dataset
from transformers import (T5ForConditionalGeneration, T5TokenizerFast,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)


# Load data
df = pd.read_csv("abstract_title.csv")  
df = df.dropna(subset=["abstract","title"]).sample(frac=1.0, random_state=42).reset_index(drop=True)

# Small dev split
n = len(df)
split = int(0.9 * n) # 90% train, 10% eval
train_df = df.iloc[:split] # first 90%
eval_df  = df.iloc[split:] # last 10%

train_ds = Dataset.from_pandas(train_df) 
eval_ds  = Dataset.from_pandas(eval_df)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Model & tokenizer
model_name = "t5-small" # small model for lesser compute power
tok = T5TokenizerFast.from_pretrained(model_name) # tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_name) # model
model.to("mps")  # because I am running on apple silicon

# for task framing
PREFIX = "summarize Abstract: "

# shortening for compute
max_src_len = 512   
max_tgt_len = 48    

def preprocess(batch): # tokenize the inputs and labels
    inputs = [PREFIX + a for a in batch["abstract"]]  
    targets = batch["title"] 
    model_inputs = tok(inputs, max_length=max_src_len, truncation=True) # tokenize inputs
    with tok.as_target_tokenizer(): # tokenize targets
        labels = tok(targets, max_length=max_tgt_len, truncation=True)
    model_inputs["labels"] = labels["input_ids"] # set as labels
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
eval_tok  = eval_ds.map(preprocess, batched=True, remove_columns=eval_ds.column_names)

data_collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model) 

Map: 100%|██████████| 806/806 [00:00<00:00, 6884.06 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 6019.67 examples/s]


In [6]:
# Training args
args = TrainingArguments(
    output_dir="t5_title_ft", 
    num_train_epochs=3,         
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch approx 16
    learning_rate= 3e-4,
    weight_decay=0.01,
    logging_steps=50,  
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
    tokenizer=tok,
)

trainer.train()
trainer.save_model("t5_title_ft/final")
tok.save_pretrained("t5_title_ft/final")

  trainer = Trainer(


Step,Training Loss
50,2.8906
100,2.3659
150,2.1971


('t5_title_ft/final/tokenizer_config.json',
 't5_title_ft/final/special_tokens_map.json',
 't5_title_ft/final/spiece.model',
 't5_title_ft/final/added_tokens.json',
 't5_title_ft/final/tokenizer.json')

In [7]:
# Inference: generate titles for new abstracts
def generate_title(abstract: str, max_new_tokens=32, num_beams=4):
    inp = tok(PREFIX + abstract, return_tensors="pt", truncation=True, max_length=max_src_len) 
    inp = {k:v.to("mps") for k,v in inp.items()} 
    with torch.no_grad():
        out = model.generate(**inp, max_new_tokens=max_new_tokens, num_beams=num_beams, length_penalty=0.8)
    return tok.decode(out[0], skip_special_tokens=True)

# demo
for i in range(min(5, len(eval_df))):
    abs_ = eval_df.iloc[i]["abstract"]
    ref  = eval_df.iloc[i]["title"]
    pred = generate_title(abs_)
    print(f"\nAbstract: {abs_[:200]}...")
    print(f"True Title:  {ref}")
    print(f"Prediction Title: {pred}")


Abstract: Automatic taxonomy construction aims to build a categorization system without human efforts. Traditional textual pattern based methods extract hyponymy relation in raw texts. However, these methods us...
True Title:  Coarse to Fine: Diffusing Categories in Wikipedia
Prediction Title: Using diffusing Attributes from Wikipedia Infoboxes

Abstract: This paper looks into the use of Information and Communication Technology (ICT) for Smart Sustainable Cities (SSC). It specifically points towards ICT's potential to help cities mitigate climate chang...
True Title:  Reflections Regarding ICT and a Citizen-centric Future Path of Smart Sustainable Cities: AW4City 2018 Keynote
Prediction Title: ICT for Smart Sustainable Cities

Abstract: Ranking algorithms play a crucial role in online platforms ranging from search engines to recommender systems. In this paper, we identify a surprising consequence of popularity-based rankings: the few...
True Title:  The few-get-richer: a surprising co