# Project 3: Dialogue Summarization with a BERT Encoder-Decoder


##  Install Dependencies

In [1]:
!pip install -U transformers datasets evaluate accelerate sentencepiece



##  Imports & Setup

In [2]:
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    BertTokenizerFast,
    DataCollatorForSeq2Seq,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

##  Load Data

In [3]:
# Read the dialogue/summary splits from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('validation.csv')
# When no held-out test file is present, fall back to a copy of the validation split.
test_df = pd.read_csv('test.csv') if os.path.exists('test.csv') else val_df.copy()

# The preprocessing cell maps over `raw`, so wrap the frames in a DatasetDict here.
# Rows missing either text field are dropped: the tokenizer only accepts strings.
text_columns = ['dialogue', 'summary']
raw = DatasetDict({
    split: Dataset.from_pandas(frame.dropna(subset=text_columns), preserve_index=False)
    for split, frame in {'train': train_df, 'validation': val_df, 'test': test_df}.items()
})
train_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

## Tokenizer & Model

In [None]:
# Warm-start both the encoder and the decoder from the same BERT checkpoint.
base = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(base)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(base, base)

# BERT has no BOS/EOS tokens, so [CLS] starts decoding and [SEP] ends it.
special_tokens = {
    'decoder_start_token_id': tokenizer.cls_token_id,
    'eos_token_id': tokenizer.sep_token_id,
    'pad_token_id': tokenizer.pad_token_id,
}
for name, token_id in special_tokens.items():
    setattr(model.config, name, token_id)             # read when labels are shifted into decoder inputs
    setattr(model.generation_config, name, token_id)  # read by generate()

# Decoding defaults belong on the generation config, not on the model config.
model.generation_config.max_length = 64
model.generation_config.num_beams = 4

# Trailing ';' keeps the notebook from printing the full architecture repr.
model.to(device);

##  Preprocessing

In [None]:
max_input = 384   # token budget for a dialogue
max_target = 64   # token budget for a summary


def preprocess(batch):
    """Tokenize a batch of dialogue/summary pairs for seq2seq training.

    Args:
        batch: mapping with a 'dialogue' and a 'summary' entry, each a list
            of strings of equal length.

    Returns:
        The tokenizer output for the dialogues (padded/truncated to
        ``max_input``) with an added 'labels' entry: the summary token ids
        (padded/truncated to ``max_target``) in which every pad position is
        replaced by -100.
    """
    enc = tokenizer(
        batch['dialogue'],
        max_length=max_input,
        padding='max_length',
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    dec = tokenizer(
        text_target=batch['summary'],
        max_length=max_target,
        padding='max_length',
        truncation=True,
    )
    pad_id = tokenizer.pad_token_id
    # -100 marks label positions (here: padding) that the loss should skip.
    enc['labels'] = [
        [token if token != pad_id else -100 for token in seq]
        for seq in dec['input_ids']
    ]
    return enc

# Tokenize every split in batches and drop the original columns,
# leaving only what `preprocess` returns.
tok = raw.map(
    function=preprocess,
    batched=True,
    remove_columns=raw['train'].column_names,
)
tok

##  Data Collator

In [None]:
# Batches examples for the trainer; the model is passed alongside the tokenizer.
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

## Metrics (ROUGE)

In [None]:
# ROUGE scorer used by `metrics` below.
rouge = evaluate.load(path='rouge')

def metrics(pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        pred: a (predictions, labels) pair of token-id arrays. Predictions
            may also arrive as a tuple whose first item holds the token ids.

    Returns:
        dict mapping each key returned by ``rouge.compute`` to its score
        multiplied by 100 and rounded to two decimals.
    """
    preds, labels = pred
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id; swap it for the
    # pad token on both sides so the tokenizer can decode the arrays.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]
    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {name: round(value * 100, 2) for name, value in scores.items()}

##  Training Arguments

In [None]:
# `predict_with_generate` only exists on Seq2SeqTrainingArguments; the plain
# TrainingArguments class rejects it with a TypeError.
args = Seq2SeqTrainingArguments(
    output_dir='./model_out',
    eval_strategy='epoch',            # current name of the former `evaluation_strategy`
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,    # 2 x 8 = effective batch of 16 per device
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    predict_with_generate=True,       # evaluate on generated text, not logits
    generation_max_length=64,         # same decoding settings as the model cell
    generation_num_beams=4,
)

## Trainer

In [None]:
# Seq2SeqTrainer is needed for `predict_with_generate`: it calls generate()
# during evaluation so `metrics` receives token ids rather than logits.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tok['train'],
    eval_dataset=tok['validation'],
    data_collator=collator,
    processing_class=tokenizer,   # current name of the former `tokenizer=` argument
    compute_metrics=metrics,
)

trainer

## Summarization Function

In [None]:
def summarize(text, max_length=64, num_beams=4):
    """Generate a summary for ``text`` with the notebook's model.

    Args:
        text: the dialogue to summarize.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.

    Note:
        Puts the model into eval mode as a side effect.
    """
    model.eval()
    enc = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='longest',
    ).to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        out = model.generate(**enc, max_length=max_length, num_beams=num_beams)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Example:
# summarize(train_df.dialogue.iloc[0])