In [None]:
import pandas as pd

import nltk
import bitsandbytes
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
from datasets import load_dataset
import datasets
from transformers import AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import numpy as np
import evaluate
import argparse
import torch
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight
from huggingface_hub import notebook_login

notebook_login()

In [62]:
dataset = load_dataset("webis/tldr-17", split="train")

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

In [16]:
dataset = dataset.train_test_split(test_size=0.05)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [19]:
random_train = dataset["train"].shuffle(seed=42).select(range(57000))
random_test = dataset["test"].shuffle(seed=42).select(range(3000))

In [20]:
def tokenize_function(examples):
        model_inputs = tokenizer(examples["content"], padding = "max_length", max_length = 1024, truncation=True)
        summary = tokenizer(examples["summary"], padding = "max_length", max_length=128, truncation=True)
        model_inputs["labels"] = summary["input_ids"]
        return model_inputs

In [46]:
tokenized_train = random_train.map(tokenize_function, batched=True)
tokenized_test = random_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/57000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [39]:

train_dataset = tokenized_train
eval_dataset = tokenized_test

In [23]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [44]:
batch_size = 8
num_train_epochs = 3

logging_steps = len(train_dataset) // batch_size
args = Seq2SeqTrainingArguments(
    output_dir=f"model-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
    fp16 = True,
    optim="adamw_bnb_8bit"
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    print(len(decoded_preds))
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    print(len(decoded_labels))

    rouge_score = evaluate.load("rouge")
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    print(result)

    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.8208,0.776124,18.3902,4.6113,12.8442,15.9072
2,0.6505,0.769532,19.1849,4.752,13.3464,16.3018
3,0.505,0.810305,19.6531,4.8361,13.4817,16.6192


3000
3000
{'rouge1': 0.18390176782902212, 'rouge2': 0.04611326010842858, 'rougeL': 0.12844173074895218, 'rougeLsum': 0.15907184861506635}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


3000
3000
{'rouge1': 0.1918488203521185, 'rouge2': 0.0475196026742221, 'rougeL': 0.133464237662845, 'rougeLsum': 0.16301767691545285}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


3000
3000
{'rouge1': 0.1965313532970688, 'rouge2': 0.048361289818950645, 'rougeL': 0.13481723150904343, 'rougeLsum': 0.16619222030081854}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=21375, training_loss=0.6587483438413743, metrics={'train_runtime': 7896.6342, 'train_samples_per_second': 21.655, 'train_steps_per_second': 2.707, 'total_flos': 3.70574886961152e+17, 'train_loss': 0.6587483438413743, 'epoch': 3.0})

KeyboardInterrupt: 

In [50]:
prefix = "summarize: "

def tokenize_function(examples):
        inputs = [prefix + doc for doc in examples["content"]]
        model_inputs = tokenizer(inputs, padding = "max_length", max_length = 1024, truncation=True)
        summary = tokenizer(examples["summary"], padding = "max_length", max_length=128, truncation=True)
        model_inputs["labels"] = summary["input_ids"]
        return model_inputs
tokenized_train2 = random_train.map(tokenize_function, batched=True)
tokenized_test2 = random_test.map(tokenize_function, batched=True)
train_dataset2 = tokenized_train2
eval_dataset2 = tokenized_test2


In [48]:
model2 = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [51]:
trainer2 = Seq2SeqTrainer(
    model2,
    args,
    train_dataset=train_dataset2,
    eval_dataset=eval_dataset2,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [52]:
trainer2.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.8429,0.782388,18.4715,4.6313,12.8681,15.9758
2,0.6513,0.780022,19.2755,4.833,13.4657,16.4393
3,0.503,0.814108,19.604,4.8762,13.5315,16.5944


3000
3000
{'rouge1': 0.18471521175154731, 'rouge2': 0.04631281791468153, 'rougeL': 0.1286806585940681, 'rougeLsum': 0.15975771420342438}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


3000
3000
{'rouge1': 0.19275484327180303, 'rouge2': 0.04833010766598442, 'rougeL': 0.13465747684714113, 'rougeLsum': 0.16439328759059743}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


3000
3000
{'rouge1': 0.196039686238137, 'rouge2': 0.04876182366111146, 'rougeL': 0.13531530441942363, 'rougeLsum': 0.16594364695668987}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=21375, training_loss=0.6657534608004386, metrics={'train_runtime': 7793.0011, 'train_samples_per_second': 21.943, 'train_steps_per_second': 2.743, 'total_flos': 3.70574886961152e+17, 'train_loss': 0.6657534608004386, 'epoch': 3.0})

In [56]:
trainer2.save_model("model-finetuned/model")

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [57]:
model3 = AutoModelForSeq2SeqLM.from_pretrained("model-finetuned/model")

In [58]:
args2 = Seq2SeqTrainingArguments(
    output_dir=f"model-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
    fp16 = True,
    optim="adamw_bnb_8bit"
)
trainer3 = Seq2SeqTrainer(
    model3,
    args2,
    train_dataset=train_dataset2,
    eval_dataset=eval_dataset2,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [60]:
trainer3.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5293,0.855843,19.2296,4.5876,13.2364,16.2947
2,0.3776,0.926419,19.1121,4.4726,13.085,15.9981


3000
3000
{'rouge1': 0.1922958398402801, 'rouge2': 0.04587564051326931, 'rougeL': 0.13236371328938676, 'rougeLsum': 0.1629465546703732}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


3000
3000
{'rouge1': 0.19112113906984807, 'rouge2': 0.04472595201899306, 'rougeL': 0.13085040117710223, 'rougeLsum': 0.15998112413427662}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=14250, training_loss=0.4534212410910088, metrics={'train_runtime': 5240.5962, 'train_samples_per_second': 21.753, 'train_steps_per_second': 2.719, 'total_flos': 2.640053510476923e+17, 'train_loss': 0.4534212410910088, 'epoch': 2.0})

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [66]:
df = pd.read_csv("drive/MyDrive/Data/all_cleaned_summarization_data.csv")

In [100]:
email = df.loc[15, "text"]
email = "Summarize: " + email
print(email)



In [70]:
tokenizer = AutoTokenizer.from_pretrained("model-finetuned/checkpoint-21375")
model = AutoModelForSeq2SeqLM.from_pretrained("model-finetuned/checkpoint-21375")

In [101]:
input = tokenizer(email, padding="max_length", truncation=True, max_length = 1024, return_tensors="pt")

In [102]:
outputs = model.generate(**input)

generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [103]:
print(email)
print(generated_summaries)

['amazon.com, \n \n [USER]    your package has arrived!   it was great [LINK] \n[SEP] [USER>   track your package:   [SCREENSHOTS](   you can order again from: \n order-update@amazon.com  \n <SEP> [USER] [SEP]" false']


Summarize: hi [USER],    i’m checking in on my previous email to you about what is new.    can i set up a call for you?    thank you and i hope you have a wonderful morning,    jessica gold  patron experience assistant  92nd street y  1395 lexington avenue  new york, ny 10128  92ny.org      unsubscribe   [SEP] jessica gold [SEP] jgold@cmail.92y.org [SEP] <USER> [SEP] Updates [SEP] False


TypeError: argument 'ids': Can't extract `str` to `Vec`