In [1]:
import pandas as pd
from datasets import Dataset
import numpy as np
from transformers import Trainer, TrainingArguments
import nltk
from datasets import load_metric
from transformers import BartTokenizer, BartForConditionalGeneration
import evaluate

# Load your dataset
df = pd.read_excel('Summary_training.xlsx')
df.columns = ['notes', 'summary']

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']


In [2]:
#Check GPU Availability
import torch

# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("Using CPU")

Using GPU: NVIDIA RTX A1000 6GB Laptop GPU


In [3]:
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

406,290,432 total parameters.
406,290,432 training parameters.


In [4]:
def preprocess_data(tokenizer, examples):
    inputs = [doc for doc in examples['notes']]
    model_inputs = tokenizer(
        inputs,
        max_length=1024,
        truncation=True,
        padding='max_length'
    )
    
    targets = [summary for summary in examples['summary']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=1024,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply the function to the whole dataset
train_dataset = train_dataset.map(lambda x: preprocess_data(tokenizer,x), batched=True)
val_dataset = val_dataset.map(lambda x: preprocess_data(tokenizer,x), batched=True)

Map:   0%|          | 0/29 [00:00<?, ? examples/s]



Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [5]:
rouge = evaluate.load("rouge")

In [6]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        use_stemmer=True, 
        rouge_types=[
            'rouge1', 
            'rouge2', 
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [7]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,    
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=5,
    save_strategy='steps',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=2e-5,
    fp16=True if torch.cuda.is_available() else False,
    load_best_model_at_end=True
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

trainer.train()
model.save_pretrained('./fine_tuned_bart')
tokenizer.save_pretrained('./fine_tuned_bart')


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
5,No log,11.163314,0.2299,0.109,0.1925,1024.0
10,11.041400,10.243776,0.1943,0.1091,0.1759,1024.0
15,11.041400,9.763458,0.4147,0.2279,0.3831,1024.0
20,9.458500,8.691725,0.4721,0.2717,0.4358,1024.0
25,9.458500,7.145017,0.3198,0.1923,0.2973,1024.0
30,7.007300,5.682806,0.206,0.1244,0.1814,845.5
35,7.007300,2.999268,0.4843,0.258,0.4385,289.75
40,3.476200,1.685207,0.6015,0.334,0.5505,66.75
45,3.476200,1.041592,0.6378,0.3339,0.569,59.75
50,1.459900,0.70601,0.6415,0.3356,0.5688,57.0


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine_tuned_bart\\tokenizer_config.json',
 './fine_tuned_bart\\special_tokens_map.json',
 './fine_tuned_bart\\vocab.json',
 './fine_tuned_bart\\merges.txt',
 './fine_tuned_bart\\added_tokens.json')

In [10]:
# Evaluate the models
eval_results_bart = trainer.evaluate(eval_dataset=val_dataset)
print("BART ROUGE scores:", eval_results_bart)


BART ROUGE scores: {'eval_loss': 0.08623351901769638, 'eval_rouge1': 0.7527, 'eval_rouge2': 0.5789, 'eval_rougeL': 0.7265, 'eval_gen_len': 55.5, 'eval_runtime': 4.3203, 'eval_samples_per_second': 0.926, 'eval_steps_per_second': 0.926, 'epoch': 5.0}


### ok lets test now

In [18]:
from transformers import pipeline

# Load base BART model and tokenizer using pipeline
base_bart = pipeline("summarization", model="facebook/bart-large-cnn")

# Load fine-tuned BART model and tokenizer using pipeline
fine_tuned_bart = pipeline("summarization", model="./fine_tuned_bart")

# Load tokenizers for preprocessing
base_bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
fine_bart_tokenizer = BartTokenizer.from_pretrained("./fine_tuned_bart")

# Example input text (use one of the notes from your dataset)
input_text = df['notes'][0]


base_bart_inputs = base_bart_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True, padding='max_length')
fine_bart_inputs = fine_bart_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True, padding='max_length')

# Generate summaries using base and fine-tuned BART models
base_bart_summary = base_bart_tokenizer.decode(base_bart.model.generate(base_bart_inputs['input_ids'], max_length=250, min_length=40, length_penalty=2, num_beams=6)[0], skip_special_tokens=True)
fine_bart_summary = fine_bart_tokenizer.decode(fine_tuned_bart.model.generate(fine_bart_inputs['input_ids'], max_length=250, min_length=40, length_penalty=2, num_beams=6)[0], skip_special_tokens=True)

#base_bart_summary = base_bart(input_text, max_length=250, min_length=40, length_penalty=2, num_beams=6)[0]['summary_text']
#fine_bart_summary = fine_tuned_bart(input_text, max_length=250, min_length=40, length_penalty=2, num_beams=6)[0]['summary_text']

print("Input Text:")
print(input_text)
print("\nBase BART Summary:")
print(base_bart_summary)
print("\nFine-Tuned BART Summary:")
print(fine_bart_summary)


Input Text:
Update On:1st Jan 2023, Per update from John Smith (AP lead ABC Corp), payment is expected in monthly installments of 100k USD for all current open invoices starting Feb.Update On:5th Feb 2023, First installment of payment is received and applied against oldest invoices. Further payment is awaited.Update On:3rd Mar 2023, Connected with John to get update on payment. The next installment is expected in a week as per update from John.Update On:13th Mar 2023, Next installment of 100k received and applied.Update On:28th Mar 2023, Received email from John regarding inability to make April payment. Further updates on next installment is awaited.Update On:15th Apr 2023, As of 14th April, 20 open invoices are awaiting payment total 500k. Made contact with Jim (EP) and Jim will be following up directly with client.Update On:28th Apr 2023, Lump sum settlment has been made and applied to all open AR.

Base BART Summary:
Payment is expected in monthly installments of 100k USD for all c

In [19]:
# Example input text (use one of the notes from your dataset)
input_text = df['notes'][1]


base_bart_inputs = base_bart_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True, padding='max_length')
fine_bart_inputs = fine_bart_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True, padding='max_length')

# Generate summaries using base and fine-tuned BART models
base_bart_summary = base_bart_tokenizer.decode(base_bart.model.generate(base_bart_inputs['input_ids'], max_length=250, min_length=40, length_penalty=2, num_beams=6)[0], skip_special_tokens=True)
fine_bart_summary = fine_bart_tokenizer.decode(fine_tuned_bart.model.generate(fine_bart_inputs['input_ids'], max_length=250, min_length=40, length_penalty=2, num_beams=6)[0], skip_special_tokens=True)

#base_bart_summary = base_bart(input_text, max_length=250, min_length=40, length_penalty=2, num_beams=6)[0]['summary_text']
#fine_bart_summary = fine_tuned_bart(input_text, max_length=250, min_length=40, length_penalty=2, num_beams=6)[0]['summary_text']

print("Input Text:")
print(input_text)
print("\nBase BART Summary:")
print(base_bart_summary)
print("\nFine-Tuned BART Summary:")
print(fine_bart_summary)


Input Text:
Update On:20th Jan 2024, Sent initial follow-up for payments for Invoices AC123 and AC345.Update On:23rd Jan 2024, Got response from XYZ Billing team asking for clarification on $500 sales tax amount on AC123. Forwarded correspondance to Billing Team to get breakdown.Update On:25th Jan 2024, Mike from the engagement billing team has provided breakdown and the same has been forwarded to XYZ Billing team.Update On:1st Feb 2024, XYZ Billing team has sought a call to further discuss the sales tax discrepancy and get confirmation on similar amounts in other open AR. Meeting has been setup for later this week.Update On:8th Feb 2024, Follow-up meeting will be setup for 3rd week of Feb once the provided data has been analyzed by XYZ team.Update On:21st Feb 2024, XYZ would like to dispute the sales tax and review all open services with our firm. Per discussion with Will (EP), no further correspondance will be conducted with XYZ by the collections team.Update On:28th Feb 2024, All op