In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration,BartTokenizer, BartForConditionalGeneration
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from transformers import pipeline
import pandas as pd

In [2]:
# Load the Excel file
def load_file(filename):
    """Read a notes workbook into a DataFrame with standardised columns.

    The workbook is read with the openpyxl engine and its columns are
    renamed positionally to 'Client ID', 'Note Date' and 'Note Comment',
    so the sheet must contain exactly three columns (pandas raises
    ValueError otherwise). 'Note Date' is converted to datetimes; values
    that cannot be parsed become NaT instead of raising.

    Parameters
    ----------
    filename : path of the Excel file to read.

    Returns
    -------
    pandas.DataFrame with the three renamed columns.
    """
    column_names = ['Client ID', 'Note Date', 'Note Comment']
    notes = pd.read_excel(filename, engine='openpyxl')
    notes.columns = column_names
    # errors='coerce' turns unparseable dates into NaT rather than failing the load.
    parsed_dates = pd.to_datetime(notes['Note Date'], errors='coerce')
    notes['Note Date'] = parsed_dates
    return notes

In [3]:
def add_ordinal_indicator(date):
    """Format a note date as an 'Update On:' prefix with an ordinal day.

    Parameters
    ----------
    date : a datetime-like value exposing ``.day`` and ``.strftime``, or a
        missing value (NaT/NaN/None).

    Returns
    -------
    str
        ``"Update On:<day><suffix> <Mon> <YYYY>, "`` for a valid date, e.g.
        ``"Update On:1st Jan 2023, "``. For a missing date the bare marker
        ``"Invalid Date"`` is returned (note: no trailing separator).
    """
    if pd.isna(date):
        return "Invalid Date"

    day_of_month = date.day
    # 4-20 and 24-30 all take "th"; the remaining days (1-3, 21-23, 31)
    # pick their suffix from the last digit.
    takes_th = (4 <= day_of_month <= 20) or (24 <= day_of_month <= 30)
    ordinal = "th" if takes_th else ("st", "nd", "rd")[day_of_month % 10 - 1]
    month_year = date.strftime('%b %Y')
    return f"Update On:{day_of_month}{ordinal} {month_year}, "

In [4]:
## some pre-processing
def preprocessing(filename):
    """Build one concatenated, date-stamped note string per client.

    The workbook is loaded with ``load_file`` and sorted by client and then
    by note date (ascending). Each usable note is turned into
    ``add_ordinal_indicator(date) + comment``, terminated with a full stop
    if it does not already end in one, and appended to that client's string.

    Rows whose comment is missing or blank are skipped. An empty Excel cell
    arrives from pandas as NaN rather than None, so the check uses
    ``pd.isna``; non-string comments (e.g. numeric cells) are converted with
    ``str`` before stripping.

    Parameters
    ----------
    filename : path of the Excel workbook to read.

    Returns
    -------
    dict
        Maps each client ID to its concatenated notes. A client whose notes
        were all skipped is still present, with an empty string.
    """
    Client_Notes = {}
    df = load_file(filename)
    df = df.sort_values(by=['Client ID', 'Note Date'], ascending=[True, True])

    for client, raw_date, raw_comment in zip(df['Client ID'], df['Note Date'], df['Note Comment']):
        # Register the client on first sight, even if this row ends up skipped.
        Client_Notes.setdefault(client, '')

        # Empty notes caused summarising issues, so drop missing cells ...
        if pd.isna(raw_comment):
            continue
        comment = str(raw_comment).strip()
        # ... and cells that contain only whitespace.
        if not comment:
            continue

        # Prepend the formatted date and make sure the note ends a sentence.
        # (The result always starts with the date prefix, so no leading-'.'
        # handling is needed here.)
        note_with_date = add_ordinal_indicator(raw_date) + comment
        if not note_with_date.endswith('.'):
            note_with_date += '.'

        Client_Notes[client] += note_with_date
    return Client_Notes

In [5]:
# Summarise every client's notes with the base and the fine-tuned BART pipelines
def BART(Client_Notes):
    """Summarise each client's notes with base and fine-tuned BART.

    Two Hugging Face summarization pipelines are created on every call:
    ``facebook/bart-large-cnn`` and the local ``./fine_tuned_bart`` model.
    Each pipeline carries its own tokenizer, so none is loaded separately.

    Inputs are passed with ``truncation=True`` so that notes longer than the
    model's input window are clipped instead of being fed through at full
    length (the 1024-token limit is the one the earlier manual-tokenizer
    version of this code used).

    Parameters
    ----------
    Client_Notes : dict mapping client ID to that client's concatenated
        note text (str).

    Returns
    -------
    pandas.DataFrame
        One row per client with columns 'Client ID', 'Note',
        'BART Base Summary' and 'BART Tuned Summary'. Clients whose note
        text is empty or blank get empty summaries without calling the
        models.
    """
    base_bart = pipeline("summarization", model="facebook/bart-large-cnn")
    fine_tuned_bart = pipeline("summarization", model="./fine_tuned_bart")

    # Shared generation settings for both models.
    generation_kwargs = dict(
        max_length=250,
        min_length=40,
        length_penalty=2,
        num_beams=6,
        truncation=True,
    )
    result_columns = ['Client ID', 'Note', 'BART Base Summary', 'BART Tuned Summary']

    summaries = []
    for client, notes in Client_Notes.items():
        if notes and notes.strip():
            base_bart_summary = base_bart(notes, **generation_kwargs)[0]['summary_text']
            fine_bart_summary = fine_tuned_bart(notes, **generation_kwargs)[0]['summary_text']
        else:
            # Nothing to summarise; avoid sending an empty prompt to the models.
            base_bart_summary = ''
            fine_bart_summary = ''

        summaries.append({
            'Client ID': client,
            'Note': notes,
            'BART Base Summary': base_bart_summary,
            'BART Tuned Summary': fine_bart_summary,
        })

    # Build the frame once after the loop; fixing the columns keeps the
    # schema stable even when there are no clients.
    summary_df = pd.DataFrame(summaries, columns=result_columns)
    return summary_df

In [6]:
# Build the per-client concatenated note text from the first workbook.
Client_Notes = preprocessing("SidNotes.xlsx")

In [7]:
#Client_Notes

In [8]:
# Summarise every client's notes with both BART models (loads both pipelines, so this is slow).
summary_df=BART(Client_Notes)

Your max_length is set to 250, but your input_length is only 217. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=108)
Your max_length is set to 250, but your input_length is only 217. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=108)
Your max_length is set to 250, but your input_length is only 249. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=124)
Your max_length is set to 250, but your input_length is only 249. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

In [9]:
# Display the comparison table: original note, base summary, fine-tuned summary.
summary_df

Unnamed: 0,Client ID,Note,BART Base Summary,BART Tuned Summary
0,1,"Update On:1st Jan 2023, Per update from John S...",Payment is expected in monthly installments of...,"As per update from AP lead, payment was expect..."
1,2,"Update On:20th Jan 2024, Sent initial follow-u...",XYZ Billing team asked for clarification on $...,An initial follow-up was sent to XYZ Billing t...
2,3,"Update On:7th Jun 2023, 1st Automated reminder...",Byteware is in financial distress. Payment pla...,A series of automated reminders was sent to By...
3,4,"Update On:10th Dec 2023, Invoices stuck in pas...","Update On:10th Dec 2023, Invoices stuck in pas...",Invoices were stuck in past due queue for a lo...
4,5,"Update On:22nd Apr 2023, Payment received from...","Update On:22nd Apr 2023, Payment received from...",Payment was received from blanket Ltd for 5255...
5,6,"Update On:22nd Nov 2022, 1st automated reminde...",1st automated reminder sent to john.smith@myco...,An automated reminder was sent to john.smith@m...
6,7,"Update On:25th Jan 2024, Sent email to Maggie ...",Invoice has crossed 180 days past due and rese...,An email was sent to Maggie Green (EM) regardi...
7,8,"Update On:12th Jul 2023, Client has requested ...","Update On:28th Jul 2023, fresh client statemen...",A comprehensive list of open invoices has been...
8,9,"Update On:11th Sep 2023, Pending open invoices...",Pending open invoices list sent to Engagement ...,An open open invoices list was sent to Engagem...
9,10,"Update On:1st Oct 2022, 1st automated reminder...","Update On:1st Oct 2022, 1st automated reminder...",An automated reminder was sent to jaypee@jpgro...


In [10]:
# Create (or overwrite) the comparison workbook with the first sheet;
# the other workbooks' results are appended to this file as further sheets.
summary_df.to_excel('Summary_Compare.xlsx', sheet_name='SidNotes', index=False, engine='openpyxl')

In [11]:
# Repeat the preprocessing and summarisation for the second workbook.
# Note: this rebinds Client_Notes and summary_df, replacing the previous workbook's results.
Client_Notes = preprocessing("ShahulNotes.xlsx")
summary_df=BART(Client_Notes)

Your max_length is set to 250, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 250, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 250, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 250, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Yo

In [12]:
# Append this workbook's summaries as a new sheet of the existing comparison file.
# if_sheet_exists='replace' keeps the cell re-runnable: with the default ('error'),
# running it a second time fails because the sheet is already present.
with pd.ExcelWriter('Summary_Compare.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    summary_df.to_excel(writer, sheet_name='ShahulNotes', index=False)

In [13]:
# Third workbook: preprocess, summarise, and append the result as its own sheet.
Client_Notes = preprocessing("AbhiNotes.xlsx")
summary_df = BART(Client_Notes)
# if_sheet_exists='replace' keeps the cell re-runnable: with the default ('error'),
# running it a second time fails because the sheet is already present.
with pd.ExcelWriter('Summary_Compare.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    summary_df.to_excel(writer, sheet_name='AbhiNotes', index=False)

Your max_length is set to 250, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 250, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 250, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 250, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your