# IMPORTING NECESSARY PACKAGES

In [1]:
!pip install transformers
!pip install gradio
!pip install datasets transformers
!pip install rouge_score

Collecting gradio
  Downloading gradio-4.38.1-py3-none-any.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting altair<6.0,>=5.0 (from gradio)
  Downloading altair-5.3.0-py3-none-any.whl (857 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.8/857.8 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi (from gradio)
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.1.0 (from gradio)
  Downloading gradio_client-1.1.0-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [5]:
import gradio as gr
from transformers import pipeline
import os
from datasets import load_metric
import zipfile
import os
import re
import pandas as pd
import torch

# MODEL FROM HF SPACE AND SUMMARIZE_EMAIL FUNCTION

In [3]:
hf_api_key = os.getenv("HUGGING_FACE_API_KEY")

In [6]:
device = 0 if torch.cuda.is_available() else -1

In [37]:
from transformers import pipeline

# Load the summarization pipeline with your pre-trained model
pipe = pipeline("summarization", model="paramasivan27/bart_for_email_summarization_enron")
# Function to summarize email
def summarize_email(email_body):
    # Tokenize the input text
    pipeline = pipe
    input_tokens = pipeline.tokenizer(email_body, return_tensors='pt', truncation=False)
    input_length = input_tokens['input_ids'].shape[1]

    # Adjust max_length to be a certain percentage of the input length
    adjusted_max_length = max(10, int(input_length * 0.6))  # Ensure a minimum length

    # Generate summary with dynamic max_length
    gen_kwargs = {
        "length_penalty": 0.5,
        "num_beams": 5,
        "max_length": adjusted_max_length,
        "min_length": 3
    }

    summary = pipeline(email_body, **gen_kwargs)[0]['summary_text']
    return summary

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# LOAD TEST DATA

In [9]:
# Define the path to the zip file and the extraction directory

zip_file_path = 'test.zip'
extraction_dir = 'enron_emails'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(extraction_dir)

# List the contents of the extraction directory
extracted_files = os.listdir(extraction_dir)
extracted_files

['test']

In [10]:
test_dir = os.path.join(extraction_dir, 'test')

# Print the count of files in each directory
test_files = os.listdir(test_dir)
print('Test Files Count', len(test_files))

Test Files Count 1906


In [11]:
# Define a function to process each file and extract email body and subject line for DEV and TEST FILES

###### PREPROCESS ########
# Replacing multiple spaces to single space
# Remove special characters, as they do not influence subject line generation

def process_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    email_body_initial, subject_line = content.split('@subject\n', 1)
    email_body = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', email_body_initial).strip())
    subjects = subject_line.replace("@ann0", "@subject\n").replace("@ann1", "@subject\n").replace("@ann2", "@subject\n")
    subject_0, subject_1, subject_2, subject_3 = subjects.split("@subject")
    return email_body.strip(), subject_0.strip(), subject_1.strip(), subject_2.strip(), subject_3.strip()

In [12]:
# Process all TEST files and store the results in a DataFrame (Capture data in CSV for review)
fh_1 = open('test_loader.txt', 'w')

data = []
for file_name in os.listdir(test_dir):
    file_path = os.path.join(test_dir, file_name)
    fh_1.write(file_path)
    email_body, subject_line, subject_line1, subject_line2, subject_line3 = process_file(file_path)
    fh_1.write('\n')
    fh_1.write(email_body)
    fh_1.write('\n')
    fh_1.write(subject_line)
    fh_1.write('\n')
    data.append((email_body, subject_line, subject_line1, subject_line2, subject_line3))

# Create a DataFrame from the processed data
df_test = pd.DataFrame(data, columns=['email_body', 'subject_line', 'subject_line1', 'subject_line2', 'subject_line3' ])
fh_1.close()
# Display the DataFrame
print('Shape Of the TEST Dataset Dataframe :', df_test.shape)
df_test.head()

Shape Of the TEST Dataset Dataframe : (1906, 5)


Unnamed: 0,email_body,subject_line,subject_line1,subject_line2,subject_line3
0,John I am working with Gerald and Debra on imp...,Tennessee Gas Pipeling GISB,gisb with tenn. gas status,implementing gisb with tenn gas final execution,re: gisb with tenn. gas
1,At Richard Sanders request we ask that you fre...,Request for Freeze on E-Mail Destruction,freeze destruction of electronic mail,bid/ask litigation request,please freeze destruction of e-mails
2,Derek Could you call me sometime this week so ...,Annex A Letter,call me for annex a,request for contact regarding the annex a mail...,please call regarding annex a mail-out
3,This is a reminder that the ectenroncomeienron...,UPDATE: Supported Internet Email Addresses,reminder on update of email domains effective ...,deadline: email domains will be no longer avai...,email domains to be decommissioned
4,Ginny Please see the attached guaranty A clean...,Enron Corp. Guaranty,attached: guaranty for approval,please approve guaranty,attached guaranty.


# CALCULATE ROUGE SCORE

In [13]:
df_rouge = df_test

In [33]:
# Load the ROUGE metric
rouge = load_metric("rouge")

# Generate summaries for the test dataset
def generate_summary(text):
    email_body = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text).strip())
    try:
        summary = summarize_email(email_body)
        #print(summary)
    except:
        summary = "Error"
        #print(email_body)
    return summary

In [38]:
email_text = """John,   I am working with Gerald and Debra on implementing a GISB with Tenn. Gas.
Currently, we have worked out every detail.
Their legal department is reviewing our special provisions and then they are supposed to sign two copies of the agreement and send them to us for final execution.
If you have any questions with respect to this Agreement, please do not hesitate to contact me.
Best Regards,"""

response = generate_summary(email_text)
print(response)

Tenn Gas GISB


In [None]:
df_rouge['generated_summary'] = df_rouge['email_body'].apply(generate_summary)

In [None]:
"""import concurrent.futures

def apply_multithreaded(func, df, column_name, num_threads=100):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map the function to the column using multithreading
        results = list(executor.map(func, df[column_name]))

    # Add the results to a new column in the DataFrame
    df[f"{column_name}_summary"] = results
    return df

# Apply the function to the 'email_body' column
df_rouge = apply_multithreaded(generate_summary, df_rouge, 'email_body')

print(df_rouge)"""

In [18]:
df_rouge

Unnamed: 0,email_body,subject_line,subject_line1,subject_line2,subject_line3
0,Please put the 17th meeting on my calendar I m...,Net Works Floor Meetings - DATES AND LOCATIONS...,instructions for the meeting on the 17th,i need meeting on 17th added to my calendar,calendar meeting for 17th
1,Its been long enough since I spent any time in...,Dead Horses,investment strategy anecdotes,dakota indian wisdom re: a dead horse,dead horse strategies
2,Rebecca I share some of your concerns regardin...,Mr. Sud,concerns,regarding your concerns,mr. sud update and clarification
3,Ruben S Brown rsbrownecubedllccom writes to th...,Try Ananova.com for much coverage of today's t...,nyiso_tech_exchange discussion,today's terrorist incidents?,updates from ruben s. brown on how to get info...
4,Just a note that all Michcon deals need to be ...,Michcon Deals,michcon deal entry,michon deals protocol,note: michcon entries
...,...,...,...,...,...
1901,The EnPower database will be down from 4 PM Fr...,EnPower outage,database will be down,enpower database downtime,down databases
1902,Marcus TD has scheduled a conference call with...,TD crude swap,conference call re: crude swap happening this ...,conference call on crude swap,conference call request - crude swap
1903,Paul Please replace both references to the Sou...,Arbitration Language,arbitration provision reference changes,references for arbitration provision,arbitration provision changes
1904,Since our department has grown we have realize...,New Temp IDs,request for addition of 3 temporary ids,new temporary ids,additional temp id access rights


In [14]:
from rouge_score import rouge_scorer

df_rouge.to_csv()

emailbd = df_rouge['email_body'].tolist()
preds = df_rouge['generated_summary'].tolist()
temp = df_rouge[['subject_line', 'subject_line1', 'subject_line2', 'subject_line3']]
labels = temp.values.tolist()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Generate predictions and calculate ROUGE scores
all_rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# Rouge expects input as list of strings for predictions and references
for i in range(len(labels)):
  for lbl in labels[i]:
    #rouge_output = rouge.compute(predictions=preds, references=lbl, use_stemmer=True)
    scores = scorer.score(preds[i], lbl)
    for key in all_rouge_scores.keys():
      all_rouge_scores[key].append(scores[key].fmeasure)

avg_rouge_scores = {key: sum(scores)/len(scores) for key, scores in all_rouge_scores.items()}
print("Average ROUGE Scores:", avg_rouge_scores)

Average ROUGE Scores: {'rouge1': 0.29717840660543454, 'rouge2': 0.14851952028411372, 'rougeL': 0.2847933249136446}


# GRADIO APP

In [None]:
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_email,
    inputs=gr.Textbox(lines=10, placeholder="Enter the email body here..."),
    outputs=gr.Textbox(),
    title="Email Subject Line Generator Using BART",
    description="Generate an email subject line from the email body."
)

# Launch the interface
interface.launch()