# IMPORTING NECESSARY PACKAGES

In [1]:
!pip install transformers
!pip install gradio
!pip install datasets transformers
!pip install rouge_score

Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.1-py3-none-any.whl.metadata (26 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.1.1 (from gradio)
  Downloading gradio_client-1.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m802.0 kB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting

In [2]:
import gradio as gr
from transformers import pipeline
import os
from datasets import load_metric
import zipfile
import os
import re
import pandas as pd
import torch

# MODEL FROM HF SPACE AND SUMMARIZE_EMAIL FUNCTION

In [3]:
hf_api_key = os.getenv("HUGGING_FACE_API_KEY")

In [4]:
device = 0 if torch.cuda.is_available() else -1

In [19]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
model_name = "paramasivan27/distilgpt2_for_email_summarization_enron"
# Load the summarization pipeline with your pre-trained model

# Function to summarize email
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Ensure the pad token is set
tokenizer.pad_token = tokenizer.eos_token


def summarize_email(email_body, max_new_tokens=50):
    # Prepare the input prompt
    prompt = f"Generate a subject line for the following email in less than 5 words: {email_body}\nSummary:"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True, padding=True)
    input_length = inputs['input_ids'].shape[1]

    # Adjust max_length to be a certain percentage of the input length
    min_length = 3
    adjusted_max_length = max(min_length + 10, int(input_length * 0.6))  # Ensure a minimum length

    # Generate the summary
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=7,
        min_length=min_length,
        #length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Extract the summary part
    summary = summary.replace(prompt, "").strip()
    return summary

# LOAD TEST DATA

In [6]:
# Define the path to the zip file and the extraction directory

zip_file_path = 'dev.zip'
extraction_dir = 'enron_emails'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(extraction_dir)

# List the contents of the extraction directory
extracted_files = os.listdir(extraction_dir)
extracted_files

['dev']

In [7]:
test_dir = os.path.join(extraction_dir, 'dev')

# Print the count of files in each directory
test_files = os.listdir(test_dir)
print('Test Files Count', len(test_files))

Test Files Count 1960


In [8]:
# Define a function to process each file and extract email body and subject line for DEV and TEST FILES

###### PREPROCESS ########
# Replacing multiple spaces to single space
# Remove special characters, as they do not influence subject line generation

def process_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    email_body_initial, subject_line = content.split('@subject\n', 1)
    email_body = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', email_body_initial).strip())
    subjects = subject_line.replace("@ann0", "@subject\n").replace("@ann1", "@subject\n").replace("@ann2", "@subject\n")
    subject_0, subject_1, subject_2, subject_3 = subjects.split("@subject")
    return email_body.strip(), subject_0.strip(), subject_1.strip(), subject_2.strip(), subject_3.strip()

In [9]:
# Process all TEST files and store the results in a DataFrame (Capture data in CSV for review)
fh_1 = open('test_loader.txt', 'w')

data = []
for file_name in os.listdir(test_dir):
    file_path = os.path.join(test_dir, file_name)
    fh_1.write(file_path)
    email_body, subject_line, subject_line1, subject_line2, subject_line3 = process_file(file_path)
    fh_1.write('\n')
    fh_1.write(email_body)
    fh_1.write('\n')
    fh_1.write(subject_line)
    fh_1.write('\n')
    data.append((email_body, subject_line, subject_line1, subject_line2, subject_line3))

# Create a DataFrame from the processed data
df_test = pd.DataFrame(data, columns=['email_body', 'subject_line', 'subject_line1', 'subject_line2', 'subject_line3' ])
fh_1.close()
# Display the DataFrame
print('Shape Of the TEST Dataset Dataframe :', df_test.shape)
df_test.head()

Shape Of the TEST Dataset Dataframe : (1960, 5)


Unnamed: 0,email_body,subject_line,subject_line1,subject_line2,subject_line3
0,There are several new computer viruses that ar...,Computer Virus,how to handle computer viruses that arrive in ...,warning- viruses targeting windows users,email viruses warning notice
1,Brock whats going on We need to hear back from...,Where are you ?,response needed,need status update by monday,please respond about work
2,Hi In an industry driven by forces such as eco...,Improve your margins by enhancing your IT Infr...,izhuta benefits,izuhta can deliver knowledge and insight to yo...,what manufacturers want
3,Thanksgiving is set at Moms for Thursday Novem...,Thanksgiving Day,mom & thanksgiving itinerary,thanksgiving is set,"thanksgiving is set at mom's for thursday, nov..."
4,This message is to confirm your EOL transactio...,EOL Confirmation -Transwestern Pipeline Company,eol confirmation,eol transaction confirmation,eol transaction confirmation: transwestern pi...


# CALCULATE ROUGE SCORE

In [10]:
df_rouge = df_test

In [22]:
df_rouge.drop(columns = ['generated_summary'], inplace = True)

In [23]:
df_rouge

Unnamed: 0,email_body,subject_line,subject_line1,subject_line2,subject_line3
0,There are several new computer viruses that ar...,Computer Virus,how to handle computer viruses that arrive in ...,warning- viruses targeting windows users,email viruses warning notice
1,Brock whats going on We need to hear back from...,Where are you ?,response needed,need status update by monday,please respond about work
2,Hi In an industry driven by forces such as eco...,Improve your margins by enhancing your IT Infr...,izhuta benefits,izuhta can deliver knowledge and insight to yo...,what manufacturers want
3,Thanksgiving is set at Moms for Thursday Novem...,Thanksgiving Day,mom & thanksgiving itinerary,thanksgiving is set,"thanksgiving is set at mom's for thursday, nov..."
4,This message is to confirm your EOL transactio...,EOL Confirmation -Transwestern Pipeline Company,eol confirmation,eol transaction confirmation,eol transaction confirmation: transwestern pi...
...,...,...,...,...,...
1955,Dear Advanced Seminar Speakers We presume that...,"2001 ICSC LAW CONFERENCE- PALM DESERT, CA (OCT...",advanced seminar speakers - please read and re...,law conference speakers & requirements,is this accurate?
1956,We want to personally thank you for agreeing t...,Launch of Bauer College Energy Committee - Nov...,thanks for joining the c.t. bauer college of b...,energy committee of the ct,energy committee of the c.t. member news
1957,Once again this year I am leading the BEAR BE ...,BEAR (BE A Resource for CPS Kids) Holiday Fund...,bear - kids' holiday fundraiser,fundraiser for kids,holiday fundraiser
1958,Judge McCartney presented her report on the SE...,FERC meeting: SE RTO,recommendation for se rto mediation to ferc,judge mccartney recommendation,judge mccartney recommendation information


In [24]:
# Load the ROUGE metric
rouge = load_metric("rouge")

# Generate summaries for the test dataset
def generate_summary(text):
    email_body = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text).strip())
    try:
        summary = summarize_email(email_body)
        #print(summary)
    except:
        summary = "Error"
        #print(email_body)
    return summary

In [13]:
email_text = """John,   I am working with Gerald and Debra on implementing a GISB with Tenn. Gas.
Currently, we have worked out every detail.
Their legal department is reviewing our special provisions and then they are supposed to sign two copies of the agreement and send them to us for final execution.
If you have any questions with respect to this Agreement, please do not hesitate to contact me.
Best Regards,"""

response = generate_summary(email_text)
print(response)

Thanks for your help on this matter


In [None]:
df_rouge['generated_summary'] = df_rouge['email_body'].apply(generate_summary)

In [None]:
"""import concurrent.futures

def apply_multithreaded(func, df, column_name, num_threads=100):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map the function to the column using multithreading
        results = list(executor.map(func, df[column_name]))

    # Add the results to a new column in the DataFrame
    df[f"{column_name}_summary"] = results
    return df

# Apply the function to the 'email_body' column
df_rouge = apply_multithreaded(generate_summary, df_rouge, 'email_body')

print(df_rouge)"""

In [16]:
df_rouge.to_csv('test_rouge.csv')

In [17]:
from rouge_score import rouge_scorer

df_rouge.to_csv()

emailbd = df_rouge['email_body'].tolist()
preds = df_rouge['generated_summary'].tolist()
temp = df_rouge[['subject_line', 'subject_line1', 'subject_line2', 'subject_line3']]
labels = temp.values.tolist()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Generate predictions and calculate ROUGE scores
all_rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# Rouge expects input as list of strings for predictions and references
for i in range(len(labels)):
  for lbl in labels[i]:
    #rouge_output = rouge.compute(predictions=preds, references=lbl, use_stemmer=True)
    scores = scorer.score(preds[i], lbl)
    for key in all_rouge_scores.keys():
      all_rouge_scores[key].append(scores[key].fmeasure)

avg_rouge_scores = {key: sum(scores)/len(scores) for key, scores in all_rouge_scores.items()}
print("Average ROUGE Scores:", avg_rouge_scores)

Average ROUGE Scores: {'rouge1': 0.10658709214367193, 'rouge2': 0.041839523780108655, 'rougeL': 0.10234448377437692}


# GRADIO APP

In [18]:
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_email,
    inputs=gr.Textbox(lines=10, placeholder="Enter the email body here..."),
    outputs=gr.Textbox(),
    title="Email Subject Line Generator Using GPT2",
    description="Generate an email subject line from the email body."
)

# Launch the interface
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://dbc888549b1b607dfa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


