In [1]:
#Copy data to collab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import shutil

In [3]:
destination_path = '/content/emailsubjects'
source_path = '/content/drive/My Drive/emailsubjects/'

# Copy the file
shutil.copytree(source_path, destination_path,dirs_exist_ok=True)

'/content/emailsubjects'

In [4]:
import pandas as pd

In [76]:
test_df = pd.read_csv('emailsubjects/enron_subject_test.csv')
training_df = pd.read_csv('emailsubjects/enron_subject_train.csv')

In [77]:
test_df = test_df[['text', 'subject']]
training_df = training_df[['text', 'subject']]

In [70]:
import numpy as np

In [78]:
train_df, validate_df = np.split(training_df.sample(frac=1), [int(.8*len(training_df))])

  return bound(*args, **kwds)


In [79]:
# train_df = train_df.head(100)
# valid_df = train_df.tail(20)
# test_df = test_df.head(20)

In [80]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

In [81]:
class EmailTextDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer  # Tokenizer for encoding text
        self.data = data            # Data containing email boy and subject
        self.max_length = max_length # Maximum length of tokens

    # Return the number of items in the dataset
    def __len__(self):
        return len(self.data)

    # Retrieve an item from the dataset by index
    def __getitem__(self, idx):
        item = self.data.iloc[idx]  # Get the row at the specified index
        text = item['text'] # Extract text from the row
        subject = item['subject']   # Extract summary from the row

        # Encode the text as input data for the model
        source = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

        # Encode the subject as target data for the model
        target = self.tokenizer.encode_plus(
            subject,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

        # Return a dictionary containing input_ids, attention_mask, labels, and the original summary text
        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten(),
            'summary': subject
        }

In [82]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Initialize the tokenizer for BART
# 'facebook/bart-base' is a pretrained model identifier
# The tokenizer is responsible for converting text input into tokens that the model can understand
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Initialize the BART model for conditional generation
# This model is used for tasks like summarization where the output is conditional on the input text
# The model is loaded with pretrained weights from 'facebook/bart-base'
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [83]:
train_dataset = EmailTextDataset(tokenizer, train_df)
valid_dataset = EmailTextDataset(tokenizer, validate_df)

In [91]:
test_dataset = EmailTextDataset(tokenizer, test_df)

In [86]:
from transformers import TrainingArguments

# Define training arguments for the model
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save model output and checkpoints
    num_train_epochs=3,              # Number of epochs to train the model
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=100,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir='./logs',            # Directory to save logs
    logging_steps=10,                # Log metrics every specified number of steps
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
    report_to='none'                 # Disables reporting to any online services (e.g., TensorBoard, WandB)
)



In [87]:
# Initializing the Trainer object
trainer = Trainer(
    model=model,             # The model to be trained (e.g., our BART model)
    args=training_args,      # Training arguments specifying training parameters like learning rate, batch size, etc.
    train_dataset=train_dataset,  # The dataset to be used for training the model
    eval_dataset=valid_dataset    # The dataset to be used for evaluating the model during training
)

# Starting the training process
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0482,0.042735
2,0.0343,0.040799
3,0.0316,0.040619


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=4332, training_loss=0.16953836290796984, metrics={'train_runtime': 5975.3186, 'train_samples_per_second': 5.798, 'train_steps_per_second': 0.725, 'total_flos': 1.056185492963328e+16, 'train_loss': 0.16953836290796984, 'epoch': 3.0})

In [14]:
#!pip install datasets

In [88]:
from datasets import load_metric
from torch.utils.data import DataLoader

# Load the ROUGE metric for evaluation
rouge = load_metric('rouge')

In [89]:
def generate_subjects(model, tokenizer, dataset, batch_size=8):
    """
    Generate email subject using the provided model and tokenizer on the given dataset.

    Args:
        model: The trained summarization model.
        tokenizer: Tokenizer associated with the model.
        dataset: Dataset for which subject need to be generated.
        batch_size: Number of data samples to process in each batch.

    Returns:
        subjects: Generated subject by the model.
        references: Actual subjects from the dataset for comparison.
    """
    # Set model to evaluation mode
    model.eval()
    subjects = []    # List to store generated subjects
    references = []   # List to store actual subjects

    # Create a DataLoader for batch processing
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Disabled gradient calculations for efficiency
    with torch.no_grad():
        for batch in dataloader:
            # Move input data to the same device as the model
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)

            # Generate subjects with the model
            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=2048, num_beams=2)
            batch_subjects = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]

            # Append generated and actual subjects to the respective lists
            subjects.extend(batch_subjects)
            references.extend(batch['summary'])

    return subjects, references

In [92]:
# Generate summaries for the validation dataset
generated_subjects, actual_subjects = generate_subjects(model, tokenizer, test_dataset, batch_size=8)

In [93]:
generated_subjects[:10]

['Title Change',
 'Expense Reports Awaiting Your Approval',
 'Netco Re-Start/Integration Plans',
 'Comments',
 'PSA distribution',
 'BCP Seats',
 'Happy New Year!',
 'EOL slide',
 'Analyst & Associate Recruiting Dates',
 'Enron Center Security']

In [96]:
actual_subjects[:10]

['Huntley/question',
 'Expense Reports Awaiting Your Approval',
 'Re-start/Integration Planning',
 'RM Simulation Storyline Scripts - Ready for Legal Review',
 'Answer',
 'BCP Seat Assignments',
 'Farewell Drinks',
 'Slide for John Sheriff',
 'URGENT - ENA Associates & Analysts',
 'Increased Security at Enron Center']

In [102]:
# Check if CUDA (GPU support) is available and choose the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the chosen device
model = model.to(device)

In [None]:
def generate_subjects(model, tokenizer, dataset, batch_size=8):
    """
    Generate email subject using the provided model and tokenizer on the given dataset.

    Args:
        model: The trained summarization model.
        tokenizer: Tokenizer associated with the model.
        dataset: Dataset for which subject need to be generated.
        batch_size: Number of data samples to process in each batch.

    Returns:
        subjects: Generated subject by the model.
        references: Actual subjects from the dataset for comparison.
    """
    # Set model to evaluation mode
    model.eval()
    subjects = []    # List to store generated subjects
    references = []   # List to store actual subjects

    # Create a DataLoader for batch processing
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Disabled gradient calculations for efficiency
    with torch.no_grad():
        for batch in dataloader:
            # Move input data to the same device as the model
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)

            # Generate subjects with the model
            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=2048, num_beams=2)
            batch_subjects = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]

            # Append generated and actual subjects to the respective lists
            subjects.extend(batch_subjects)
            references.extend(batch['summary'])

    return subjects, references

In [103]:
def summarize_text(text, max_length=500):
    """
    Generates a email subject for the given text using a pre-trained model.

    Args:
        text (str): Email text to generate subject.
        max_length (int): The maximum length of the input text for the model.

    Returns:
        str: The generated subject for the input text.
    """
    # Encode the input text using the tokenizer. The 'pt' indicates PyTorch tensors.
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=max_length, truncation=False)

    # Move the encoded text to the same device as the model (e.g., GPU or CPU)
    inputs = inputs.to(device)

    # Generate summary IDs with the model. num_beams controls the beam search width.
    # early_stopping is set to False for a thorough search, though it can be set to True for faster results.
    summary_ids = model.generate(inputs, max_length=500, num_beams=30, early_stopping=False)

    # Decode the generated IDs back to text, skipping special tokens like padding or EOS.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Return the generated summary
    return summary

In [97]:
# Compute and print the ROUGE score for evaluation
rouge_score = rouge.compute(predictions=generated_subjects, references=actual_subjects)
print(rouge_score)

{'rouge1': AggregateScore(low=Score(precision=0.37378795114760516, recall=0.3137390080917105, fmeasure=0.31843537088455953), mid=Score(precision=0.39026023563995793, recall=0.32869717070267956, fmeasure=0.3329765689211063), high=Score(precision=0.40727632711450296, recall=0.34410358718464684, fmeasure=0.34785449779108746)), 'rouge2': AggregateScore(low=Score(precision=0.19850224143580988, recall=0.16490013840945092, fmeasure=0.1671922049182846), mid=Score(precision=0.21470265519597276, recall=0.17895533331656094, fmeasure=0.18117176993669765), high=Score(precision=0.2310647995320187, recall=0.19373139459642572, fmeasure=0.19576138713575938)), 'rougeL': AggregateScore(low=Score(precision=0.3660595191359886, recall=0.30668777379493517, fmeasure=0.31133579804112144), mid=Score(precision=0.3835591535519398, recall=0.32336507712294665, fmeasure=0.3271782691805899), high=Score(precision=0.4003484573605243, recall=0.3378312598777578, fmeasure=0.34135193338128006)), 'rougeLsum': AggregateScore

In [98]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [100]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [105]:
model.push_to_hub("email_bart_1", token = hf_token) # Online saving
tokenizer.push_to_hub("email_bart_1", token = hf_token) # Online saving

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sbtraining2020/email_bart_1/commit/ac5b1b518c9f4080f8c01714ad00f0814104f000', commit_message='Upload tokenizer', commit_description='', oid='ac5b1b518c9f4080f8c01714ad00f0814104f000', pr_url=None, pr_revision=None, pr_num=None)

In [50]:
test_df.iloc[3].text

'Michelle Here are my very minor comments.\nHowever we still need to wait on any additions, based on meeting with SME\'s today.\nOne concern is the firing of the learner who performs  bad in the final two scenarios.\nDo we face any copyright issues using the CNN type themes?\nIn addition, I think we need to stay clear of anything that remotely seems like California or anything that really happen with Enron?\n(i.e.\nSo-cal Waha) In addition, comments on regulatory issues may be a problem (i.e.\nCalifornia Legislature).\nSheri  When you read all the scripts together and due to the similar mechanics being taught it appears very repetitious.\nThus I do believe we need to maybe use a "Dateline" type theme for one, and a "60 Minute" type theme for another scenario vice just the CNN type theme.\nIn the last two scenarios can we include a promotion out of the associate program for the stellar performers (i.e.\ntitle change to manager)?\nCheers Kirk'

In [54]:
test_df.iloc[3].subject

'RM Simulation Storyline Scripts - Ready for Legal Review'

In [104]:
# Prompt the user to enter text for summarization
text = """
Michelle Here are my very minor comments.
However we still need to wait on any additions, based on meeting with SME's today.
One concern is the firing of the learner who performs  bad in the final two scenarios.
Do we face any copyright issues using the CNN type themes?
In addition, I think we need to stay clear of anything that remotely seems like California or anything that really happen with Enron?
(i.e.So-cal Waha) In addition, comments on regulatory issues may be a problem (i.e.California Legislature).
Sheri  When you read all the scripts together and due to the similar mechanics being taught it appears very repetitious.
Thus I do believe we need to maybe use a "Dateline" type theme for one, and a "60 Minute" type theme for another scenario vice just the CNN type theme.
In the last two scenarios can we include a promotion out of the associate program for the stellar performers (i.e.title change to manager)?
Cheers Kirk
"""


# Call the summarize_text function to generate a summary of the input text
summary = summarize_text(text)

# Print the generated summary
print(summary)

Comments from SME's


In [56]:
#!pip install rouge_score

In [61]:
test_gen_subjects = []
test_actual_subjects = []

for i, row in test_df.iterrows():
    text = row['text']
    summary = row['subject']
    test_gen_subjects.append(summarize_text(text))
    test_actual_subjects.append(summary)

In [62]:
test_rouge_score = rouge.compute(predictions=test_gen_subjects, references=test_actual_subjects)
print(test_rouge_score)

{'rouge1': AggregateScore(low=Score(precision=0.18275694444444446, recall=0.13328067765567767, fmeasure=0.1478169575522517), mid=Score(precision=0.3309126984126984, recall=0.2541208791208791, fmeasure=0.26425393234216765), high=Score(precision=0.5080575396825396, recall=0.39881581959706963, fmeasure=0.4008960892049126)), 'rouge2': AggregateScore(low=Score(precision=0.04166666666666667, recall=0.032916666666667045, fmeasure=0.028482142857142942), mid=Score(precision=0.15833333333333335, recall=0.13333333333333333, fmeasure=0.13690476190476192), high=Score(precision=0.31687499999999985, recall=0.2833333333333333, fmeasure=0.28217261904761903)), 'rougeL': AggregateScore(low=Score(precision=0.17664087301587306, recall=0.13506639194139194, fmeasure=0.14514783990519284), mid=Score(precision=0.33331349206349203, recall=0.25934065934065936, fmeasure=0.26551282051282055), high=Score(precision=0.49573015873015874, recall=0.40001488095238086, fmeasure=0.4039565826330531)), 'rougeLsum': AggregateS

In [63]:
test_gen_subjects

['updated title policy',
 'Expense Reports Awaiting Your Approval',
 'Netco Re-start/Integration Plans',
 'Minor comments',
 'Increasing the PSA account balance',
 'BCP Seats',
 'We wish to toast the good times and special memories that we have shared with you over the past five years.',
 "Let's Go For A Ride",
 'ENA Recruiting dates',
 'An Inbound Message For You Has Been Quarantined',
 'CORRECTION: Welcome to Enron',
 'Netco Meeting',
 'STEAG',
 'ConEd bid documents for PJM East (Lakewood, New Jersey)',
 'URGENT REQUIRES',
 'CORRECTION: ENRON STOCK FUND',
 'Employee Agreement',
 'Important Information from UBS Wallingford',
 'Benefit Golf Tournament',
 'CORRECTION:  Outlook Outage - Friday, December 28, 2001']

In [64]:
test_actual_subjects

['Huntley/question',
 'Expense Reports Awaiting Your Approval',
 'Re-start/Integration Planning',
 'RM Simulation Storyline Scripts - Ready for Legal Review',
 'Answer',
 'BCP Seat Assignments',
 'Farewell Drinks',
 'Slide for John Sheriff',
 'URGENT - ENA Associates & Analysts',
 'Increased Security at Enron Center',
 'Greetings',
 'Meeting on Wednesday, January 2 - 9:30 am',
 'STEAG Power LLC',
 'ConEd - Lakewood Peaker',
 'Relocation',
 'IMPORTANT MESSAGE TO ALL EMPLOYEES PARTICIPATING IN THE ENRON CORP  SAVINGS & ESOP PLANS',
 'UBS issues',
 'YOU HAVE 48 HOURS - Message from Louise Kitchen',
 'Sunshine Kids Benefit Golf Tournament - March 18th',
 'NOTIFICATION:  Outlook Outage - Friday, December 28, 2001']