In [8]:
%pip install transformers datasets torch scikit-learn accelerate

Note: you may need to restart the kernel to use updated packages.


In [9]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('LightTai/personalized-email')

In [10]:
print(dataset)
print(dataset['train'][:5])
df = dataset['train'].to_pandas()
print(df.head())
print(df.shape)

DatasetDict({
    train: Dataset({
        features: ['product', 'gender', 'profession', 'hobby', 'email'],
        num_rows: 30
    })
})
{'product': ['piano lessons', 'guitar lessons', 'vacation plans', 'vacation plans', 'vacation plans'], 'gender': ['male', 'male', 'male', 'female', 'female'], 'profession': ['college students', 'college students', 'college students', 'college students', 'company employees'], 'hobby': ['like to play piano', 'like to play piano', 'like swimming', 'like to look at the scenery', 'like to look at the scenery'], 'email': ["Subject: Elevate Your Piano Skills - Exclusive Offer Inside!\n\nHey [Name],\n\nLooking to unlock your piano potential? As a fellow male college student and a passionate piano player, I understand your love for music. That's why I'm thrilled to offer you exclusive piano lessons designed to fit your busy student schedule.\n\nMaster your favorite melodies, refine techniques, and gain a deeper understanding of music theory-all while enjoyin

In [11]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset

model_checkpoint = "postbot/distilgpt2-emailgen-V2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    features = [f"{prod} {gen} {prof} {hob}" 
                for prod, gen, prof, hob in zip(examples["product"], 
                                                examples["gender"], 
                                                examples["profession"], 
                                                examples["hobby"])]
    tokenized_inputs = tokenizer(features, truncation=True, padding="max_length", max_length=512)

    # Tokenize the email column which is our target
    tokenized_targets = tokenizer(examples["email"], truncation=True, padding="max_length", max_length=512)

    tokenized_inputs['labels'] = tokenized_targets['input_ids']  # Assign target token ids as labels for training
    return tokenized_inputs


# Assuming 'df' is your DataFrame and it's already loaded
train_df, test_df = train_test_split(df, test_size=0.2)

# Convert the DataFrames back to Hugging Face dataset format if necessary
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])
# Assuming 'dataset' is a Hugging Face 'datasets' object
# tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])


Map (num_proc=4):   0%|          | 0/24 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6 [00:00<?, ? examples/s]

In [12]:
print(tokenized_train_dataset)

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 24
})


In [13]:
tokenized_train_dataset[1]

{'__index_level_0__': 22,
 'input_ids': [6966,
  27757,
  4048,
  4409,
  3504,
  12,
  1886,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  5025

In [14]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Save the model and tokenizer after training is complete
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 6.779111862182617, 'eval_runtime': 30.5605, 'eval_samples_per_second': 0.196, 'eval_steps_per_second': 0.065, 'epoch': 1.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.766538143157959, 'eval_runtime': 49.2908, 'eval_samples_per_second': 0.122, 'eval_steps_per_second': 0.041, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.4993414878845215, 'eval_runtime': 52.3809, 'eval_samples_per_second': 0.115, 'eval_steps_per_second': 0.038, 'epoch': 3.0}
{'train_runtime': 1192.2375, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.015, 'train_loss': 5.714724646674262, 'epoch': 3.0}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [15]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(eval_results)

  0%|          | 0/2 [00:00<?, ?it/s]

Perplexity: 89.96
{'eval_loss': 4.4993414878845215, 'eval_runtime': 32.2455, 'eval_samples_per_second': 0.186, 'eval_steps_per_second': 0.062, 'epoch': 3.0}


In [22]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, set_seed
import re

# Set a random seed for reproducible results
set_seed(42)

# Load the model and tokenizer
model_checkpoint = "./saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, padding_side='left')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Set up the pipeline using the freshly trained model and tokenizer
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def clean_generated_text(text):
    # Enhanced cleanup: remove redundant phrases and unwanted tokens
    text = re.sub(r'\b(URL)\b\s*', '', text)  # Remove 'URL' placeholders
    text = re.sub(r'\b(\d{1,2}[:]\d{2}\s*(AM|PM))\b', '', text)  # Remove standalone time
    text = re.sub(r'\[\d+\]', '', text)  # Remove citation-like numbers
    text = re.sub(r"NUMBER|EMAIL|PHONE|FAX", "", text)
    text = re.sub(r"\[.*?\]|-\s*-\s*|<.*?>", "", text)  # Clean up brackets and dashed lines
    text = re.sub(r'\s{2,}', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()

def generate_email(input_text, use_pipeline=False):
    if use_pipeline:
        # Adjust generation settings: lower temperature, increase repetition penalty
        result = generator(input_text, max_length=256, do_sample=True, top_k=50, temperature=0.5, repetition_penalty=1.5, truncation=True)
        generated_text = result[0]['generated_text']
    else:
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        outputs = model.generate(
            **inputs,
            max_length=612,
            temperature=0.5,
            top_k=40,
            top_p=0.9,
            no_repeat_ngram_size=3,
            repetition_penalty=1.5
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned_text = clean_generated_text(generated_text)
    return cleaned_text

# Generate text using the pipeline
email_text1 = generate_email("Hello, Following up on the bubblegum shipment.", use_pipeline=True)
email_text2 = generate_email("Please confirm the delivery date for our next order.", use_pipeline=True)
email_text3 = generate_email("Can you update me on the status of the invoice?", use_pipeline=True)

# Print the generated emails
print(email_text1)
print("--------------------")
print("--------------------")
print("--------------------")
print(email_text2)
print("--------------------")
print("--------------------")
print("--------------------")
print(email_text3)
print("--------------------")
print("--------------------")
print("--------------------")

Hello, Following up on the bubblegum shipment.
I'll send you a note about this week to let us know if there is anything we can do/could be helpful in our efforts (e.g., printing out samples of these and preparing other materials for your label).
Thanks!
Janejane Fallin | Vice President & Head Of Sales | COMPANY Pictures Television
( | | * W. Washington Blvd #Culver City, CA * : | * <mailto:
Attachments:
x ( Bytes)
ATT.htm ("");<warsetTicketInfo%%
X-MS-HasBought=EstateContent div.MBAtuO;ATTn=writes-all;margin-left:px;}div.EmailStyle {mso-style-type:personal;font-family:"Calibri","sans-serif";color:#CC}span.BalloonTextChar [{ color:#!important; font-size:px!important; line-height:px
--------------------
--------------------
--------------------
Please confirm the delivery date for our next order.
If you have any questions, please contact us at or call -Email. Thank you for your cooperation.
Sincerely,
Customer Service Department
Check your order and more: This email was sent from a notif

# V2 Pipeline Output

Hello, Following up on the bubblegum shipment.
I'll send you the info.
Hope you are well.
Best,
Janeh.
Janeh.
On Tue, Feb ,  at : AM, Cavanaugh, Kristin < wrote:Hi Janeh,I hope you are well.I spoke to Tom Rothman and he said he'd give me a call regarding your order, I told him I wanted to talk to you about your order and the order and I said I would send you a separate email that was so that I could communicate the order.I don't know if you had a chance to look at it and he suggested I send it to him directly. I've enclosed the order and my contact info for your reference.Thank you.Best,KristinErnest,Thank you for purchasing your itemPlease note that it has arrived in your carrier.
The items listed in this shipment are:x x x inch-wide x inch-wide x inch-wide x inch-wide x inch-wide x inch-wide
--------------------
--------------------
--------------------
Please confirm the delivery date for our next order.
If you have any questions, please contact the Global Service Desk or your Local IT Representative.
Global Service Desk Contact Information:
North America: 
US Toll Free: -SPE-SONY
Europe: ()-International Toll Fee Numbers
OnNet: - or -

GSD Live Chat
Regards,
SPE Identity Management
MP/WPF
ASAP
CRB(Competitive Releases)
DICER
FCM
GPMS - CopyRight
GPMS - MAGIC
GPMS - SCRY
GPMS - Titles and Registration
IntSales
Motion Pictures Portal
Script Tracker
SpiritWorld
Superbad
Worldwide Print Tracking System (WPTS)
Worldwide Publicity Website
Productions
C
Calypso
Dropzone
GPAS
Motion Pictures Production Database(MPPDB)
Tview
TV
BB
CC
Carmen
DealTracker
Dr. Oz
DTSM
ITSM/SARA
--------------------
--------------------
--------------------
Can you update me on the status of the invoice?
If not, please let me know.
Thanks,
Larry
Larry Marino

On Apr , , at : AM, Larry Marino < wrote:
Hi Larry,
Jane was hoping to see if we could get an invoice from ECS for this month's sales tax. We are working on a deal to make this work; the invoice is supposed to be CURRENCY,. I'm sorry but we won't be able to get a hard copy.
Thanks again!
Larry Marino

On Apr , , at : AM, Larry Marino < wrote:
Thanks Larry,
I hope that you are well.
I wanted to follow up with you briefly regarding the payment for the June invoice. I have received an invoice from ECS to make certain that we were billed by ECS for this year.
The invoice is supposed to be CURRENCY,.
The invoice is for May  through  and the invoice will be CURRENCY,.
Thanks again,
--------------------
--------------------
--------------------


In [21]:
# Generate text with detailed control
custom_text1 = generate_email("Laptop Male Software Engineer Cycling", use_pipeline=False)
custom_text2 = generate_email("Mobile Female Data Scientist Hiking", use_pipeline=False)
custom_text3 = generate_email("Desktop Male Graphic Designer Painting", use_pipeline=False)

# Print the generated custom texts
print(custom_text1)
print("--------------------")
print("--------------------")
print("--------------------")
print(custom_text2)
print("--------------------")
print("--------------------")
print("--------------------")
print(custom_text3)
print("--------------------")
print("--------------------")
print("--------------------")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Laptop Male Software Engineer Cycling Club
URL URL URL URL
_____
This e-mail message is intended only for the individual or entity to which it is addressed and may contain information that is privileged, confidential, or exempt from disclosure under applicable Federal or State law. If the reader of this e-mai-mail is not the intended recipient, or the employee or agent responsible for delivering the message to the intended recipients, you are hereby notified that any dissemination, distribution or copying of this communication is strictly prohibited. If you
--------------------
--------------------
--------------------
Mobile Female Data Scientist Hiking in the US
URL
--URL This message was sent by: BBC Worldwide Americas
 Avenue of the Americas New York, NY, 
This email is to provide you with a personal digital file of all emails from BBC Worldwide into your inbox. If you prefer not to continue receiving email communications, please unsubscribe here instead of replying to this email.


# V2 Detailed Output

Laptop Male Software Engineer Cycling Club
URL URL URL URL
_____
This e-mail message is intended only for the individual or entity to which it is addressed and may contain information that is privileged, confidential, or exempt from disclosure under applicable Federal or State law. If the reader of this e-mai-mail is not the intended recipient, or the employee or agent responsible for delivering the message to the intended recipients, you are hereby notified that any dissemination, distribution or copying of this communication is strictly prohibited. If you
--------------------
--------------------
--------------------
Mobile Female Data Scientist Hiking in the US
URL
--URL This message was sent by: BBC Worldwide Americas
 Avenue of the Americas New York, NY, 
This email is to provide you with a personal digital file of all emails from BBC Worldwide into your inbox. If you prefer not to continue receiving email communications, please unsubscribe here instead of replying to this email.
To update your profile and customize what email alerts and newsletters you receive, please click here.
Having
--------------------
--------------------
--------------------
Desktop Male Graphic Designer Painting -.
URL URL URL URL #
>
> -Houzz Logo
> //RESERVATION STAMPED HANDS AND LINE BLUES CURRENCY.
Image Credit: Kiki Bentonka
This is a photo message from my Houzz ideabook.
If you are having trouble viewing this, click here.
Share This: URL URL
URL
Thank You,
--------------------
--------------------
--------------------


# V1 Mode Output

<!-- # Laptop Male Software Engineer Cycling Team
# <NUMBER>-<PHONE>
# Email: <EMAIL> | Twitter: @Cycling_Team
# Cyclists are invited to participate in the Cycling Cycling World Cup in Brazil.
# The Cycling Federation of Brazil is a global organization dedicated to promoting cycling 
# and the development of sustainable living. Cycling is an international organization that promotes the health, 
# safety and well-being of all people. The Cycling Foundation is dedicated solely to the pursuit of the highest quality 
# and quality of life -->