In [1]:
%pip install transformers datasets torch scikit-learn accelerate spacy evaluate nltk rouge_score

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('LightTai/personalized-email')

In [3]:
print(dataset)
print(dataset['train'][:5])
df = dataset['train'].to_pandas()
print(df.head())
print(df.shape)

DatasetDict({
    train: Dataset({
        features: ['product', 'gender', 'profession', 'hobby', 'email'],
        num_rows: 30
    })
})
{'product': ['piano lessons', 'guitar lessons', 'vacation plans', 'vacation plans', 'vacation plans'], 'gender': ['male', 'male', 'male', 'female', 'female'], 'profession': ['college students', 'college students', 'college students', 'college students', 'company employees'], 'hobby': ['like to play piano', 'like to play piano', 'like swimming', 'like to look at the scenery', 'like to look at the scenery'], 'email': ["Subject: Elevate Your Piano Skills - Exclusive Offer Inside!\n\nHey [Name],\n\nLooking to unlock your piano potential? As a fellow male college student and a passionate piano player, I understand your love for music. That's why I'm thrilled to offer you exclusive piano lessons designed to fit your busy student schedule.\n\nMaster your favorite melodies, refine techniques, and gain a deeper understanding of music theory-all while enjoyin

In [4]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset

model_checkpoint = "24NLPGroupO/EmailGeneration"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    features = [f"{prod} {gen} {prof} {hob}" 
                for prod, gen, prof, hob in zip(examples["product"], 
                                                examples["gender"], 
                                                examples["profession"], 
                                                examples["hobby"])]
    tokenized_inputs = tokenizer(features, truncation=True, padding="max_length", max_length=512)

    # Tokenize the email column which is our target
    tokenized_targets = tokenizer(examples["email"], truncation=True, padding="max_length", max_length=512)

    tokenized_inputs['labels'] = tokenized_targets['input_ids']  # Assign target token ids as labels for training
    return tokenized_inputs


# Assuming 'df' is your DataFrame and it's already loaded
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrames back to Hugging Face dataset format if necessary
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])
# Assuming 'dataset' is a Hugging Face 'datasets' object
# tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["product", "gender", "profession", "hobby", "email"])


Map (num_proc=4):   0%|          | 0/24 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6 [00:00<?, ? examples/s]

In [5]:
print(tokenized_train_dataset)

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 24
})


In [6]:
tokenized_train_dataset[1]

{'__index_level_0__': 24,
 'input_ids': [6966,
  27757,
  4257,
  4409,
  3504,
  12,
  1886,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  5025

In [7]:
import evaluate
import torch
import numpy as np
rouge = evaluate.load('rouge')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits from numpy arrays to PyTorch tensors
    logits_tensor = torch.tensor(logits)
    
    # Convert logits to predicted token IDs using argmax
    predictions_ids = torch.argmax(logits_tensor, dim=-1)

    # Decode predictions and labels
    predictions = tokenizer.batch_decode(predictions_ids, skip_special_tokens=True)
    # Assuming labels are already decoded; if not, decode them similarly
    references = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores
    results = rouge.compute(predictions=predictions, references=references)

    return results

In [8]:

# test_predictions = tokenized_train_dataset[1]["input_ids"]
# test_references = tokenized_train_dataset[1]["labels"]
# test_eval_pred = (test_predictions, test_references)
# rouge_results = compute_metrics(test_eval_pred)
# print(rouge_results)

In [9]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
		compute_metrics=compute_metrics
)

trainer.train()

# Save the model and tokenizer after training is complete
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3.8731205463409424, 'eval_rouge1': 0.03449665235410492, 'eval_rouge2': 0.0, 'eval_rougeL': 0.030741030841984963, 'eval_rougeLsum': 0.03161879278519639, 'eval_runtime': 25.4247, 'eval_samples_per_second': 0.236, 'eval_steps_per_second': 0.079, 'epoch': 1.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3.520601511001587, 'eval_rouge1': 0.0555936816349357, 'eval_rouge2': 0.0, 'eval_rougeL': 0.04812053494734674, 'eval_rougeLsum': 0.055684003632858296, 'eval_runtime': 35.4851, 'eval_samples_per_second': 0.169, 'eval_steps_per_second': 0.056, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3.451094388961792, 'eval_rouge1': 0.05036106701758942, 'eval_rouge2': 0.0, 'eval_rougeL': 0.045725141574365524, 'eval_rougeLsum': 0.044415482322762156, 'eval_runtime': 51.5975, 'eval_samples_per_second': 0.116, 'eval_steps_per_second': 0.039, 'epoch': 3.0}


KeyboardInterrupt: 

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(eval_results)

100%|██████████| 2/2 [00:02<00:00,  1.07s/it]

Perplexity: 14.17
{'eval_loss': 2.6512813568115234, 'eval_rouge1': 0.04014002327125941, 'eval_rouge2': 0.0, 'eval_rougeL': 0.04011816821399916, 'eval_rougeLsum': 0.03753779288902849, 'eval_runtime': 4.5731, 'eval_samples_per_second': 1.312, 'eval_steps_per_second': 0.437, 'epoch': 10.0}





In [None]:
# model.push_to_hub("24NLPGroupO/EmailGeneration")

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import re

# Load saved model and tokenizer
model_checkpoint = "24NLPGroupO/EmailGeneration"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, truncation=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Set up the generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def clean_generated_text(text):
    #Basic cleaning
    text = re.sub(r'^(Re:|Fwd:)', '', text)                         # Remove reply and forward marks
    text = re.sub(r'Best regards,.*$', '', text, flags=re.DOTALL)   # Remove signature
    text = re.sub(r'PHONE.*$', '', text, flags=re.DOTALL)           # Remove phone numbers
    text = re.sub(r'Email:.*$', '', text, flags=re.DOTALL)          # Remove email addresses
    text = re.sub(r'Cc:.*$', '', text, flags=re.DOTALL)             # Remove CC list
    text = re.sub(r'\* Attachments:.*', '', text, flags=re.S)       # Remove Attachments
    text = re.sub(r'©️ .*$', '', text, flags=re.DOTALL)              # Remove copyright and ownership statements
    text = re.sub(r'URL', '', text)                                 # Remove URLs
    text = re.sub(r'NUMBER', '10', text)                            # Replace 'NUMBER' with a real number
    text = re.sub(r'CURRENCYNUMBER', 'USD 100', text)               # Replace 'CURRENCYNUMBER' with a real value
    text = re.sub(r'About Us.*', '', text, flags=re.DOTALL)         # Remove 'About Us' and all following text
    text = re.sub(r'\d+ [^\s]+ St\.?,?.*?\d{5}', '', text)          # Remove street
    text = re.sub(r'\d+ [^\s]+ Ave\.?,?.*?\d{5}', '', text)         # Remove avenues
    text = re.sub(r'\d+ [^\s]+ Rd\.?,?.*?\d{5}', '', text)          # Remove roads
    text = re.sub(r'\d+ [^\s]+ Ln\.?,?.*?\d{5}', '', text)          # Remove lanes
    text = re.sub(r'\d+ [^\s]+ Blvd\.?,?.*?\d{5}', '', text)        # Remove boulevards
    text = re.sub(r'\d+ [^\s]+ Dr\.?,?.*?\d{5}', '', text)          # Remove drives
    text = re.sub(r'\d+ [^\s]+ Ct\.?,?.*?\d{5}', '', text)          # Remove courts
    return text.strip()

def generate_email(product, gender, profession, hobby):
    input_text = f"{product} {gender} {profession} {hobby}"
    result = generator(
        input_text,                # The starting text that guides the model on what to generate
        max_length=256,            # Set a suitable maximum length
        top_k=40,                  # Consider more top options words
        top_p=0.6,                 # Control the probability range for word choices
        temperature=0.4,           # Control the randomness of generation
        repetition_penalty=1.5,    # Reduce content repetition
        num_return_sequences=2,    # Generate three texts
        do_sample=True
    )       
    # Clean each generated text
    cleaned_texts = [clean_generated_text(seq['generated_text']) for seq in result]
    # Choose the best text based on length and clarity
    best_text = max(cleaned_texts, key=len)
    return best_text

# Example parameters and generation
product = "Laptop"
gender = "Male"
profession = "Software Engineer"
hobby = "Gaming"

email_text = generate_email(product, gender, profession, hobby)
print("Generated Email:")
print(email_text)
print("--------------------")


TypeError: __init__() got an unexpected keyword argument 'use_fast'