In [1]:
import pandas as pd
import json
import torch
import re
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. CONFIGURATION ---
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
OUTPUT_DIR = "mistral-7b-enron-email-finetune"
SAMPLES_TO_USE = 5000

In [3]:
# System Prompt defines the output style and structure the model should learn.
SYS_PROMPT = """
You write short, polite B2B cold emails.

Your job:
- Read the lead_profile and style_profile I give you.
- Then write a ready-to-send cold email for that specific person.

Hard rules:
- 60â€“120 words.
- One clear call to action.
- No links in the first email.
- Use ONLY information from lead_profile. Do not invent facts.
- Use greeting and closing from style_profile.
- DO NOT output instructions, guidelines, or placeholders.
- DO NOT explain what you are doing.

Output format (VERY IMPORTANT):
Return ONLY a single JSON object with this exact shape:
{"subject": "<short subject line>", "body": "<full email body text>"}
"""

# Placeholders simulate the input data structure for fine-tuning.
placeholder_lead_profile = {
  "name": "{first_name} {last_name}",
  "role": "{role}",
  "company": "{company}",
  "objective": "Write a professional email based on the Enron corpus style.",
}
placeholder_style_profile = {
  "greeting": "Dear {first_name},",
  "closing": "Sincerely,\\n{sender_name}",
  "assertiveness": "medium",
  "formality": "high"
}

In [4]:
# --- 2. DATA LOADING, SAMPLING, AND FORMATTING ---

print(f"Loading and sampling {SAMPLES_TO_USE} records...")
try:
    df = pd.read_csv('filtered_cleaned_enron.csv')
    sampled_df = df.sample(n=SAMPLES_TO_USE, random_state=42).copy()
except FileNotFoundError:
    print("Error: 'filtered_cleaned_enron.csv' not found.")
    exit()
except ValueError:
    print(f"Error: Dataset size is less than {SAMPLES_TO_USE}. Using all available data.")
    sampled_df = df.copy()

sampled_df['cleaned_body'] = sampled_df['cleaned_body'].astype(str)
sampled_df['subject'] = sampled_df['subject'].astype(str)

Loading and sampling 5000 records...


In [5]:
def format_enron_to_mistral(row):
    instruction = (
      SYS_PROMPT + "\n\n" +
      "lead_profile = " + json.dumps(placeholder_lead_profile, ensure_ascii=False) + "\n" +
      "style_profile = " + json.dumps(placeholder_style_profile, ensure_ascii=False)
    )

    # Note: Newlines are escaped for valid JSON string response
    response_json = {
        "subject": row['subject'],
        "body": row['cleaned_body'].replace('\n', '\\n').replace('\r', '')
    }
    response = json.dumps(response_json, ensure_ascii=False)

    # Apply the Mistral instruction template
    formatted_text = f"<s>[INST] {instruction} [/INST] {response} </s>"
    return {"text": formatted_text}

dataset = sampled_df.apply(format_enron_to_mistral, axis=1, result_type='expand')
hf_dataset = Dataset.from_pandas(dataset)
print(f"Dataset ready with {len(hf_dataset)} examples.")

Dataset ready with 5000 examples.


In [6]:
# --- 3. QLoRA and Model Configuration ---

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Loading model for QLoRA fine-tuning...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
))

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading model for QLoRA fine-tuning...


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:12<00:00,  4.06s/it]



In [8]:
# --- 4. OPTIMIZED TRAINING ARGUMENTS ---

hf_dataset = hf_dataset.map(lambda b: tokenizer(b["text"], truncation=True, max_length=768, padding="max_length"), batched=True)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    logging_steps=50,
    save_strategy="epoch",
    push_to_hub=False,
    fp16=False,
    bf16=True,
)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:01<00:00, 4004.19 examples/s]



In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=hf_dataset,
    peft_config=model.peft_config[list(model.peft_config.keys())[0]], # Get the LoRA config
    processing_class=tokenizer,
    args=training_args,
)

Truncating train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:00<00:00, 455516.41 examples/s]
Truncating train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:00<00:00, 455516.41 examples/s]


In [11]:
print("Starting fine-tuning...")
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting fine-tuning...


  return fn(*args, **kwargs)


Step,Training Loss
50,1.1825
100,0.7004
150,0.6854
200,0.6469
250,0.6227
300,0.6331


  return fn(*args, **kwargs)


TrainOutput(global_step=314, training_loss=0.7403470604283036, metrics={'train_runtime': 22244.0644, 'train_samples_per_second': 0.45, 'train_steps_per_second': 0.014, 'total_flos': 3.2959195250688e+17, 'train_loss': 0.7403470604283036, 'entropy': 0.6375446280218521, 'num_tokens': 7680000.0, 'mean_token_accuracy': 0.8600618704310004, 'epoch': 2.0})

In [12]:
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\nTraining complete. Model and tokenizer saved locally to {OUTPUT_DIR}.")


Training complete. Model and tokenizer saved locally to mistral-7b-enron-email-finetune.


In [21]:
## ðŸ§ª Model Testing Script (Sales/Product Pitch Scenario)

import json
import torch

# --- 1. Define Test Data ---
# Scenario: Pitching a new software product to a potential client
test_lead_profile = {
  "name": "Michael Chen",
  "role": "Head of Operations",
  "company": "LogisticsFast",
  # The objective here defines the sales goal
  "objective": "Propose a demo of our 'RouteOptimizer' software to help reduce their delivery times by 20%.",
}

test_style_profile = {
  "greeting": "Hi Michael,",
  "closing": "Cheers,\nDavid",
  "assertiveness": "high",  # Higher assertiveness for a sales pitch
  "formality": "medium"
}

# --- 2. Format Input for Model ---
if 'SYS_PROMPT' not in locals():
    SYS_PROMPT = """
You write short, polite B2B cold emails.

Your job:
- Read the lead_profile and style_profile I give you.
- Then write a ready-to-send cold email for that specific person.

Hard rules:
- Email should be less that 120 words.
- One clear call to action.
- No links in the first email.
- Use ONLY information from lead_profile. Do not invent facts.
- Use greeting and closing from style_profile.
- DO NOT output instructions, guidelines, or placeholders.
- DO NOT explain what you are doing.

Output format (VERY IMPORTANT):
Return ONLY a single JSON object with this exact shape:
{"subject": "<short subject line>", "body": "<full email body text>"}
"""

instruction = (
  SYS_PROMPT + "\n\n" +
  "lead_profile = " + json.dumps(test_lead_profile, ensure_ascii=False) + "\n" +
  "style_profile = " + json.dumps(test_style_profile, ensure_ascii=False)
)

formatted_input = f"<s>[INST] {instruction} [/INST]"

# --- 3. Generate Email ---
print("\n--- Generating Sales Email with Fine-tuned Model ---\n")

model_input = tokenizer(formatted_input, return_tensors="pt").to(model.device)

# --- FIX: Cast lm_head to float32 ---
# Ensures weights match the input tensor type to prevent RuntimeError
if hasattr(model, "base_model") and hasattr(model.base_model, "lm_head"):
    model.base_model.lm_head.to(torch.float32)
elif hasattr(model, "lm_head"):
    model.lm_head.to(torch.float32)

model.eval()

with torch.no_grad():
    generated_ids = model.generate(
        **model_input,
        max_new_tokens=600,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id
    )

full_output = tokenizer.decode(generated_ids[0], skip_special_tokens=False)

# --- 4. Post-process and Print ---
try:
    response_start_index = full_output.rfind('[/INST]')
    if response_start_index != -1:
        raw_json_output = full_output[response_start_index + len('[/INST]'):].strip()
        
        if raw_json_output.endswith('</s>'):
            raw_json_output = raw_json_output[:-4].strip()

        try:
            email_data = json.loads(raw_json_output)
            print("Successfully Parsed JSON Output:")
            print("Subject:", email_data.get('subject', 'N/A'))
            print("---")
            body_text = email_data.get('body', 'N/A').replace('\\n', '\n')
            print(body_text)
            print("---\n")
        except json.JSONDecodeError:
            print("Error: Failed to parse model output as JSON.")
            print("Raw Output:", raw_json_output)
    else:
        print("Error: Could not find the instruction closing tag '[/INST]' in the output.")
        print("Full Model Output:", full_output)
except Exception as e:
    print(f"An unexpected error occurred during processing: {e}")
    print("Full Model Output:", full_output)


--- Generating Sales Email with Fine-tuned Model ---

Error: Failed to parse model output as JSON.
Raw Output: {"subject": "Re: Route Optimization", "body": "Hi David,\n\t\tI'm interested in the Route Optimization. I'll have my team look at the demo and we'll get back to you with our thoughts.\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\
Error: Failed to parse model output as JSON.
Raw Output: {"subject": "Re: Route Optimization", "body": "Hi David,\n\t\tI'm interested in the Route Optimizati