In [1]:
import numpy as np

# Plain-text email skeleton with str.format-style placeholders:
# {subject}, {recipient}, {body} and {sender}.
# NOTE(review): no code in the visible cells references `template` — confirm
# whether it is still needed.
template = """
Subject: {subject}

Dear {recipient},

{body}

Best regards,
{sender}
"""

# Email categories used to build generation prompts.
#
# Each category maps to a dict with:
#   "senders" - sender roles substituted for {sender} in a prompt template
#   "topics"  - regular topics substituted for {topic}
#   "prompts" - prompt templates filled in with str.format(sender=..., topic=...)
# "academic_collaboration" additionally has "sensitive_topics"; it is the only
# category with that key, and generate_topic only reads it for that category.
categories = {
    "student_inquiry": {
        "senders": ["undergraduate student", "graduate student"],
        "topics": [
            "academic progress",
            "course material",
            "assignment deadline",
            "grade inquiry",
            "financial aid",
            "course registration",
            "exam schedule",
            "scholarship information",
            "internship opportunities",
            "campus resources"
        ],
        "prompts": [
            "Write an email from a {sender} to a university HOD asking about {topic}.",
            "Compose an email to a university HOD where a {sender} is inquiring about {topic}.",
            "Generate an email to a university HOD from a {sender} requesting information on {topic}.",
            "Create an email to a university HOD from a {sender} seeking clarification on {topic}.",
            "Draft an email to a university HOD from a {sender} regarding {topic}."
        ],
    },
    "academic_collaboration": {
        "senders": ["researcher", "professor"],
        "topics": [
            "shared research data",
            "research facilities access",
            "cooperation opportunities",
            "co-authoring a paper",
            "organizing a conference",
            "guest lecture invitation",
            "interdisciplinary project",
            "academic exchange program"
        ],
        # Topics drawn instead of "topics" when a prompt is flagged sensitive.
        "sensitive_topics": [ 
                        "joint grant proposal",
                        "confidential government project",
                        "classified research data",
                        "patent application",
                        "proprietary information",
                        "memorandum of understanding",
                        "intellectual property rights",
        ],
        # Only the first template contains {sender}; for the others the chosen
        # sender is passed to str.format but does not appear in the prompt.
        "prompts": [
            "Write an email from a {sender} to a university HOD proposing collaboration on {topic}.",
            "Compose an email inviting a university HOD to participate in {topic}.",
            "Generate an email to a university HOD requesting access to {topic}.",
            "Create an email to a university HOD suggesting a partnership for {topic}.",
            "Draft an email to a university HOD inquiring about the possibility of {topic}."
        ]
    },
    "corporate_inquiry": {
        "senders": ["HR manager", "CEO", "company representative", "recruiter"],
        "topics": [
            "internship program",
            "placement inquiries",
            "industry-academia partnership",
            "campus recruitment",
            "skill development workshop",
            "company presentation opportunity",
            "alumni networking event",
            "corporate training program",
            "research commercialization"
        ],
        "prompts": [
            "Write an email from a {sender} to a university HOD inquiring about {topic}.",
            "Compose an email from a {sender} to a university HOD proposing a {topic} initiative.",
            "Generate an email from a {sender} to a university HOD seeking information on {topic}.",
            "Create an email sent by a {sender} to a university HOD requesting details about the university's {topic}.",
            "Draft an email sent by a {sender} to a university HOD exploring opportunities for {topic}."
        ]
    }
}

def generate_topic(category, sensitive):
    """Pick a random topic for ``category``.

    The category's "sensitive_topics" list is used only when ``category`` is
    'academic_collaboration' and ``sensitive`` is truthy; every other
    combination draws from the category's regular "topics" list.

    Args:
        category: key into the module-level ``categories`` dict.
        sensitive: flag selecting the sensitive topic pool (honoured only
            for 'academic_collaboration').

    Returns:
        The element chosen by ``np.random.choice`` from the selected list.

    Raises:
        KeyError: if ``category`` is not a key of ``categories``.
    """
    wants_sensitive = category == 'academic_collaboration' and sensitive
    pool_key = "sensitive_topics" if wants_sensitive else "topics"
    return np.random.choice(categories[category][pool_key])

def generate_prompts(num_batches):
    """Build ``num_batches`` random (prompt, category, sensitive) tuples.

    One uniform draw in [0, 3) per batch decides both labels:
      * draw < 1       -> 'student_inquiry'
      * 1 <= draw < 2  -> 'academic_collaboration'
      * otherwise      -> 'corporate_inquiry'
      * sensitive is True unless draw < 1.8, so every corporate_inquiry
        batch and the academic_collaboration draws at or above 1.8 are
        flagged.

    For each batch a sender, a topic (via ``generate_topic``) and a prompt
    template are then picked at random, in that order, and the template is
    formatted with the sender and topic.

    Returns:
        list of ``(prompt_text, category, sensitive)`` tuples; ``category``
        and ``sensitive`` are the NumPy scalars taken from the label arrays.
    """
    draws = np.random.uniform(0, 3, num_batches)
    cats = np.where(
        draws < 1,
        'student_inquiry',
        np.where(draws < 2, 'academic_collaboration', 'corporate_inquiry'),
    )
    sensitives = np.logical_not(draws < 1.8)

    def build_entry(category, sensitive):
        # Keep the random draws in this order: sender, topic, template.
        spec = categories[category]
        sender = np.random.choice(spec["senders"])
        topic = generate_topic(category, sensitive)
        chosen_template = np.random.choice(spec["prompts"])
        return (chosen_template.format(sender=sender, topic=topic), category, sensitive)

    return [build_entry(category, sensitive) for category, sensitive in zip(cats, sensitives)]


In [8]:
from groq import Groq
import os
# Never commit API keys to a notebook. The key is taken from the GROQ_API_KEY
# environment variable; if it is not set, prompt for it without echoing.
# NOTE(review): a key was previously hard-coded here — treat it as leaked and
# revoke/rotate it.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass
    os.environ["GROQ_API_KEY"] = getpass("Enter GROQ_API_KEY: ")
client = Groq()
def generate_batch(prompt):
    """Request one batch of synthetic emails from the chat-completions API.

    Sends ``prompt`` as the user message through the module-level ``client``,
    together with a fixed system prompt that asks for 10 emails, each wrapped
    in [START EMAIL] / [END EMAIL] tags with SUBJECT: and BODY: fields.

    Args:
        prompt: the user-message text describing sender and topic.

    Returns:
        The message content of the first completion choice, with leading and
        trailing whitespace stripped.
    """
    # The literal below is sent verbatim (including its indentation).
    system_prompt = '''You are an AI assistant that generates email content. Provide only the subject line and body as requested and do not use placeholders 
         instead use realistic possible values (try to randomize names more), without any additional text or explanations. Students generally send mails regarding publiclically available 
         information (on university repositories) while researchers/corporates may send containing potentially sensitive information depending on the topic mentioned in the prompt. 
         Adapt accordingly depending on the prompt. Students generally seek information regarding academic progress, course material and other queries. Corporates may seek information 
         regarding internship or placement drives, industry - academia collaboration, training workshops etc. Researchers may seek
         collaboration opportunities, access to research facilities, co-authoring papers etc. Adhere to the sender and topic mentioned in the prompt.
         Each email should have the following structure:
        [START EMAIL]
        SUBJECT: <email subject>
        BODY: <email body of around 90 words>
        [END EMAIL]
        Ensure that each email is separated by the [START EMAIL] and [END EMAIL] tags.
        Always generate 10 such emails.   
        '''
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=chat_messages,
        max_tokens=2000,
        temperature=0.8,
    )
    first_choice = response.choices[0]
    return first_choice.message.content.strip()

def parse_mails(text):
    """Extract subject/body pairs from tagged model output.

    Every span between a ``[START EMAIL]`` tag and the next ``[END EMAIL]``
    tag is inspected. A span contributes an entry only when it contains both
    a ``SUBJECT: `` line and a ``BODY:`` section; the subject is the rest of
    that line and the body is everything after ``BODY:`` up to the end of the
    span, both stripped of surrounding whitespace.

    Args:
        text: raw text returned by the model.

    Returns:
        list of ``{"subject": str, "body": str}`` dicts, in order of
        appearance; empty when nothing matches.
    """
    block_re = re.compile(r'\[START EMAIL\](.*?)\[END EMAIL\]', re.DOTALL)
    results = []
    for block in block_re.finditer(text):
        chunk = block.group(1)
        subject_hit = re.search(r'SUBJECT: (.+)', chunk)
        body_hit = re.search(r'BODY:\s*([\s\S]+)', chunk)
        # Drop spans that are missing either field.
        if not (subject_hit and body_hit):
            continue
        results.append({
            "subject": subject_hit.group(1).strip(),
            "body": body_hit.group(1).strip(),
        })
    return results


In [11]:
import pandas as pd
import re
import time
def retry(prompt, max_retries=8, base_delay=15):
    """Call ``generate_batch(prompt)``, retrying with exponential backoff.

    The first call is made immediately. After each failure the function
    prints a notice, sleeps ``base_delay * 2**k`` seconds (k = 0, 1, ...)
    and tries again, up to ``max_retries`` additional attempts. With the
    defaults the total wait before giving up is 15 * (2**8 - 1) = 3825 s.

    Only ``Exception`` subclasses are retried, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate and the notebook cell can still be interrupted.

    Args:
        prompt: user prompt forwarded to ``generate_batch``.
        max_retries: number of retries after the initial attempt.
        base_delay: delay in seconds before the first retry; doubled each time.

    Returns:
        The text returned by ``generate_batch``, or ``None`` if every attempt
        failed (the last error is printed in that case).
    """
    try:
        return generate_batch(prompt)
    except Exception as exc:
        # `exc` is unbound once the except block ends, so keep a reference.
        last_error = exc

    for attempt in range(max_retries):
        print(f"Attempt {attempt+1} failed. Retrying...")
        time.sleep(base_delay * 2 ** attempt)
        try:
            return generate_batch(prompt)
        except Exception as exc:
            last_error = exc

    # Keep the historical "return None on exhaustion" contract, but say why.
    print(f"Giving up after {max_retries + 1} attempts: {last_error!r}")
    return None

def generate_dataset(num_batches):
    """Generate a labelled DataFrame of synthetic emails.

    Builds ``num_batches`` prompts with ``generate_prompts``, sends each one
    through ``retry`` and parses the reply with ``parse_mails``. Every parsed
    email becomes one row labelled with the category and sensitivity flag of
    the prompt that produced it.

    A batch for which ``retry`` returns ``None`` (all attempts failed) is
    skipped with a printed notice instead of crashing the whole run.

    Args:
        num_batches: number of prompts (API requests) to issue.

    Returns:
        pandas.DataFrame with columns 'subject', 'body', 'category',
        'sensitive'. Rows are collected in a list and the frame is built
        once, so column dtypes are inferred from the values.
    """
    columns = ['subject', 'body', 'category', 'sensitive']
    records = []
    for prompt, category, sensitive in generate_prompts(num_batches):
        generated_text = retry(prompt)
        if generated_text is None:
            # No response after all retries: parse_mails would raise on None.
            print(f"Skipping batch, no response for prompt: {prompt!r}")
            continue
        for email in parse_mails(generated_text):
            records.append({
                'subject': email['subject'],
                'body': email['body'],
                'category': category,
                'sensitive': sensitive,
            })
    # Passing `columns` keeps the expected schema even when no rows were parsed.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Issue 230 prompt batches. The system prompt in generate_batch asks for 10
# emails per batch, but only emails that parse correctly become rows, so the
# final row count can differ. Each batch is a remote API call and failures
# are retried with long sleeps, so this cell can take a while.
synth = generate_dataset(230)
# Preview the first rows.
synth.head()

In [19]:
# Write the dataset relative to the notebook's working directory rather than
# a user-specific absolute path, so the cell runs on any machine.
path = 'synth_data.csv'
# index=False: the positional index carries no information.
synth.to_csv(path, index=False)

In [17]:
# Summary of the dataset: total row count, then the fraction of rows per
# category and per sensitivity flag (counts divided by the total row count).
# NOTE(review): `final` is not defined in any cell visible here, and this cell's
# execution count (17) precedes the export cell's (19) — confirm where `final`
# is created so the notebook survives Restart & Run All.
print(len(final))
print(final['category'].value_counts()/len(final))
print(final['sensitive'].value_counts()/len(final))

2551
category
student_inquiry           0.352411
academic_collaboration    0.328891
corporate_inquiry         0.318699
Name: count, dtype: float64
sensitive
False    0.662485
True     0.337515
Name: count, dtype: float64
