# In [1]:
import pandas as pd

# In [19]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk
import numpy as np

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize() loads on newer
# NLTK releases (it replaced the pickled 'punkt' models); 'punkt' is kept for
# older releases. With its default arguments nltk.download() reports an
# unknown resource and returns False instead of raising, so requesting both
# names is safe on either side of that change.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the email dataset
emails = pd.read_csv('emails.csv')  # Ensure the CSV has a column 'EmailContent'

# Preprocessing tools shared by preprocess_text()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one email body into a space-joined string of lemmas.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is removed, the remainder is tokenized, English stop words
    are dropped and each surviving token is lemmatized.

    Args:
        text: Raw email content. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty email.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string.
    """
    # Missing cells arrive from DataFrame.apply as NaN, which has no
    # .lower(); treat them as empty text instead of crashing the pipeline.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    tokens = word_tokenize(letters_only)
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmas)

# Clean every email body and store the result in a new 'Processed' column
emails['Processed'] = emails['EmailContent'].apply(preprocess_text)

# Bag-of-words counts over the cleaned text
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)
email_vectors = vectorizer.fit_transform(emails['Processed'])

# Train a 5-topic LDA model on the counts (fit() returns the estimator itself)
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
).fit(email_vectors)

# Vocabulary terms, indexed the same way as the columns of email_vectors;
# used to turn topic weights back into words
feature_names = vectorizer.get_feature_names_out()

def get_topic_for_email(email, lda_model, vectorizer, n_top_words=10):
    """Return the dominant LDA topic of an email and that topic's top words.

    Args:
        email: Email text, preprocessed the same way as the training corpus.
        lda_model: Fitted topic model exposing ``transform`` and
            ``components_``.
        vectorizer: Fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``; it must be the one ``lda_model`` was
            trained with so that column indices line up.
        n_top_words: How many of the topic's highest-weighted words to
            return. Defaults to 10.

    Returns:
        A ``(topic_idx, topic_words)`` tuple: the index of the highest-scoring
        topic for this email, and that topic's words ordered from highest to
        lowest weight.
    """
    email_vector = vectorizer.transform([email])
    topic_distribution = lda_model.transform(email_vector)
    # A single document is transformed, so the flat argmax over the
    # one-row result is the topic index.
    topic_idx = np.argmax(topic_distribution)

    # Read the vocabulary from the vectorizer that was passed in rather than
    # from a module-level global, so the words always match the columns.
    vocabulary = vectorizer.get_feature_names_out()
    ranked_columns = lda_model.components_[topic_idx].argsort()[::-1]
    topic_words = [vocabulary[i] for i in ranked_columns[:n_top_words]]
    return topic_idx, topic_words

# Load GPT-2 model and tokenizer
# The same checkpoint name is handed to both from_pretrained() calls so the
# tokenizer and the model come from one checkpoint.
model_name = "gpt2"
# `tokenizer` and `model` are module-level globals: generate_email_with_gpt2()
# reads them directly instead of taking them as parameters.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def generate_email_with_gpt2(prompt, max_length=200):
    """Continue ``prompt`` with the module-level GPT-2 model.

    Args:
        prompt: Text the model should continue.
        max_length: Value passed to ``model.generate`` as ``max_length``
            (the prompt's own tokens count towards it). Defaults to 200.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask. It has to be passed explicitly because the pad token is set to
    # the EOS token below, so generate() cannot infer the mask on its own.
    encoded = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Integrated function to generate email
def generate_email(email_text, company_name, job_title):
    """Generate a networking email seeded with the email's dominant topic.

    The dominant LDA topic of ``email_text`` is looked up with the
    module-level ``lda_model`` and ``vectorizer``; its five highest-weighted
    words are inserted into a prompt template, and the prompt is handed to
    ``generate_email_with_gpt2`` with ``max_length=300``.

    Args:
        email_text: Preprocessed email text used to pick the topic.
        company_name: Company name inserted into the prompt.
        job_title: Recipient's job title inserted into the prompt.

    Returns:
        The text returned by ``generate_email_with_gpt2`` for the prompt.
    """
    import textwrap  # function-scope import keeps this block self-contained

    # Only the topic's words are needed; the topic index itself is unused.
    _, topic_words = get_topic_for_email(email_text, lda_model, vectorizer)
    topic_summary = ", ".join(topic_words[:5])  # Use top 5 words for brevity

    # Dedent the template BEFORE substituting values so the source-code
    # indentation never reaches the model, and so substituted values cannot
    # influence how much indentation is stripped.
    prompt_template = textwrap.dedent("""\
        Write a professional networking email:

        Subject: Connecting with {company_name}

        Hi [Recipient Name],

        I am reaching out to express my admiration for your role as a {job_title} at {company_name}.
        It's fascinating how {company_name} is driving innovation in areas like {topic_summary}.

        I'd love to hear more about your experience and insights. Please let me know if you’d be open to a quick chat.

        Best regards,
        [Your Name]
        [Your Contact Information]
        """)
    prompt = prompt_template.format(
        company_name=company_name,
        job_title=job_title,
        topic_summary=topic_summary,
    )

    # Generate email using GPT-2
    return generate_email_with_gpt2(prompt, max_length=300)

# Demo run: feed the first preprocessed email from the CSV through the pipeline
company_name = "ABC Corp"
job_title = "Investment Analyst"
sample_email = emails['Processed'].iloc[0]  # first row of the cleaned column

generated_email = generate_email(
    email_text=sample_email,
    company_name=company_name,
    job_title=job_title,
)
print(generated_email)

# Output of the cell above when the `transformers` package is not installed:
# ModuleNotFoundError: No module named 'transformers'