In [18]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Pool of job titles that generate_synthetic_leads() samples from with
# random.choice (i.e. each title is equally likely).
job_titles = [
    "CEO",
    "CTO",
    "CFO",
    "Founder",
    "VP of Sales",
    "Director of Marketing",
    "Software Engineer",
    "Data Scientist",
    "Product Manager",
    "Intern",
    "Sales Associate",
    "HR Manager",
    "Senior Developer",
    "Junior Developer",
]

def generate_synthetic_leads(num=100, seed=None, titles=None):
    """Generate synthetic lead records with a rule-based quality label.

    Each lead is a dict with the keys ``job_title``, ``company_size``
    (integer in [5, 500]), ``email_valid`` (1 with weight 0.9, 0 with
    weight 0.1) and ``label``. The label is 1 when the title contains one
    of the substrings "CEO", "CTO", "Founder" or "VP" (case-sensitive), or
    when ``company_size`` exceeds 100; otherwise it is 0. ``email_valid``
    does not influence the label.

    Args:
        num: Number of leads to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the result is reproducible and the global ``random``
            state is left untouched. When ``None`` the global ``random``
            module is used, exactly as before.
        titles: Optional sequence of job titles to sample from. Defaults to
            the module-level ``job_titles`` list.

    Returns:
        A list of ``num`` lead dicts (empty when ``num`` is 0 or negative).
    """
    # The module itself exposes the same choice/randint/choices API as a
    # Random instance, so one code path serves both cases.
    rng = random if seed is None else random.Random(seed)
    pool = job_titles if titles is None else list(titles)
    senior_markers = ("CEO", "CTO", "Founder", "VP")

    leads = []
    for _ in range(num):
        title = rng.choice(pool)
        company_size = rng.randint(5, 500)
        email_valid = rng.choices([1, 0], weights=[0.9, 0.1])[0]
        is_senior = any(marker in title for marker in senior_markers)
        label = 1 if (is_senior or company_size > 100) else 0
        leads.append({
            "job_title": title,
            "company_size": company_size,
            "email_valid": email_valid,
            "label": label
        })
    return leads

# Seed the RNGs this notebook draws from so a fresh "Restart & Run All"
# reproduces the same synthetic leads (random) and the same weight
# initialisation / batch shuffling (torch).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

leads = generate_synthetic_leads(200)

# Location of the GloVe text file read by load_glove_embeddings(); the
# rest of the notebook assumes 50-dimensional vectors (input_dim = 50 + 2).
GLOVE_PATH = "glove.6B.50d.txt"

def load_glove_embeddings(glove_file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dict.

    Each non-blank line is split on whitespace; the first token is the word
    and the remaining tokens are parsed as the float32 vector components.

    Args:
        glove_file_path: Path to a UTF-8 text file with one word vector per
            line.

    Returns:
        Dict mapping each word to a 1-D ``numpy.float32`` array. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a stray newline at EOF): previously this
                # raised IndexError on values[0].
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
    return embeddings

# Parsing the full GloVe file is the slow step of this cell, so announce it
# before starting and report the vocabulary size once it is done.
print("Loading GloVe embeddings... This might take a minute.")
glove_embeddings = load_glove_embeddings(glove_file_path=GLOVE_PATH)
print(f"Loaded {len(glove_embeddings)} word vectors.")

def embed_job_title(title, embeddings, embedding_dim=50):
    """Embed a job title as the mean of its known word vectors.

    The title is lower-cased and split on whitespace; words missing from
    ``embeddings`` are ignored.

    Args:
        title: Job title text. ``None`` or an empty string is treated as a
            title with no words.
        embeddings: Mapping from lower-case word to 1-D vector.
        embedding_dim: Length of the zero vector returned when no word of
            the title is found in ``embeddings``.

    Returns:
        The element-wise mean of the matched vectors, or a float32 zero
        vector of length ``embedding_dim`` when nothing matched.
    """
    words = (title or "").lower().split()
    vectors = [embeddings[word] for word in words if word in embeddings]
    if not vectors:
        # float32 keeps the fallback consistent with the float32 vectors
        # produced by load_glove_embeddings (np.zeros defaults to float64).
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

class LeadDataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` tensors per lead.

    The feature vector is the job-title embedding followed by two scalars:
    ``company_size / 500`` and the ``email_valid`` flag. Features are built
    on every access; nothing is cached.
    """

    def __init__(self, leads, embeddings):
        """Keep references to the lead dicts and the word-vector mapping."""
        self.embeddings = embeddings
        self.leads = leads

    def __len__(self):
        """Number of leads in the dataset."""
        return len(self.leads)

    def __getitem__(self, idx):
        """Return the float32 feature tensor and float32 label for ``idx``."""
        record = self.leads[idx]
        title_vec = embed_job_title(record['job_title'], self.embeddings)
        # Two hand-crafted scalars appended after the title embedding.
        extras = [record['company_size'] / 500, record['email_valid']]
        features = np.concatenate([title_vec, extras])
        target = record['label']
        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(target, dtype=torch.float32),
        )

class LeadScoringModel(nn.Module):
    """Three-layer MLP (input_dim -> 64 -> 32 -> 1) with a sigmoid output.

    The output is a score in [0, 1], one value per input row.
    """

    def __init__(self, input_dim):
        """Build the layers.

        Args:
            input_dim: Number of features in each input row.
        """
        super().__init__()
        # Attribute names determine the state_dict keys used by
        # torch.save / load_state_dict, so they are kept as-is.
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of feature rows.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor of scores with the trailing size-1 dimension removed:
            shape ``(batch,)`` for ``(batch, input_dim)`` input, or a 0-d
            tensor for a single 1-D feature vector.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        # Squeeze only the last dim. A bare squeeze() would also drop the
        # batch dim when batch == 1, giving a 0-d output that no longer
        # matches a (1,)-shaped label tensor in the loss.
        return x.squeeze(-1)

def train_model(model, dataset, epochs=40, batch_size=16):
    """Train ``model`` in place with Adam (lr=0.001) and binary cross-entropy.

    Args:
        model: Module whose forward pass returns probabilities in [0, 1]
            (``nn.BCELoss`` is applied directly to its output).
        dataset: Map-style dataset yielding ``(features, label)`` pairs with
            float labels.
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size; batches are reshuffled every epoch.

    Returns:
        None. Every fifth epoch (0, 5, 10, ...) the loss summed over all
        mini-batches of that epoch is printed.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Ensure training-mode behaviour even if the model was previously
    # switched to eval mode.
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for features, labels in dataloader:
            optimizer.zero_grad()
            # Align the output with the label shape. A model that squeezes
            # all size-1 dims returns a 0-d tensor for a final batch of one
            # sample, which BCELoss rejects against a (1,)-shaped target.
            outputs = model(features).reshape(labels.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if epoch % 5 == 0:
            print(f"Epoch {epoch}, Loss: {total_loss:.4f}")


# Feature width: 50 title-embedding components plus the two scalar
# features (scaled company size, email-valid flag) built by LeadDataset.
input_dim = 50 + 2
dataset = LeadDataset(leads=leads, embeddings=glove_embeddings)
model = LeadScoringModel(input_dim=input_dim)

train_model(model=model, dataset=dataset)

def predict_lead_score(model, lead, embeddings):
    """Score one lead dict with the trained model.

    Builds the same feature layout as LeadDataset (title embedding, then
    ``company_size / 500``, then ``email_valid``) and runs a single-row
    forward pass without gradient tracking.

    Args:
        model: Trained scoring module; it is switched to eval mode.
        lead: Dict with ``job_title``, ``company_size`` and ``email_valid``.
        embeddings: Word-vector mapping passed to embed_job_title.

    Returns:
        The model output for this lead as a Python float.
    """
    model.eval()
    title_vec = embed_job_title(lead['job_title'], embeddings)
    numeric = [lead['company_size'] / 500, lead['email_valid']]
    row = np.concatenate([title_vec, numeric])
    # unsqueeze(0) adds the batch dimension the model is fed during training.
    batch = torch.tensor(row, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        return model(batch).item()

# Hand-written lead used as a quick sanity check of the trained model.
test_lead = dict(
    job_title="Director of Marketing",
    company_size=120,
    email_valid=1,
)

score = predict_lead_score(model=model, lead=test_lead, embeddings=glove_embeddings)
print(f"Predicted lead quality score for test lead: {score:.4f}")

# Persist only the weights (state_dict); the loading side must rebuild
# LeadScoringModel with the same input_dim before calling load_state_dict.
torch.save(model.state_dict(), "lead_scoring_model.pth")
print("Model saved to lead_scoring_model.pth")


Loading GloVe embeddings... This might take a minute.
Loaded 400000 word vectors.
Epoch 0, Loss: 7.6669
Epoch 5, Loss: 4.2316
Epoch 10, Loss: 3.7043
Epoch 15, Loss: 3.4316
Epoch 20, Loss: 2.7768
Epoch 25, Loss: 2.3559
Epoch 30, Loss: 1.7781
Epoch 35, Loss: 1.3321
Predicted lead quality score for test lead: 0.7057
Model saved to lead_scoring_model.pth


In [20]:
import os
import re
import torch
import numpy as np
import torch.nn as nn

class LeadScoringModel(nn.Module):
    """Three-layer MLP (input_dim -> 64 -> 32 -> 1) with a sigmoid output.

    The output is a score in [0, 1], one value per input row. The layer
    attribute names must match those of the model that produced the saved
    state_dict, otherwise load_state_dict cannot map the weights.
    """

    def __init__(self, input_dim):
        """Build the layers.

        Args:
            input_dim: Number of features in each input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of feature rows.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor of scores with the trailing size-1 dimension removed:
            shape ``(batch,)`` for ``(batch, input_dim)`` input, or a 0-d
            tensor for a single 1-D feature vector.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        # Squeeze only the last dim. A bare squeeze() would also drop the
        # batch dim when batch == 1, so the output rank would depend on the
        # batch size.
        return x.squeeze(-1)

# Relative path of the GloVe text file passed to load_glove_embeddings().
# NOTE(review): the file name suggests 50-dimensional vectors, which is what
# `input_dim = 50 + 2` below assumes — confirm if the file is swapped.
GLOVE_PATH = "glove.6B.50d.txt"

def load_glove_embeddings(glove_file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dict.

    Each non-blank line is split on whitespace; the first token is the word
    and the remaining tokens are parsed as the float32 vector components.

    Args:
        glove_file_path: Path to a UTF-8 text file with one word vector per
            line.

    Returns:
        Dict mapping each word to a 1-D ``numpy.float32`` array. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a stray newline at EOF): previously this
                # raised IndexError on values[0].
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
    return embeddings

def embed_job_title(title, embeddings, embedding_dim=50):
    """Embed a job title as the mean of its known word vectors.

    The title is lower-cased and split on whitespace; words missing from
    ``embeddings`` are ignored.

    Args:
        title: Job title text. ``None`` or an empty string is treated as a
            title with no words.
        embeddings: Mapping from lower-case word to 1-D vector.
        embedding_dim: Length of the zero vector returned when no word of
            the title is found in ``embeddings``.

    Returns:
        The element-wise mean of the matched vectors, or a float32 zero
        vector of length ``embedding_dim`` when nothing matched.
    """
    # `or ""` guards against a lead whose job_title key exists but is None.
    words = (title or "").lower().split()
    vectors = [embeddings[word] for word in words if word in embeddings]
    if not vectors:
        # float32 keeps the fallback consistent with the float32 vectors
        # produced by load_glove_embeddings (np.zeros defaults to float64).
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Must match the width used at training time: 50 title-embedding
# components plus company size and the email-valid flag.
input_dim = 50 + 2

model = LeadScoringModel(input_dim)
# map_location="cpu" makes loading independent of the device the weights
# were saved from. weights_only=True restricts unpickling to tensors and
# plain containers, so a tampered checkpoint cannot execute arbitrary code
# (requires a PyTorch version that supports the argument).
model.load_state_dict(
    torch.load("lead_scoring_model.pth", map_location="cpu", weights_only=True)
)
# Inference only from here on.
model.eval()

print("Model loaded successfully.")

# Demo input for the scoring pipeline. The last record reuses Bob's email
# address on purpose so the de-duplication step has something to remove.
sample_leads = [
    dict(
        name="Alice Johnson",
        email="alice.johnson@example.com",
        job_title="VP of Sales",
        company="Tech Innovators",
        company_size=150,
        domain="example.com",
    ),
    dict(
        name="Bob Smith",
        email="bob.smith@example.org",
        job_title="Software Engineer",
        company="CodeWorks",
        company_size=30,
        domain="example.org",
    ),
    dict(
        name="Carol Evans",
        email="carol.evans@startup.io",
        job_title="CEO",
        company="Startup.io",
        company_size=10,
        domain="startup.io",
    ),
    dict(
        name="Duplicate Bob",
        email="bob.smith@example.org",
        job_title="Senior Developer",
        company="CodeWorks",
        company_size=30,
        domain="example.org",
    ),
]

# Compiled once at definition time. No ^/$ anchors are needed because the
# check below uses fullmatch().
_EMAIL_PATTERN = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')


def is_valid_email(email: str) -> bool:
    """Return True when ``email`` looks like ``local@domain.tld``.

    This is a syntactic check only: a local part of letters, digits and
    ``._%+-``, an ``@``, a domain of letters, digits, dots and hyphens, and
    a final label of at least two letters.

    Args:
        email: Candidate address. Non-string values (e.g. ``None``) are
            reported as invalid instead of raising.

    Returns:
        True if the whole string matches the pattern, False otherwise.
    """
    if not isinstance(email, str):
        return False
    # fullmatch rejects a trailing newline, which `$` with re.match accepts.
    return _EMAIL_PATTERN.fullmatch(email) is not None

def score_lead_ai(lead: dict, model, embeddings) -> float:
    """Score a raw lead record with the trained model.

    Missing keys fall back to neutral values: an empty job title, a company
    size of 0 and an empty (hence invalid) email. The feature layout is the
    title embedding followed by ``company_size / 500`` and a 0/1 flag from
    is_valid_email.

    Args:
        lead: Lead dict; ``job_title``, ``company_size`` and ``email`` are
            read if present.
        model: Scoring module, called under ``torch.no_grad()``.
        embeddings: Word-vector mapping passed to embed_job_title.

    Returns:
        The model output for this lead as a Python float.
    """
    title_vec = embed_job_title(lead.get('job_title', ''), embeddings)
    size_scaled = lead.get('company_size', 0) / 500
    email_flag = int(is_valid_email(lead.get('email', '')))

    row = np.concatenate([title_vec, [size_scaled, email_flag]])
    # Add the leading batch dimension before the forward pass.
    batch = torch.tensor(row, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        return model(batch).item()

def deduplicate_leads(leads: list) -> list:
    """Keep the first lead seen for each email address.

    String addresses are compared after stripping surrounding whitespace
    and lower-casing, so ``"Bob@X.org"`` and ``"bob@x.org "`` count as the
    same lead. Leads with a missing or empty ``email`` are dropped. The
    lead dicts themselves are returned unmodified (same objects, original
    email spelling) in their original order.

    Args:
        leads: List of lead dicts.

    Returns:
        A new list with one lead per distinct normalised email.
    """
    seen_emails = set()
    unique_leads = []
    for lead in leads:
        email = lead.get('email')
        # Normalise only for comparison; the stored value is left untouched.
        key = email.strip().lower() if isinstance(email, str) else email
        if key and key not in seen_emails:
            seen_emails.add(key)
            unique_leads.append(lead)
    return unique_leads

def process_leads_ai(leads: list, model, embeddings) -> list:
    """De-duplicate, score and rank leads.

    Args:
        leads: List of raw lead dicts. They are not modified.
        model: Scoring module forwarded to score_lead_ai.
        embeddings: Word-vector mapping forwarded to score_lead_ai.

    Returns:
        A new list of lead dicts, each a shallow copy of the input lead
        with an added ``score`` key, sorted by score from highest to
        lowest.
    """
    # Copy each lead instead of writing 'score' into the caller's dicts, so
    # the input list stays pristine and the call has no side effects on it.
    scored = [
        {**lead, 'score': score_lead_ai(lead, model, embeddings)}
        for lead in deduplicate_leads(leads)
    ]
    return sorted(scored, key=lambda x: x['score'], reverse=True)


# Load the word vectors needed to embed job titles at scoring time.
print("Loading GloVe embeddings...")
glove_embeddings = load_glove_embeddings(glove_file_path=GLOVE_PATH)
print(f"Loaded {len(glove_embeddings)} GloVe vectors.")

# De-duplicate, score and rank the demo leads, then print one summary line
# per lead, highest score first.
processed_leads = process_leads_ai(
    leads=sample_leads,
    model=model,
    embeddings=glove_embeddings,
)
for lead in processed_leads:
    print(f"{lead['name']} ({lead['email']}): Score={lead['score']:.4f}, "
          f"Title={lead['job_title']}, Company={lead['company']}, Size={lead['company_size']}")


Model loaded successfully.
Loading GloVe embeddings...
Loaded 400000 GloVe vectors.
Alice Johnson (alice.johnson@example.com): Score=0.9992, Title=VP of Sales, Company=Tech Innovators, Size=150
Carol Evans (carol.evans@startup.io): Score=0.9536, Title=CEO, Company=Startup.io, Size=10
Bob Smith (bob.smith@example.org): Score=0.1310, Title=Software Engineer, Company=CodeWorks, Size=30
