# Lead Prioritization Model

This notebook builds a simple lead prioritization model to help sales teams focus on high-value opportunities.

## Objectives
1. Identify key factors that predict deal success
2. Build a scoring model to prioritize leads
3. Apply the model to current pipeline opportunities

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Figure appearance: whitegrid theme and a wide default canvas for every plot
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

In [None]:
# Read the master dataset and convert both date columns to datetimes
df = pd.read_csv('../data/master_dataset.csv')
for date_col in ('engage_date', 'close_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Quick sanity check: row count and how many deals sit in each stage
print(f"Total records: {len(df):,}")
stage_counts = df['deal_stage'].value_counts().to_dict()
print(f"Deal stages: {stage_counts}")

## 1. Prepare Training Data

We'll use historical closed deals (Won/Lost) to train our model.

In [None]:
# Training rows: opportunities whose stage is a final outcome (Won or Lost).
# .copy() gives an independent frame so later edits never touch `df`.
is_closed = df['deal_stage'].isin(['Won', 'Lost'])
closed_df = df.loc[is_closed].copy()

print(f"Closed deals for training: {len(closed_df):,}")
# Note: despite the label, this prints the count per `is_won` value, not a ratio
outcome_counts = closed_df['is_won'].value_counts().to_dict()
print(f"Win/Loss ratio: {outcome_counts}")

In [None]:
# Columns used as model inputs:
# sector, company_size, revenue_tier, product, regional_office and series
features = [
    'sector',
    'company_size',
    'revenue_tier',
    'product',
    'regional_office',
    'series',
]

# Count, per feature, how many closed deals have no value
print("Missing values in features:")
missing_per_feature = closed_df[features].isna().sum()
print(missing_per_feature)

In [None]:
# Keep only closed deals that have a value for every model feature
model_df = closed_df.dropna(subset=features, how='any')

n_model_rows = len(model_df)
print(f"Records after dropping missing: {n_model_rows:,}")

In [None]:
# Fit one LabelEncoder per feature; the fitted encoders are kept in `encoders`
# so new leads can be encoded with the same mapping when they are scored
encoders = {name: LabelEncoder().fit(model_df[name]) for name in features}

# Integer-coded feature matrix, one column per feature in `features` order
X = pd.DataFrame({
    name: encoders[name].transform(model_df[name])
    for name in features
})

# Target vector taken from the `is_won` column
y = model_df['is_won'].values

n_won = y.sum()
print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution: Won={n_won}, Lost={len(y)-n_won}")

## 2. Train the Model

In [None]:
# Hold out 20% of the rows for evaluation; stratify on y so both splits keep
# the same outcome balance, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split_label, split_X in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {len(split_X):,} samples")

In [None]:
# Random Forest settings: 100 trees, depth capped at 10, at least 20 samples
# needed to split a node, fixed seed, and all CPU cores (n_jobs=-1)
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 20,
    'random_state': 42,
    'n_jobs': -1,
}

# fit() returns the estimator itself, so construction and training chain
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
print("Model trained successfully!")

In [None]:
# Predictions on the held-out set: hard labels plus the probability column at
# index 1 (the second class in rf_model.classes_ - presumably "Won"; confirm)
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

print("=== MODEL PERFORMANCE ===")
test_auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC Score: {test_auc:.3f}")
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, target_names=['Lost', 'Won'])
print(report_text)

In [None]:
# Confusion matrix of actual (rows) vs. predicted (columns) test outcomes
cm = confusion_matrix(y_test, y_pred)
class_names = ['Lost', 'Won']

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, cmap='Blues')

# Label both axes with the class names
ax.set_xticks([0, 1])
ax.set_xticklabels(class_names)
ax.set_yticks([0, 1])
ax.set_yticklabels(class_names)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')

# Write each count into its cell; cells darker than half the maximum get
# white text so the number stays readable
for i, j in np.ndindex(2, 2):
    text_color = 'white' if cm[i, j] > cm.max()/2 else 'black'
    ax.text(j, i, cm[i, j], ha='center', va='center', fontsize=16,
            color=text_color)

fig.colorbar(im)
fig.tight_layout()
plt.savefig('../visuals/07_confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Feature Importance Analysis

In [None]:
# Pair each feature name with the forest's importance value, largest first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("=== FEATURE IMPORTANCE ===")
print(feature_importance.to_string(index=False))

In [None]:
# Horizontal bar chart of feature importance
fig, ax = plt.subplots(figsize=(10, 6))

# Ascending order puts the most important feature at the top of the chart
fi_sorted = feature_importance.sort_values('importance', ascending=True)
# One viridis shade per bar, sampled from the 0.3-0.9 part of the colormap
shade_positions = np.linspace(0.3, 0.9, len(fi_sorted))
colors = plt.cm.viridis(shade_positions)

ax.barh(fi_sorted['feature'], fi_sorted['importance'], color=colors)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Lead Prioritization', fontsize=14, fontweight='bold')

fig.tight_layout()
plt.savefig('../visuals/08_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Create Lead Scoring Function

In [None]:
def score_lead(sector, company_size, revenue_tier, product, regional_office, series):
    """Score a single lead with the trained model.

    Each argument is the raw category value for the feature of the same name.
    Values are encoded with the fitted encoders in the module-level
    ``encoders`` dict and passed to ``rf_model.predict_proba``.

    Returns:
        The probability in column 1 of ``predict_proba`` expressed as a
        percentage rounded to one decimal place, or ``None`` when any value
        cannot be encoded (a category the encoder was not fitted on, or a
        missing value).

    Raises:
        Any error that is not an encoding failure (for example an undefined
        or unfitted model) propagates, so genuine bugs are not mistaken for
        unscorable leads.
    """
    raw_values = {
        'sector': sector,
        'company_size': company_size,
        'revenue_tier': revenue_tier,
        'product': product,
        'regional_office': regional_office,
        'series': series,
    }

    try:
        encoded = {
            name: encoders[name].transform([value])[0]
            for name, value in raw_values.items()
        }
    except (ValueError, TypeError):
        # LabelEncoder.transform raises ValueError for labels it has not seen;
        # TypeError is caught as well because a missing value (NaN) mixed with
        # string classes may fail on comparison in some versions - TODO confirm.
        return None

    # Dict insertion order keeps the columns in the order used for training
    input_data = pd.DataFrame([encoded])
    probability = rf_model.predict_proba(input_data)[0, 1]
    return round(probability * 100, 1)

# Smoke-test the scoring function on one hand-written lead
test_score = score_lead(
    sector='technology',
    company_size='Enterprise',
    revenue_tier='Tier 4: $1B-$2.5B',
    product='GTXPro',
    regional_office='Central',
    series='GTX',
)
print(f"Test lead score: {test_score}%")

## 5. Score Current Pipeline

In [None]:
# Current pipeline = opportunities still in the Prospecting or Engaging stage.
# .copy() so the score columns added later never modify `df`.
open_stages = ['Prospecting', 'Engaging']
pipeline_df = df.loc[df['deal_stage'].isin(open_stages)].copy()
print(f"Current pipeline opportunities: {len(pipeline_df):,}")

In [None]:
# Score each opportunity in the pipeline
def score_row(row):
    """Score one pipeline row by passing its feature values to ``score_lead``.

    Kept for one-off scoring of a single row; the full pipeline is scored in
    one batch by ``score_pipeline`` below.
    """
    return score_lead(
        row['sector'],
        row['company_size'],
        row['revenue_tier'],
        row['product'],
        row['regional_office'],
        row['series']
    )


def score_pipeline(frame):
    """Score every row of ``frame`` with a single ``predict_proba`` call.

    Scoring row by row issues one model call per opportunity, and each call
    pays the dispatch overhead of a forest built with ``n_jobs=-1``. Encoding
    all rows first and predicting once avoids that repeated cost.

    Args:
        frame: DataFrame containing every column listed in ``features``.

    Returns:
        A float Series aligned to ``frame.index`` holding the probability in
        column 1 of ``predict_proba`` as a percentage rounded to one decimal.
        Rows with a missing value or a category the encoders were not fitted
        on are NaN (unscored).
    """
    encoded = pd.DataFrame(index=frame.index)
    for name in features:
        # A LabelEncoder's code for a label is its position in classes_;
        # labels absent from the mapping (including NaN) become NaN
        code_by_label = {
            label: code for code, label in enumerate(encoders[name].classes_)
        }
        encoded[name] = frame[name].map(code_by_label)

    scores = pd.Series(np.nan, index=frame.index, dtype='float64')
    scorable = encoded.notna().all(axis=1)
    if scorable.any():
        # Cast back to integers: the NaN placeholders made the columns float
        win_prob = rf_model.predict_proba(encoded.loc[scorable].astype('int64'))[:, 1]
        scores.loc[scorable] = np.round(win_prob * 100, 1)
    return scores


pipeline_df['lead_score'] = score_pipeline(pipeline_df)

# Check for scoring failures
scored_count = pipeline_df['lead_score'].notna().sum()
print(f"Successfully scored: {scored_count:,} / {len(pipeline_df):,}")

In [None]:
# Create priority tiers
def assign_priority(score, high_threshold=70, medium_threshold=50):
    """Map a lead score to a priority tier label.

    Args:
        score: Lead score; ``None`` or NaN means the lead was not scored.
        high_threshold: Minimum score for 'High Priority' (default 70).
        medium_threshold: Minimum score for 'Medium Priority' (default 50).

    Returns:
        'Unscored' for a missing score, otherwise 'High Priority',
        'Medium Priority' or 'Low Priority'.

    Raises:
        ValueError: If ``high_threshold`` is below ``medium_threshold``,
            which would make the medium tier unreachable.
    """
    if high_threshold < medium_threshold:
        raise ValueError("high_threshold must be >= medium_threshold")

    # Missing scores are checked first: comparisons with NaN are always False
    # and would otherwise fall through to 'Low Priority'
    if pd.isna(score):
        return 'Unscored'
    if score >= high_threshold:
        return 'High Priority'
    if score >= medium_threshold:
        return 'Medium Priority'
    return 'Low Priority'

# Label every pipeline opportunity with the tier matching its lead score
pipeline_df['priority'] = pipeline_df['lead_score'].map(assign_priority)

print("=== PIPELINE PRIORITY DISTRIBUTION ===")
tier_distribution = pipeline_df['priority'].value_counts()
print(tier_distribution)

In [None]:
# The 20 highest-scoring opportunities among those that received a score
report_cols = [
    'opportunity_id', 'account', 'product', 'deal_stage', 'sector',
    'company_size', 'sales_agent', 'lead_score', 'priority',
]
has_score = pipeline_df['lead_score'].notna()
top_opportunities = (
    pipeline_df.loc[has_score]
    .nlargest(20, 'lead_score')
    .loc[:, report_cols]
)

print("=== TOP 20 PRIORITY OPPORTUNITIES ===")
print(top_opportunities.to_string(index=False))

In [None]:
# Two panels: opportunities per priority tier (left), score histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_tiers, ax_scores = axes

# Left panel - tiers in a fixed order; a tier with no rows is drawn as 0
priority_order = ['High Priority', 'Medium Priority', 'Low Priority', 'Unscored']
priority_counts = pipeline_df['priority'].value_counts().reindex(priority_order).fillna(0)
colors = ['#27ae60', '#f39c12', '#e74c3c', '#95a5a6']
ax_tiers.bar(priority_order, priority_counts, color=colors)
ax_tiers.set_ylabel('Number of Opportunities')
ax_tiers.set_title('Pipeline by Priority Tier', fontsize=14, fontweight='bold')
ax_tiers.tick_params(axis='x', rotation=45)

# Right panel - distribution of scores with the two tier cut-offs marked
scored_pipeline = pipeline_df[pipeline_df['lead_score'].notna()]
ax_scores.hist(scored_pipeline['lead_score'], bins=20, color='#3498db', edgecolor='white')
cutoff_lines = [
    (70, '#27ae60', 'High Priority (70+)'),
    (50, '#f39c12', 'Medium Priority (50+)'),
]
for cutoff, line_color, line_label in cutoff_lines:
    ax_scores.axvline(x=cutoff, color=line_color, linestyle='--', label=line_label)
ax_scores.set_xlabel('Lead Score')
ax_scores.set_ylabel('Frequency')
ax_scores.set_title('Lead Score Distribution', fontsize=14, fontweight='bold')
ax_scores.legend()

fig.tight_layout()
plt.savefig('../visuals/09_pipeline_priority.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Save Prioritized Pipeline

In [None]:
# Columns written to the export: identifiers, deal context, score and tier
output_cols = [
    'opportunity_id',
    'account',
    'product',
    'deal_stage',
    'sales_agent',
    'sector',
    'company_size',
    'revenue_tier',
    'regional_office',
    'lead_score',
    'priority',
]

# Highest scores first, written without the DataFrame index
pipeline_output = pipeline_df.loc[:, output_cols].sort_values(by='lead_score', ascending=False)
pipeline_output.to_csv('../data/prioritized_pipeline.csv', index=False)

print("Prioritized pipeline saved to: data/prioritized_pipeline.csv")

## 7. Model Insights & Recommendations

In [None]:
# Compute the headline numbers once so every section reports the same values
test_auc = roc_auc_score(y_test, y_prob)
total_pipeline = len(pipeline_df)
tier_counts = pipeline_df['priority'].value_counts()
top_feature = feature_importance.iloc[0]['feature']

# Word the performance verdict from the measured AUC instead of always claiming
# the model discriminates. 0.5 is chance level; 0.7 is a commonly used
# rule-of-thumb floor for acceptable discrimination - adjust to taste.
if test_auc >= 0.7:
    auc_verdict = "The model can distinguish between likely wins and losses"
elif test_auc > 0.5:
    auc_verdict = "The model separates wins from losses only weakly - treat scores as a rough guide"
else:
    auc_verdict = "The model does not outperform random ranking - do not rely on these scores"

divider = "=" * 60
print(divider)
print("LEAD PRIORITIZATION MODEL - INSIGHTS & RECOMMENDATIONS")
print(divider)

print("\n1. MODEL PERFORMANCE")
print(f"   - ROC-AUC Score: {test_auc:.3f}")
print(f"   - {auc_verdict}")

print("\n2. KEY PREDICTIVE FACTORS (in order of importance)")
for _, row in feature_importance.iterrows():
    print(f"   - {row['feature']}: {row['importance']*100:.1f}%")

print("\n3. PIPELINE SUMMARY")
# Tier labels are listed here so this cell does not depend on the plotting cell
for tier in ['High Priority', 'Medium Priority', 'Low Priority', 'Unscored']:
    count = int(tier_counts.get(tier, 0))
    # Guard against an empty pipeline, which would otherwise divide by zero
    pct = count / total_pipeline * 100 if total_pipeline else 0.0
    print(f"   - {tier}: {count:,} opportunities ({pct:.1f}%)")

high_priority_count = int(tier_counts.get('High Priority', 0))
# Column names use underscores; swap them for spaces before title-casing so
# e.g. 'regional_office' reads 'Regional Office' rather than 'Regional_Office'
top_feature_label = top_feature.replace('_', ' ').title()

print("\n4. RECOMMENDATIONS")
print(f"   - Focus sales efforts on {high_priority_count:,} high-priority opportunities")
print(f"   - {top_feature_label} is the strongest predictor - segment accordingly")
print("   - Consider additional nurturing for medium-priority leads")
print("   - Review low-priority leads for qualification issues")

print("\n" + divider)