In [11]:
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the user records from CSV; the path is relative to the current working directory.
# Later cells read the columns signup_date, conversion_date, user_id, converted and lead_source.
df_users = pd.read_csv('../data/random-users.csv')

In [12]:
def prepare_lead_scoring_data(df, train_end_date='2023-06-30', val_end_date='2023-09-30'):
    """
    Split leads chronologically by signup date into train / validation / test.

    Parameters:
    df: pandas DataFrame with 'signup_date' and 'conversion_date' columns
    train_end_date: first date that no longer belongs to the training split
    val_end_date: first date that no longer belongs to the validation split

    Returns:
    (train, val, test) DataFrames, with both date columns parsed to datetime.
    The input frame is left untouched.
    """
    # Parse both date columns on a fresh frame so the caller's data is not modified
    leads = df.assign(
        signup_date=pd.to_datetime(df['signup_date']),
        conversion_date=pd.to_datetime(df['conversion_date']),
    )
    signup = leads['signup_date']

    # Half-open windows: [.., train_end), [train_end, val_end), [val_end, ..)
    in_train = signup < train_end_date
    in_val = (signup >= train_end_date) & (signup < val_end_date)
    in_test = signup >= val_end_date

    return tuple(leads[mask] for mask in (in_train, in_val, in_test))

In [19]:
# Chronological split on signup_date:
#   train: before 2024-03-01
#   val:   from 2024-03-01 up to (not including) 2024-03-15
#   test:  2024-03-15 onward
df_train, df_val, df_test = prepare_lead_scoring_data(
    df_users,
    train_end_date='2024-03-01',
    val_end_date='2024-03-15'
)

In [34]:
from sklearn.feature_extraction import DictVectorizer

def prepare_features(df, dv=None, fit=True, multi_value_columns=None, multi_value_sep=','):
    """
    Prepare features using DictVectorizer, excluding ID, target and date columns

    Parameters:
    df: pandas DataFrame
    dv: DictVectorizer instance (optional); a new sparse one is created when omitted
    fit: boolean, whether to fit the DictVectorizer or just transform
    multi_value_columns: optional list of columns whose string values hold several
        items joined by `multi_value_sep` (e.g. 'A, B, C'). Each item becomes its own
        indicator key '<column>=<item>' with value 1, instead of the whole string being
        treated as one category. Default None leaves every column as-is.
    multi_value_sep: delimiter used to split the values of `multi_value_columns`

    Returns:
    X: feature matrix produced by the vectorizer
    dv: the DictVectorizer that produced X
    """
    # Identify columns to exclude
    date_columns = ['signup_date', 'conversion_date']
    exclude_columns = ['user_id', 'converted'] + date_columns

    # drop() returns a new frame, so the caller's DataFrame is never modified.
    # errors='ignore' lets unlabelled data (no 'converted' / 'conversion_date'
    # columns) go through the same preparation as the training data.
    feature_dict = (
        df.drop(columns=exclude_columns, errors='ignore')
        .to_dict(orient='records')
    )

    # Optionally expand delimiter-separated columns into one indicator per item
    for column in multi_value_columns or ():
        for record in feature_dict:
            raw_value = record.pop(column, None)
            if not isinstance(raw_value, str):
                # Missing / non-string value: contributes no indicator keys
                continue
            for item in raw_value.split(multi_value_sep):
                item = item.strip()
                if item:
                    record[f'{column}={item}'] = 1

    # Initialize DictVectorizer if not provided
    if dv is None:
        dv = DictVectorizer(sparse=True)

    # Fit on this data, or reuse the vocabulary learned earlier
    X = dv.fit_transform(feature_dict) if fit else dv.transform(feature_dict)

    return X, dv

# Build the feature matrices: the vectorizer is fitted on the training split
# only and then reused, unchanged, for validation and test
X_train, dv = prepare_features(df_train, fit=True)
X_val, _ = prepare_features(df_val, dv=dv, fit=False)
X_test, _ = prepare_features(df_test, dv=dv, fit=False)

# Target arrays, one per split, taken from the 'converted' column
y_train, y_val, y_test = (
    split['converted'].values for split in (df_train, df_val, df_test)
)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: logistic regression fitted on the training split
model = LogisticRegression(max_iter=1000, random_state=1).fit(X_train, y_train)

# Positive-class probability is column 1 of predict_proba
train_pred, val_pred = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

# ROC AUC on the data the model was fitted on vs. the held-out validation split
train_auc = roc_auc_score(y_train, train_pred)
val_auc = roc_auc_score(y_val, val_pred)

print(f'Train AUC: {train_auc:.3f}')
print(f'Validation AUC: {val_auc:.3f}')

# Rank features by the absolute size of their coefficient
feature_importance = (
    pd.DataFrame({
        'feature': dv.feature_names_,
        'importance': np.abs(model.coef_[0])
    })
    .sort_values('importance', ascending=False)
)

print('\nTop 10 most important features:')
print(feature_importance.head(10))

Train AUC: 0.960
Validation AUC: 0.488

Top 10 most important features:
                                                feature  importance
469   required_features=Compliance Training, Assessm...    1.170400
1221  required_features=Virtual Classroom, Complianc...    1.129081
448   required_features=Compliance Training, Analyti...    1.126492
592   required_features=Content Library, Analytics &...    1.118366
156   required_features=Analytics & Reporting, Mobil...    0.839681
371   required_features=Certification Management, Co...    0.816484
1038  required_features=Mobile Learning, Analytics &...    0.815070
543   required_features=Compliance Training, Integra...    0.805830
146   required_features=Analytics & Reporting, Mobil...    0.804335
284   required_features=Assessment Tools, Virtual Cl...    0.798030


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Hyperparameter grid: inverse regularization strength, solver and penalty type
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

# Try each combination and track results
results = []

for C in param_grid['C']:
    for solver in param_grid['solver']:
        for penalty in param_grid['penalty']:
            # Every combination in the grid is valid: both 'liblinear' and 'saga'
            # accept 'l1' and 'l2', so nothing needs to be skipped here.
            print(C, solver, penalty)
            # Train model
            model = LogisticRegression(
                C=C,
                solver=solver,
                penalty=penalty,
                random_state=1,
                max_iter=1000
            )
            model.fit(X_train, y_train)

            # Positive-class probabilities for both splits
            train_pred = model.predict_proba(X_train)[:, 1]
            val_pred = model.predict_proba(X_val)[:, 1]

            # Calculate AUC
            train_auc = roc_auc_score(y_train, train_pred)
            val_auc = roc_auc_score(y_val, val_pred)
            print(train_auc, val_auc)
            print()

            results.append({
                'C': C,
                'solver': solver,
                'penalty': penalty,
                'train_auc': train_auc,
                'val_auc': val_auc
            })

# Convert results to DataFrame for easy analysis, best validation AUC first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('val_auc', ascending=False)

print("Top 5 models by validation AUC:")
print(results_df.head())

# Get best parameters (first row after sorting by validation AUC)
# NOTE(review): in the recorded output every setting scores close to 0.5 on
# validation, so this ranking separates candidates by very small margins.
best_params = results_df.iloc[0]
print("\nBest parameters:")
print(f"C={best_params['C']}, solver={best_params['solver']}, penalty={best_params['penalty']}")
print(f"Validation AUC: {best_params['val_auc']:.3f}")

# Train final model with best parameters
best_model = LogisticRegression(
    C=best_params['C'],
    solver=best_params['solver'],
    penalty=best_params['penalty'],
    random_state=1,
    max_iter=1000
)
best_model.fit(X_train, y_train)

0.0001 liblinear l1
0.5 0.5

0.0001 liblinear l2
0.5473903694533133 0.5032200755052187

0.0001 saga l1
0.5 0.5

0.0001 saga l2
0.6322212810321235 0.49888962913613144

0.001 liblinear l1
0.5 0.5

0.001 liblinear l2
0.5704632827262658 0.5051632245169887

0.001 saga l1
0.5 0.5

0.001 saga l2
0.6332442822701505 0.49905618476571173

0.01 liblinear l1
0.5 0.5

0.01 liblinear l2
0.6314198214634782 0.49783477681545635

0.01 saga l1
0.5 0.5

0.01 saga l2
0.6415586108034145 0.49727959138352207

0.1 liblinear l1
0.5947986577181208 0.5269542527204086

0.1 liblinear l2
0.698227666644947 0.49744614701310236

0.1 saga l1
0.5916889294324623 0.5195980457472796

0.1 saga l2
0.6976412328142308 0.4970575172107484

1 liblinear l1
0.6431224343519907 0.4942260715078836

1 liblinear l2
0.9606372580960448 0.48817455029980006

1 saga l1
0.6429074086140614 0.49444814568065737

1 saga l2
0.9606307421645924 0.48811903175660665

10 liblinear l1
0.9999478725483808 0.47984676882078614

10 liblinear l2
0.9996644295302

In [48]:
# Coefficients of the tuned model: signed value plus its magnitude,
# ordered from largest to smallest magnitude
feature_importance = (
    pd.DataFrame({
        'feature': dv.feature_names_,
        'importance': best_model.coef_[0]
    })
    .assign(abs_importance=lambda frame: frame['importance'].abs())
    .sort_values('abs_importance', ascending=False)
)

print('\nTop 10 most important features with tuned model:')
print(feature_importance.head(10))


Top 10 most important features with tuned model:
                                               feature  importance  \
18                                lead_source=Referral   -0.129681   
28         organization_type=Individual Course Creator   -0.120831   
13                         expected_student_count=<100    0.118894   
22                           organization_size=201-500   -0.070814   
24                            organization_size=51-200    0.063279   
31                   primary_use_case=Academic Courses    0.060927   
8                    decision_timeframe=Within 1 month    0.055592   
5                        decision_timeframe=3-6 months   -0.006193   
12                     expected_student_count=501-1000   -0.001572   
869  required_features=Course Creation, Virtual Cla...    0.000000   

     abs_importance  
18         0.129681  
28         0.120831  
13         0.118894  
22         0.070814  
24         0.063279  
31         0.060927  
8          0.055592  
5  

In [51]:
# Feature names from the fitted DictVectorizer; the importance tables pair them
# position-by-position with the model's coefficient vector.
feature_names = dv.feature_names_

In [52]:
# Get all features related to lead_source. Match on the prefix rather than on
# any substring, so a column whose name merely contains 'lead_source='
# (e.g. 'secondary_lead_source=...') is not picked up by accident.
lead_source_prefix = 'lead_source='
lead_source_idx = [
    i for i, name in enumerate(feature_names)
    if name.startswith(lead_source_prefix)
]
lead_source_features = [feature_names[i] for i in lead_source_idx]

# Get their coefficients: the same positions index both the feature-name list
# and the coefficient vector
lead_source_coef = pd.DataFrame({
    'feature': lead_source_features,
    'coefficient': best_model.coef_[0][lead_source_idx]
})

# Sort by coefficient value to see most positive to most negative impact
lead_source_coef = lead_source_coef.sort_values('coefficient', ascending=False)

# Strip the 'lead_source=' prefix for cleaner display (slice off the prefix
# only; does not touch later occurrences of the text or depend on regex defaults)
lead_source_coef['source'] = lead_source_coef['feature'].str[len(lead_source_prefix):]

print("Lead Source Impact on Conversion (sorted by coefficient):")
print(lead_source_coef[['source', 'coefficient']])

# Lead source counts in the training split, for context on the coefficients above
print("\nLead Source Distribution in Training Data:")
print(df_train['lead_source'].value_counts())

Lead Source Impact on Conversion (sorted by coefficient):
                 source  coefficient
0        Direct Traffic     0.000000
1  Education Conference     0.000000
2         Google Search     0.000000
3   Product Review Site     0.000000
5          Social Media     0.000000
4              Referral    -0.129681

Lead Source Distribution in Training Data:
lead_source
Education Conference    326
Product Review Site     278
Referral                212
Social Media            182
Google Search           181
Direct Traffic          149
Name: count, dtype: int64
