In [1]:
from datetime import datetime, timedelta
from collections import defaultdict

import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df_users = pd.read_csv('../data/random-users.csv')

In [4]:
df_logs = pd.read_csv('../data/random-logs.csv')

In [5]:
def preprocess_required_features(df):
    df = df.copy()
    
    def process_feature_string(feature_string):

        features = [f.strip() for f in feature_string.split(',')]
        return {f'required_feature_{feature}': 1 for feature in features}
    
    feature_dicts = df['required_features'].apply(process_feature_string)
    df = df.drop('required_features', axis=1)
    record_dicts = df.to_dict('records')
    
    for record, feature_dict in zip(record_dicts, feature_dicts):
        record.update(feature_dict)
    
    return record_dicts

In [6]:
def create_log_features(df_users, df_logs, cutoff_date):
    df_logs_filtered = df_logs[df_logs['timestamp'] < cutoff_date].copy()
    
    engagement_metrics = df_logs_filtered.groupby('user_id').agg({
        'timestamp': ['count', 'nunique'],  # Total actions and unique days
        'duration_seconds': ['sum', 'mean', 'std']  # Time spent metrics
    }).round(2)
    
    engagement_metrics.columns = [
        'total_actions',
        'active_days',
        'total_duration',
        'avg_duration',
        'std_duration'
    ]
    
    # Action category distribution
    category_counts = df_logs_filtered.groupby(['user_id', 'action_category']).size().unstack(
        fill_value=0
    ).add_prefix('category_')
    
    # Action type distribution (top 10 most common)
    top_actions = df_logs_filtered['action_type'].value_counts().nlargest(10).index
    action_counts = df_logs_filtered[df_logs_filtered['action_type'].isin(top_actions)]\
        .groupby(['user_id', 'action_type']).size().unstack(fill_value=0).add_prefix('action_')
    
    # Time-based features
    df_logs_filtered['hour'] = df_logs_filtered['timestamp'].dt.hour
    time_metrics = df_logs_filtered.groupby('user_id').agg({
        'hour': lambda x: len(x[x.between(9, 17)]) / len(x)  # Fraction of activity during business hours
    }).round(2)
    time_metrics.columns = ['business_hours_ratio']
    
    # Activity patterns
    df_logs_filtered['days_since_signup'] = (
        df_logs_filtered['timestamp'] - 
        df_logs_filtered['user_id'].map(df_users.set_index('user_id')['signup_date'])
    ).dt.days
    
    recency_metrics = df_logs_filtered.groupby('user_id').agg({
        'days_since_signup': ['min', 'max']
    }).round(2)
    recency_metrics.columns = ['days_to_first_action', 'days_to_last_action']
    
    # Advanced engagement metrics
    df_logs_filtered['prev_timestamp'] = df_logs_filtered.groupby('user_id')['timestamp'].shift(1)
    df_logs_filtered['time_between_actions'] = (
        df_logs_filtered['timestamp'] - df_logs_filtered['prev_timestamp']
    ).dt.total_seconds() / 3600  # Convert to hours
    
    engagement_patterns = df_logs_filtered.groupby('user_id').agg({
        'time_between_actions': ['mean', 'std']
    }).round(2)
    engagement_patterns.columns = ['avg_hours_between_actions', 'std_hours_between_actions']
    
    # Feature importance indicators
    feature_exploration = df_logs_filtered[
        df_logs_filtered['action_type'] == 'view_features'
    ].groupby('user_id').size().to_frame('feature_views')
    
    # Combine all features
    log_features = pd.concat([
        engagement_metrics,
        category_counts,
        action_counts,
        time_metrics,
        recency_metrics,
        engagement_patterns,
        feature_exploration
    ], axis=1).reset_index()
    
    # Fill NaN values with 0 for new users or users with missing metrics
    log_features = log_features.fillna(0)
    
    return log_features


def prepare_lead_scoring_data(df_users, df_logs, train_end_date, val_end_date):
    df_users = df_users.copy()
    df_logs = df_logs.copy()
    
    df_users['signup_date'] = pd.to_datetime(df_users['signup_date'])
    df_logs['timestamp'] = pd.to_datetime(df_logs['timestamp'])
    
    train_end_date = pd.to_datetime(train_end_date)
    val_end_date = pd.to_datetime(val_end_date)
    
    train_mask = df_users['signup_date'] < train_end_date
    val_mask = (df_users['signup_date'] >= train_end_date) & (df_users['signup_date'] < val_end_date)
    test_mask = df_users['signup_date'] >= val_end_date
    
    df_train = df_users[train_mask].copy()
    df_val = df_users[val_mask].copy()
    df_test = df_users[test_mask].copy()
    
    train_features = create_log_features(df_users, df_logs, train_end_date)
    val_features = create_log_features(df_users, df_logs, val_end_date)
    test_features = create_log_features(df_users, df_logs, df_logs['timestamp'].max())

    df_train = df_train.merge(train_features, on='user_id', how='left')
    df_val = df_val.merge(val_features, on='user_id', how='left')
    df_test = df_test.merge(test_features, on='user_id', how='left')

    return df_train, df_val, df_test

In [7]:
df_train, df_val, df_test = prepare_lead_scoring_data(
    df_users,
    df_logs,
    train_end_date='2024-03-01',
    val_end_date='2024-03-15'
)

In [8]:
from sklearn.feature_extraction import DictVectorizer

def prepare_features(df, dv=None, fit=True):

    df = df.copy()
    
    date_columns = ['signup_date', 'conversion_date']
    exclude_columns = ['user_id', 'converted'] + date_columns
    
    df = df.drop(columns=exclude_columns)
    df = df.fillna(0)
    feature_dict = preprocess_required_features(df)
    
    if dv is None:
        dv = DictVectorizer(sparse=True)
    
    if fit:
        X = dv.fit_transform(feature_dict)
    else:
        X = dv.transform(feature_dict)
    
    return X, dv

X_train, dv = prepare_features(df_train, fit=True)
X_val, _ = prepare_features(df_val, dv=dv, fit=False)
X_test, _ = prepare_features(df_test, dv=dv, fit=False)

y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [13]:
import mlflow

# Set up MLflow - this will create an 'mlruns' directory in your current directory
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("logistic_regression_tuning")

2024/11/14 13:36:10 INFO mlflow.tracking.fluent: Experiment with name 'logistic_regression_tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/lld-lead-scoring/analytics/mlruns/1', creation_time=1731591370971, experiment_id='1', last_update_time=1731591370971, lifecycle_stage='active', name='logistic_regression_tuning', tags={}>

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

In [23]:
from sklearn.pipeline import Pipeline

In [25]:
results = []

for C in param_grid['C']:
    for solver in param_grid['solver']:
        for penalty in param_grid['penalty']:
            # Skip invalid combinations
            if solver == 'liblinear' and penalty not in ['l1', 'l2']:
                continue
                
            # Start an MLflow run
            with mlflow.start_run():
                # Log parameters
                mlflow.log_param("C", C)
                mlflow.log_param("solver", solver)
                mlflow.log_param("penalty", penalty)
                
                # Train model
                model = LogisticRegression(
                    C=C, 
                    solver=solver,
                    penalty=penalty,
                    random_state=1,
                    max_iter=1000
                )
                model.fit(X_train, y_train)
                
                # Get predictions
                train_pred = model.predict_proba(X_train)[:, 1]
                val_pred = model.predict_proba(X_val)[:, 1]
                
                # Calculate AUC
                train_auc = roc_auc_score(y_train, train_pred)
                val_auc = roc_auc_score(y_val, val_pred)
                
                # Log metrics
                mlflow.log_metric("train_auc", train_auc)
                mlflow.log_metric("val_auc", val_auc)
                
                # Log model
                # mlflow.sklearn.log_model(model, "model")
                pipeline = Pipeline([
                    ('vectorizer', dv),
                    ('model', best_model)
                ])

                mlflow.sklearn.log_model(pipeline, "model")
                
                print(f"Run with C={C}, solver={solver}, penalty={penalty}:")
                print(f"Train AUC: {train_auc:.3f}, Val AUC: {val_auc:.3f}\n")

                results.append({
                    'C': C,
                    'solver': solver,
                    'penalty': penalty,
                    'train_auc': train_auc,
                    'val_auc': val_auc
                })

2024/11/14 13:47:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-wolf-385 at: http://localhost:5000/#/experiments/1/runs/f46523ad63cb4de9a6a996813013d061.
2024/11/14 13:47:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=liblinear, penalty=l1:
Train AUC: 0.389, Val AUC: 0.408



2024/11/14 13:47:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run beautiful-deer-122 at: http://localhost:5000/#/experiments/1/runs/81984db390294b618cef2b974ae7dec9.
2024/11/14 13:47:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=liblinear, penalty=l2:
Train AUC: 0.643, Val AUC: 0.566



2024/11/14 13:47:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run marvelous-hound-690 at: http://localhost:5000/#/experiments/1/runs/9f59ee38ced4422c91ad0d802e3c8ba8.
2024/11/14 13:47:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=saga, penalty=l1:
Train AUC: 0.389, Val AUC: 0.409



2024/11/14 13:47:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-stag-978 at: http://localhost:5000/#/experiments/1/runs/0f12198f4b2e4cd4b216774eaa296473.
2024/11/14 13:47:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=saga, penalty=l2:
Train AUC: 0.547, Val AUC: 0.441



2024/11/14 13:47:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run persistent-calf-322 at: http://localhost:5000/#/experiments/1/runs/ce2166f9af9e44c0a31945b08f77327c.
2024/11/14 13:47:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=liblinear, penalty=l1:
Train AUC: 0.494, Val AUC: 0.429



2024/11/14 13:47:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run carefree-sponge-649 at: http://localhost:5000/#/experiments/1/runs/b2fc52ba2d3d4d4c8bc0057f5e52ddd0.
2024/11/14 13:47:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=liblinear, penalty=l2:
Train AUC: 0.656, Val AUC: 0.592



2024/11/14 13:47:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run industrious-mole-538 at: http://localhost:5000/#/experiments/1/runs/ad88bf34d57f472f8dff1073da4ae8e4.
2024/11/14 13:47:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=saga, penalty=l1:
Train AUC: 0.489, Val AUC: 0.426



2024/11/14 13:47:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-conch-147 at: http://localhost:5000/#/experiments/1/runs/698723fc53954c879d5face8254b7e7d.
2024/11/14 13:47:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:47:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-slug-610 at: http://localhost:5000/#/experiments/1/runs/a455958d26a84e08b9a779bc3fb8582f.
2024/11/14 13:47:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=liblinear, penalty=l1:
Train AUC: 0.645, Val AUC: 0.583



2024/11/14 13:47:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-hare-976 at: http://localhost:5000/#/experiments/1/runs/41b6af2ed4884e71b65651b4939efe7e.
2024/11/14 13:47:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=liblinear, penalty=l2:
Train AUC: 0.685, Val AUC: 0.586



2024/11/14 13:47:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run skittish-shad-286 at: http://localhost:5000/#/experiments/1/runs/a535da4aa6f448938ddfada8341250dc.
2024/11/14 13:47:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=saga, penalty=l1:
Train AUC: 0.543, Val AUC: 0.440



2024/11/14 13:47:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-moth-702 at: http://localhost:5000/#/experiments/1/runs/592d4ee625674fd385097608f2ed78e9.
2024/11/14 13:47:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:47:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run receptive-trout-894 at: http://localhost:5000/#/experiments/1/runs/e7d523d1725f4895a907dc7bcd6ed4ae.
2024/11/14 13:47:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=liblinear, penalty=l1:
Train AUC: 0.674, Val AUC: 0.603



2024/11/14 13:47:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run chill-grub-894 at: http://localhost:5000/#/experiments/1/runs/8fd02fb421c741659b0c1ad32fe92497.
2024/11/14 13:47:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=liblinear, penalty=l2:
Train AUC: 0.712, Val AUC: 0.560



2024/11/14 13:47:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run righteous-cow-427 at: http://localhost:5000/#/experiments/1/runs/6fe32eab45e745c293200f5c7e4d3ae8.
2024/11/14 13:47:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=saga, penalty=l1:
Train AUC: 0.547, Val AUC: 0.440



2024/11/14 13:47:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run casual-pig-683 at: http://localhost:5000/#/experiments/1/runs/9b79d91897b1447b872440d490c510a7.
2024/11/14 13:47:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:47:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-crab-922 at: http://localhost:5000/#/experiments/1/runs/ff7adae067424641afdba788ae24a504.
2024/11/14 13:47:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=liblinear, penalty=l1:
Train AUC: 0.720, Val AUC: 0.570



2024/11/14 13:47:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-eel-268 at: http://localhost:5000/#/experiments/1/runs/01c6d3a1c906455f8b48d98e4f931471.
2024/11/14 13:47:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=liblinear, penalty=l2:
Train AUC: 0.719, Val AUC: 0.561



2024/11/14 13:48:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run dazzling-perch-42 at: http://localhost:5000/#/experiments/1/runs/e9d5235d685c49e8993830b8f7e211ca.
2024/11/14 13:48:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=saga, penalty=l1:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:48:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-eel-599 at: http://localhost:5000/#/experiments/1/runs/6df37abcfd5f4b2daef785560e435f7c.
2024/11/14 13:48:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:48:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run unique-sloth-163 at: http://localhost:5000/#/experiments/1/runs/ef6b6a8eaf79449588a21890fd30bda4.
2024/11/14 13:48:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=liblinear, penalty=l1:
Train AUC: 0.722, Val AUC: 0.574



2024/11/14 13:48:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-sloth-461 at: http://localhost:5000/#/experiments/1/runs/21df5844f7c845529d2b8daeee28005a.
2024/11/14 13:48:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=liblinear, penalty=l2:
Train AUC: 0.720, Val AUC: 0.560



2024/11/14 13:48:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-pug-735 at: http://localhost:5000/#/experiments/1/runs/8261a2640f43424a8f47926f4b928689.
2024/11/14 13:48:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=saga, penalty=l1:
Train AUC: 0.548, Val AUC: 0.441



2024/11/14 13:48:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run serious-doe-710 at: http://localhost:5000/#/experiments/1/runs/d8d8249b28df4950aa8839329c6df2f3.
2024/11/14 13:48:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



In [18]:
# Convert results to DataFrame for easy analysis
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('val_auc', ascending=False)

print("Top 5 models by validation AUC:")
print(results_df.head())

Top 5 models by validation AUC:
         C     solver penalty  train_auc   val_auc
12   0.100  liblinear      l1   0.674355  0.603015
5    0.001  liblinear      l2   0.656337  0.592494
9    0.010  liblinear      l2   0.685496  0.586387
8    0.010  liblinear      l1   0.645429  0.582667
20  10.000  liblinear      l1   0.721949  0.574228


In [19]:
# Get best parameters
best_params = results_df.iloc[0]
print("\nBest parameters:")
print(f"C={best_params['C']}, solver={best_params['solver']}, penalty={best_params['penalty']}")
print(f"Validation AUC: {best_params['val_auc']:.3f}")

# Train final model with best parameters
with mlflow.start_run(run_name="best_model"):
    best_model = LogisticRegression(
        C=best_params['C'],
        solver=best_params['solver'],
        penalty=best_params['penalty'],
        random_state=1,
        max_iter=1000
    )
    best_model.fit(X_train, y_train)
    
    # Log best parameters
    mlflow.log_params({
        "C": best_params['C'],
        "solver": best_params['solver'],
        "penalty": best_params['penalty']
    })
    
    # Log final metrics
    final_train_pred = best_model.predict_proba(X_train)[:, 1]
    final_val_pred = best_model.predict_proba(X_val)[:, 1]
    final_train_auc = roc_auc_score(y_train, final_train_pred)
    final_val_auc = roc_auc_score(y_val, final_val_pred)
    
    mlflow.log_metrics({
        "final_train_auc": final_train_auc,
        "final_val_auc": final_val_auc
    })
    
    # Log the final model
    mlflow.sklearn.log_model(best_model, "final_model")


Best parameters:
C=0.1, solver=liblinear, penalty=l1
Validation AUC: 0.603


2024/11/14 13:38:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_model at: http://localhost:5000/#/experiments/1/runs/0fa661d274634f378a8dc4d431f366a5.
2024/11/14 13:38:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


In [17]:
# Look at feature importance for the tuned model
feature_importance = pd.DataFrame({
    'feature': dv.feature_names_,
    'importance': best_model.coef_[0],
    'abs_importance': np.abs(best_model.coef_[0])
})
feature_importance = feature_importance.sort_values('abs_importance', ascending=False)

print('\nTop 10 most important features with tuned model:')
print(feature_importance.head(10))


Top 10 most important features with tuned model:
                               feature  importance  abs_importance
38         expected_student_count=<100    0.129421        0.129421
44                lead_source=Referral   -0.127615        0.127615
41    lead_source=Education Conference   -0.121131        0.121131
17            category_course_creation    0.089542        0.089542
60  primary_use_case=Employee Training   -0.084075        0.084075
50            organization_size=51-200    0.072476        0.072476
20                    category_support    0.061360        0.061360
2                 action_create_course    0.059987        0.059987
4                  action_invite_users    0.058763        0.058763
12           avg_hours_between_actions   -0.056850        0.056850


In [18]:
feature_names = dv.feature_names_

In [19]:
# Get all features related to lead_source
lead_source_features = [f for f in feature_names if 'lead_source=' in f]

# Get their coefficients
lead_source_coef = pd.DataFrame({
    'feature': lead_source_features,
    'coefficient': best_model.coef_[0][np.where([f in lead_source_features for f in feature_names])[0]]
})

# Sort by coefficient value to see most positive to most negative impact
lead_source_coef = lead_source_coef.sort_values('coefficient', ascending=False)

# Strip the 'lead_source=' prefix for cleaner display
lead_source_coef['source'] = lead_source_coef['feature'].str.replace('lead_source=', '')

print("Lead Source Impact on Conversion (sorted by coefficient):")
print(lead_source_coef[['source', 'coefficient']])

# You might also want to see the distribution of lead sources in your data
print("\nLead Source Distribution in Training Data:")
print(df_train['lead_source'].value_counts())

Lead Source Impact on Conversion (sorted by coefficient):
                 source  coefficient
0        Direct Traffic     0.000000
2         Google Search     0.000000
5          Social Media     0.000000
3   Product Review Site     0.000000
1  Education Conference    -0.121131
4              Referral    -0.127615

Lead Source Distribution in Training Data:
lead_source
Education Conference    326
Product Review Site     278
Referral                212
Social Media            182
Google Search           181
Direct Traffic          149
Name: count, dtype: int64
