In [1]:
from datetime import datetime, timedelta
from collections import defaultdict

import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("logistic_regression_tuning")

<Experiment: artifact_location='/workspaces/lld-lead-scoring/analytics/mlruns/1', creation_time=1731591370971, experiment_id='1', last_update_time=1731591370971, lifecycle_stage='active', name='logistic_regression_tuning', tags={}>

In [4]:
df_users = pd.read_csv('../data/random-users.csv')
df_logs = pd.read_csv('../data/random-logs.csv')

In [5]:
def preprocess_required_features(df):
    df = df.copy()
    
    def process_feature_string(feature_string):
        features = [f.strip() for f in feature_string.split(',')]
        return {f'required_feature_{feature}': 1 for feature in features}
    
    feature_dicts = df['required_features'].apply(process_feature_string)
    df = df.drop('required_features', axis=1)
    record_dicts = df.to_dict('records')
    
    for record, feature_dict in zip(record_dicts, feature_dicts):
        record.update(feature_dict)
    
    return record_dicts

In [9]:
def create_log_features(df_users, df_logs, cutoff_date):
    df_logs_filtered = df_logs[df_logs['timestamp'] < cutoff_date].copy()
    
    engagement_metrics = df_logs_filtered.groupby('user_id').agg({
        'timestamp': ['count', 'nunique'],  # Total actions and unique days
        'duration_seconds': ['sum', 'mean', 'std']  # Time spent metrics
    }).round(2)
    
    engagement_metrics.columns = [
        'total_actions',
        'active_days',
        'total_duration',
        'avg_duration',
        'std_duration'
    ]
    
    # Action category distribution
    category_counts = df_logs_filtered.groupby(['user_id', 'action_category']).size().unstack(
        fill_value=0
    ).add_prefix('category_')
    
    # Action type distribution (top 10 most common)
    top_actions = df_logs_filtered['action_type'].value_counts().nlargest(10).index
    action_counts = df_logs_filtered[df_logs_filtered['action_type'].isin(top_actions)]\
        .groupby(['user_id', 'action_type']).size().unstack(fill_value=0).add_prefix('action_')
    
    # Time-based features
    df_logs_filtered['hour'] = df_logs_filtered['timestamp'].dt.hour
    time_metrics = df_logs_filtered.groupby('user_id').agg({
        'hour': lambda x: len(x[x.between(9, 17)]) / len(x)  # Fraction of activity during business hours
    }).round(2)
    time_metrics.columns = ['business_hours_ratio']
    
    # Activity patterns
    df_logs_filtered['days_since_signup'] = (
        df_logs_filtered['timestamp'] - 
        df_logs_filtered['user_id'].map(df_users.set_index('user_id')['signup_date'])
    ).dt.days
    
    recency_metrics = df_logs_filtered.groupby('user_id').agg({
        'days_since_signup': ['min', 'max']
    }).round(2)
    recency_metrics.columns = ['days_to_first_action', 'days_to_last_action']
    
    # Advanced engagement metrics
    df_logs_filtered['prev_timestamp'] = df_logs_filtered.groupby('user_id')['timestamp'].shift(1)
    df_logs_filtered['time_between_actions'] = (
        df_logs_filtered['timestamp'] - df_logs_filtered['prev_timestamp']
    ).dt.total_seconds() / 3600  # Convert to hours
    
    engagement_patterns = df_logs_filtered.groupby('user_id').agg({
        'time_between_actions': ['mean', 'std']
    }).round(2)
    engagement_patterns.columns = ['avg_hours_between_actions', 'std_hours_between_actions']
    
    # Feature importance indicators
    feature_exploration = df_logs_filtered[
        df_logs_filtered['action_type'] == 'view_features'
    ].groupby('user_id').size().to_frame('feature_views')
    
    # Combine all features
    log_features = pd.concat([
        engagement_metrics,
        category_counts,
        action_counts,
        time_metrics,
        recency_metrics,
        engagement_patterns,
        feature_exploration
    ], axis=1).reset_index()
    
    # Fill NaN values with 0 for new users or users with missing metrics
    log_features = log_features.fillna(0)
    
    return log_features


def prepare_lead_scoring_data(df_users, df_logs, train_end_date, val_end_date):
    df_users = df_users.copy()
    df_logs = df_logs.copy()
    
    df_users['signup_date'] = pd.to_datetime(df_users['signup_date'])
    df_logs['timestamp'] = pd.to_datetime(df_logs['timestamp'])
    
    train_end_date = pd.to_datetime(train_end_date)
    val_end_date = pd.to_datetime(val_end_date)
    
    train_mask = df_users['signup_date'] < train_end_date
    val_mask = (df_users['signup_date'] >= train_end_date) & (df_users['signup_date'] < val_end_date)
    test_mask = df_users['signup_date'] >= val_end_date
    
    df_train = df_users[train_mask].copy()
    df_val = df_users[val_mask].copy()
    df_test = df_users[test_mask].copy()
    
    train_features = create_log_features(df_users, df_logs, train_end_date)
    val_features = create_log_features(df_users, df_logs, val_end_date)
    test_features = create_log_features(df_users, df_logs, df_logs['timestamp'].max())

    df_train = df_train.merge(train_features, on='user_id', how='left')
    df_val = df_val.merge(val_features, on='user_id', how='left')
    df_test = df_test.merge(test_features, on='user_id', how='left')

    return df_train, df_val, df_test

In [10]:
df_train, df_val, df_test = prepare_lead_scoring_data(
    df_users,
    df_logs,
    train_end_date='2024-03-01',
    val_end_date='2024-03-15'
)

In [12]:
def prepare_features(df):
    df = df.copy()
    
    date_columns = ['signup_date', 'conversion_date']
    exclude_columns = ['user_id', 'converted'] + date_columns

    df = df.drop(columns=exclude_columns)
    df = df.fillna(0)
    feature_dict = preprocess_required_features(df)

    return feature_dict

train_dicts = prepare_features(df_train)
val_dicts = prepare_features(df_val)
test_dicts = prepare_features(df_test)

y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [13]:
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

In [14]:
results = []

for C in param_grid['C']:
    for solver in param_grid['solver']:
        for penalty in param_grid['penalty']:
            # Skip invalid combinations
            if solver == 'liblinear' and penalty not in ['l1', 'l2']:
                continue

            # Start an MLflow run
            with mlflow.start_run():
                # Log parameters
                mlflow.log_param("C", C)
                mlflow.log_param("solver", solver)
                mlflow.log_param("penalty", penalty)

                pipeline = make_pipeline(
                    DictVectorizer(),
                    LogisticRegression(
                        C=C, 
                        solver=solver,
                        penalty=penalty,
                        random_state=1,
                        max_iter=1000
                    )
                )
    
                pipeline.fit(train_dicts, y_train)

                train_pred = pipeline.predict_proba(train_dicts)[:, 1]
                val_pred = pipeline.predict_proba(val_dicts)[:, 1]

                train_auc = roc_auc_score(y_train, train_pred)
                val_auc = roc_auc_score(y_val, val_pred)
                
                mlflow.log_metric("train_auc", train_auc)
                mlflow.log_metric("val_auc", val_auc)

                mlflow.sklearn.log_model(pipeline, "model")

                print(f"Run with C={C}, solver={solver}, penalty={penalty}:")
                print(f"Train AUC: {train_auc:.3f}, Val AUC: {val_auc:.3f}\n")

                results.append({
                    'C': C,
                    'solver': solver,
                    'penalty': penalty,
                    'train_auc': train_auc,
                    'val_auc': val_auc
                })

2024/11/15 10:01:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-gnat-213 at: http://localhost:5000/#/experiments/1/runs/69f0a835de96443a8ef740f2a74f1e27.
2024/11/15 10:01:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=liblinear, penalty=l1:
Train AUC: 0.389, Val AUC: 0.408



2024/11/15 10:01:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-conch-5 at: http://localhost:5000/#/experiments/1/runs/0482ea257ab64a878eb6e83d5c707454.
2024/11/15 10:01:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=liblinear, penalty=l2:
Train AUC: 0.643, Val AUC: 0.566



2024/11/15 10:01:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-kit-657 at: http://localhost:5000/#/experiments/1/runs/9c53906e760145a58b734c0678137e42.
2024/11/15 10:01:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=saga, penalty=l1:
Train AUC: 0.389, Val AUC: 0.409



2024/11/15 10:01:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-pug-388 at: http://localhost:5000/#/experiments/1/runs/03e0784c1b014dfe8ef2bdbbb7b23090.
2024/11/15 10:01:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.0001, solver=saga, penalty=l2:
Train AUC: 0.547, Val AUC: 0.441



2024/11/15 10:01:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run able-gnu-877 at: http://localhost:5000/#/experiments/1/runs/639a5b7591524bab94c88bae2be2f3c6.
2024/11/15 10:01:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=liblinear, penalty=l1:
Train AUC: 0.494, Val AUC: 0.429



2024/11/15 10:01:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run ambitious-sloth-849 at: http://localhost:5000/#/experiments/1/runs/b0cb96df4eb3483e8fd129a598016929.
2024/11/15 10:01:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=liblinear, penalty=l2:
Train AUC: 0.656, Val AUC: 0.592



2024/11/15 10:01:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-wren-906 at: http://localhost:5000/#/experiments/1/runs/a9137e71ef8942929fd7164758ea174a.
2024/11/15 10:01:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=saga, penalty=l1:
Train AUC: 0.489, Val AUC: 0.426



2024/11/15 10:01:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run selective-snipe-162 at: http://localhost:5000/#/experiments/1/runs/cec281121b474674857c52269e701204.
2024/11/15 10:01:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.001, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run adaptable-fowl-508 at: http://localhost:5000/#/experiments/1/runs/b3fc00c84d29458a9fa52d2a3fc0f101.
2024/11/15 10:01:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=liblinear, penalty=l1:
Train AUC: 0.645, Val AUC: 0.583



2024/11/15 10:01:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-shoat-336 at: http://localhost:5000/#/experiments/1/runs/522ccae62d6149dfb07ede87e0bbe24b.
2024/11/15 10:01:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=liblinear, penalty=l2:
Train AUC: 0.685, Val AUC: 0.586



2024/11/15 10:01:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-snail-388 at: http://localhost:5000/#/experiments/1/runs/165e8c8695cc47c7aab67df8f022f9d1.
2024/11/15 10:01:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=saga, penalty=l1:
Train AUC: 0.543, Val AUC: 0.440



2024/11/15 10:01:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-asp-184 at: http://localhost:5000/#/experiments/1/runs/2edaf239abd14ee4b73f1a31ba62395a.
2024/11/15 10:01:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.01, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-donkey-928 at: http://localhost:5000/#/experiments/1/runs/5d59c84332644f7cb6316a9494430064.
2024/11/15 10:01:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=liblinear, penalty=l1:
Train AUC: 0.674, Val AUC: 0.603



2024/11/15 10:01:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run incongruous-mole-345 at: http://localhost:5000/#/experiments/1/runs/31d1dec3dfed4c9bb73308560d4dfde1.
2024/11/15 10:01:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=liblinear, penalty=l2:
Train AUC: 0.712, Val AUC: 0.560



2024/11/15 10:01:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run selective-perch-798 at: http://localhost:5000/#/experiments/1/runs/3dcfddba1b2c469aa2d21da83464bd78.
2024/11/15 10:01:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=saga, penalty=l1:
Train AUC: 0.547, Val AUC: 0.440



2024/11/15 10:01:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run bemused-bug-509 at: http://localhost:5000/#/experiments/1/runs/e183c7147e154dd1a759ab8864b982d0.
2024/11/15 10:01:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=0.1, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-bug-962 at: http://localhost:5000/#/experiments/1/runs/8b52f9287c5542aabd18ee0c4c6be34f.
2024/11/15 10:01:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=liblinear, penalty=l1:
Train AUC: 0.720, Val AUC: 0.570



2024/11/15 10:01:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run sneaky-snake-448 at: http://localhost:5000/#/experiments/1/runs/8dd3a96fcabe4b38b3b09ffc3fb679bc.
2024/11/15 10:01:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=liblinear, penalty=l2:
Train AUC: 0.719, Val AUC: 0.561



2024/11/15 10:01:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run unruly-bee-251 at: http://localhost:5000/#/experiments/1/runs/040bb922defa4a31a6ed5b493e29f8bd.
2024/11/15 10:01:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=saga, penalty=l1:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run upbeat-ram-541 at: http://localhost:5000/#/experiments/1/runs/0926c0d33629414ab74d3a6a8ab3f4ca.
2024/11/15 10:01:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=1, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run redolent-midge-548 at: http://localhost:5000/#/experiments/1/runs/c8a1c4a5054d4ba8bf543dca6e48fb12.
2024/11/15 10:01:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=liblinear, penalty=l1:
Train AUC: 0.722, Val AUC: 0.574



2024/11/15 10:01:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run masked-worm-572 at: http://localhost:5000/#/experiments/1/runs/0e7c362154654de2b25faa7d39fbb74f.
2024/11/15 10:01:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=liblinear, penalty=l2:
Train AUC: 0.720, Val AUC: 0.560



2024/11/15 10:01:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run secretive-worm-634 at: http://localhost:5000/#/experiments/1/runs/1bbb00ebc12749ddb1e22383ad3add6a.
2024/11/15 10:01:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=saga, penalty=l1:
Train AUC: 0.548, Val AUC: 0.441



2024/11/15 10:01:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-crab-90 at: http://localhost:5000/#/experiments/1/runs/5cd07dd777d940069d4fa152ef3fd068.
2024/11/15 10:01:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Run with C=10, solver=saga, penalty=l2:
Train AUC: 0.548, Val AUC: 0.441



In [20]:
# Convert results to DataFrame for easy analysis
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('val_auc', ascending=False)

print("Top 5 models by validation AUC:")
print(results_df.head())

Top 5 models by validation AUC:
         C     solver penalty  train_auc   val_auc
12   0.100  liblinear      l1   0.674355  0.603015
5    0.001  liblinear      l2   0.656337  0.592494
9    0.010  liblinear      l2   0.685496  0.586387
8    0.010  liblinear      l1   0.645429  0.582667
20  10.000  liblinear      l1   0.721949  0.574228


In [21]:
best_params = results_df.iloc[0]
print("\nBest parameters:")
print(f"C={best_params['C']}, solver={best_params['solver']}, penalty={best_params['penalty']}")
print(f"Validation AUC: {best_params['val_auc']:.3f}")

with mlflow.start_run(run_name="best_model"):
    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(
            C=best_params['C'],
            solver=best_params['solver'],
            penalty=best_params['penalty'],
            random_state=1,
            max_iter=1000
        )
    )

    pipeline.fit(train_dicts, y_train)
    
    mlflow.log_params({
        "C": best_params['C'],
        "solver": best_params['solver'],
        "penalty": best_params['penalty']
    })

    final_train_pred = pipeline.predict_proba(train_dicts)[:, 1]
    final_val_pred = pipeline.predict_proba(val_dicts)[:, 1]
    final_train_auc = roc_auc_score(y_train, final_train_pred)
    final_val_auc = roc_auc_score(y_val, final_val_pred)
    
    mlflow.log_metrics({
        "train_auc": final_train_auc,
        "val_auc": final_val_auc
    })

    mlflow.sklearn.log_model(pipeline, "final_model")


Best parameters:
C=0.1, solver=liblinear, penalty=l1
Validation AUC: 0.603


2024/11/15 10:08:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_model at: http://localhost:5000/#/experiments/1/runs/1190ba2e6a0e4214899d826b84a8ba98.
2024/11/15 10:08:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


In [22]:
# Look at feature importance for the tuned model
feature_importance = pd.DataFrame({
    'feature': pipeline[0].feature_names_,
    'importance': pipeline[1].coef_[0],
    'abs_importance': np.abs(pipeline[1].coef_[0])
})
feature_importance = feature_importance.sort_values('abs_importance', ascending=False)

print('\nTop 10 most important features with tuned model:')
print(feature_importance.head(10))


Top 10 most important features with tuned model:
                               feature  importance  abs_importance
38         expected_student_count=<100    0.129421        0.129421
44                lead_source=Referral   -0.127615        0.127615
41    lead_source=Education Conference   -0.121131        0.121131
17            category_course_creation    0.089542        0.089542
60  primary_use_case=Employee Training   -0.084075        0.084075
50            organization_size=51-200    0.072476        0.072476
20                    category_support    0.061360        0.061360
2                 action_create_course    0.059987        0.059987
4                  action_invite_users    0.058763        0.058763
12           avg_hours_between_actions   -0.056850        0.056850


In [24]:
feature_names = pipeline[0].feature_names_

In [26]:
# Get all features related to lead_source
lead_source_features = [f for f in feature_names if 'lead_source=' in f]

# Get their coefficients
lead_source_coef = pd.DataFrame({
    'feature': lead_source_features,
    'coefficient': pipeline[1].coef_[0][np.where([f in lead_source_features for f in feature_names])[0]]
})

# Sort by coefficient value to see most positive to most negative impact
lead_source_coef = lead_source_coef.sort_values('coefficient', ascending=False)

# Strip the 'lead_source=' prefix for cleaner display
lead_source_coef['source'] = lead_source_coef['feature'].str.replace('lead_source=', '')

print("Lead Source Impact on Conversion (sorted by coefficient):")
print(lead_source_coef[['source', 'coefficient']])

# You might also want to see the distribution of lead sources in your data
print("\nLead Source Distribution in Training Data:")
print(df_train['lead_source'].value_counts())

Lead Source Impact on Conversion (sorted by coefficient):
                 source  coefficient
0        Direct Traffic     0.000000
2         Google Search     0.000000
5          Social Media     0.000000
3   Product Review Site     0.000000
1  Education Conference    -0.121131
4              Referral    -0.127615

Lead Source Distribution in Training Data:
lead_source
Education Conference    326
Product Review Site     278
Referral                212
Social Media            182
Google Search           181
Direct Traffic          149
Name: count, dtype: int64
