In [1]:
from datetime import datetime, timedelta
from collections import defaultdict

import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


In [5]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("model")

<Experiment: artifact_location='/workspaces/lld-lead-scoring/analytics/mlruns/2', creation_time=1731673524680, experiment_id='2', last_update_time=1731673524680, lifecycle_stage='active', name='decision_tree_tuning', tags={}>

In [6]:
df_users = pd.read_csv('../data/random-users.csv')
df_logs = pd.read_csv('../data/random-logs.csv')

In [7]:
def preprocess_required_features(df):
    df = df.copy()
    
    def process_feature_string(feature_string):
        features = [f.strip() for f in feature_string.split(',')]
        return {f'required_feature_{feature}': 1 for feature in features}
    
    feature_dicts = df['required_features'].apply(process_feature_string)
    df = df.drop('required_features', axis=1)
    record_dicts = df.to_dict('records')
    
    for record, feature_dict in zip(record_dicts, feature_dicts):
        record.update(feature_dict)
    
    return record_dicts

In [8]:
def create_log_features(df_users, df_logs, cutoff_date):
    df_logs_filtered = df_logs[df_logs['timestamp'] < cutoff_date].copy()
    
    engagement_metrics = df_logs_filtered.groupby('user_id').agg({
        'timestamp': ['count', 'nunique'],  # Total actions and unique days
        'duration_seconds': ['sum', 'mean', 'std']  # Time spent metrics
    }).round(2)
    
    engagement_metrics.columns = [
        'total_actions',
        'active_days',
        'total_duration',
        'avg_duration',
        'std_duration'
    ]
    
    # Action category distribution
    category_counts = df_logs_filtered.groupby(['user_id', 'action_category']).size().unstack(
        fill_value=0
    ).add_prefix('category_')
    
    # Action type distribution (top 10 most common)
    top_actions = df_logs_filtered['action_type'].value_counts().nlargest(10).index
    action_counts = df_logs_filtered[df_logs_filtered['action_type'].isin(top_actions)]\
        .groupby(['user_id', 'action_type']).size().unstack(fill_value=0).add_prefix('action_')
    
    # Time-based features
    df_logs_filtered['hour'] = df_logs_filtered['timestamp'].dt.hour
    time_metrics = df_logs_filtered.groupby('user_id').agg({
        'hour': lambda x: len(x[x.between(9, 17)]) / len(x)  # Fraction of activity during business hours
    }).round(2)
    time_metrics.columns = ['business_hours_ratio']
    
    # Activity patterns
    df_logs_filtered['days_since_signup'] = (
        df_logs_filtered['timestamp'] - 
        df_logs_filtered['user_id'].map(df_users.set_index('user_id')['signup_date'])
    ).dt.days
    
    recency_metrics = df_logs_filtered.groupby('user_id').agg({
        'days_since_signup': ['min', 'max']
    }).round(2)
    recency_metrics.columns = ['days_to_first_action', 'days_to_last_action']
    
    # Advanced engagement metrics
    df_logs_filtered['prev_timestamp'] = df_logs_filtered.groupby('user_id')['timestamp'].shift(1)
    df_logs_filtered['time_between_actions'] = (
        df_logs_filtered['timestamp'] - df_logs_filtered['prev_timestamp']
    ).dt.total_seconds() / 3600  # Convert to hours
    
    engagement_patterns = df_logs_filtered.groupby('user_id').agg({
        'time_between_actions': ['mean', 'std']
    }).round(2)
    engagement_patterns.columns = ['avg_hours_between_actions', 'std_hours_between_actions']
    
    # Feature importance indicators
    feature_exploration = df_logs_filtered[
        df_logs_filtered['action_type'] == 'view_features'
    ].groupby('user_id').size().to_frame('feature_views')
    
    # Combine all features
    log_features = pd.concat([
        engagement_metrics,
        category_counts,
        action_counts,
        time_metrics,
        recency_metrics,
        engagement_patterns,
        feature_exploration
    ], axis=1).reset_index()
    
    # Fill NaN values with 0 for new users or users with missing metrics
    log_features = log_features.fillna(0)
    
    return log_features


def prepare_lead_scoring_data(df_users, df_logs, train_end_date, val_end_date):
    df_users = df_users.copy()
    df_logs = df_logs.copy()
    
    df_users['signup_date'] = pd.to_datetime(df_users['signup_date'])
    df_logs['timestamp'] = pd.to_datetime(df_logs['timestamp'])
    
    train_end_date = pd.to_datetime(train_end_date)
    val_end_date = pd.to_datetime(val_end_date)
    
    train_mask = df_users['signup_date'] < train_end_date
    val_mask = (df_users['signup_date'] >= train_end_date) & (df_users['signup_date'] < val_end_date)
    test_mask = df_users['signup_date'] >= val_end_date
    
    df_train = df_users[train_mask].copy()
    df_val = df_users[val_mask].copy()
    df_test = df_users[test_mask].copy()
    
    train_features = create_log_features(df_users, df_logs, train_end_date)
    val_features = create_log_features(df_users, df_logs, val_end_date)
    test_features = create_log_features(df_users, df_logs, df_logs['timestamp'].max())

    df_train = df_train.merge(train_features, on='user_id', how='left')
    df_val = df_val.merge(val_features, on='user_id', how='left')
    df_test = df_test.merge(test_features, on='user_id', how='left')

    return df_train, df_val, df_test

In [9]:
df_train, df_val, df_test = prepare_lead_scoring_data(
    df_users,
    df_logs,
    train_end_date='2024-03-01',
    val_end_date='2024-03-15'
)

In [10]:
def prepare_features(df):
    df = df.copy()
    
    date_columns = ['signup_date', 'conversion_date']
    exclude_columns = ['user_id', 'converted'] + date_columns

    df = df.drop(columns=exclude_columns)
    df = df.fillna(0)
    feature_dict = preprocess_required_features(df)

    return feature_dict

train_dicts = prepare_features(df_train)
val_dicts = prepare_features(df_val)
test_dicts = prepare_features(df_test)

y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
with mlflow.start_run():
    model_params = {
        'n_estimators': 75,
        'max_depth': 9,
        'min_samples_leaf': 43,
        'min_samples_split': 98,
        'class_weight': 'balanced',
        'random_state': 1
    }
    mlflow.log_params(model_params)

    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestClassifier(**model_params)
    )

    full_train = train_dicts + val_dicts
    y_full = np.concatenate([y_train, y_val])
    pipeline.fit(full_train, y_full)
        
    # Calculate and log test AUC
    y_test_pred = pipeline.predict_proba(test_dicts)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred)
    mlflow.log_metric("test_auc", test_auc)
    print(f"Test AUC: {test_auc:.4f}")

    all_data = train_dicts + val_dicts + test_dicts
    y_all = np.concatenate([y_train, y_val, y_test])

    final_model = make_pipeline(
        DictVectorizer(),
        RandomForestClassifier(**model_params)
    )

    final_model.fit(all_data, y_all)
    mlflow.sklearn.log_model(final_model, "final_model")

Test AUC: 0.6730


2024/11/17 15:16:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-dove-608 at: http://localhost:5000/#/experiments/5/runs/92db624fe3e149dba3d0b5f9166e2c23.
2024/11/17 15:16:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/5.


In [44]:
logged_model = 'runs:/92db624fe3e149dba3d0b5f9166e2c23/final_model'
loaded_model = mlflow.sklearn.load_model(logged_model)
loaded_model.predict_proba(val_dicts)[:, 1]

array([0.51194242, 0.57780521, 0.47095262, 0.45629817, 0.50553459,
       0.47032588, 0.5232474 , 0.4640846 , 0.43753661, 0.52722325,
       0.50566772, 0.49709642, 0.59600523, 0.50765233, 0.40345121,
       0.5184669 , 0.43234984, 0.54414709, 0.54570905, 0.60975858,
       0.50432322, 0.66253257, 0.47661752, 0.44360253, 0.518128  ,
       0.25659342, 0.29937138, 0.36437496, 0.45766425, 0.50134832,
       0.54787848, 0.36529574, 0.61626696, 0.53231782, 0.54853905,
       0.37509222, 0.49399716, 0.53419116, 0.5659114 , 0.52065272,
       0.30012428, 0.510526  , 0.33549252, 0.48410629, 0.54317495,
       0.48383985, 0.57280271, 0.50310186, 0.45913881, 0.51469294,
       0.45068373, 0.64320719, 0.62766363, 0.48621956, 0.5794546 ,
       0.31482529, 0.51602372, 0.45812934, 0.38019153, 0.48037609,
       0.4668738 , 0.50397325, 0.59900223, 0.35564385, 0.58768787,
       0.49251879, 0.3828128 , 0.55015086, 0.45099531, 0.55814729,
       0.34858624, 0.48465322, 0.50716374, 0.38920424, 0.40912