In [0]:
%pip install -r ../requirements.txt
# Restart the Python process so the packages installed by the line above are the ones imported by later cells.
dbutils.library.restartPython()

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, ParameterGrid
import shap
import optuna
from pyspark.sql import functions as F
from pyspark.sql import Window
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score,classification_report


## Read the feature table

In [0]:
# Load the feature table as a Spark DataFrame.
FEATURE_TABLE = "projectviews.default.gold_projectviews_fe"
df = spark.read.table(FEATURE_TABLE)

## Preprocessing

In [0]:
def preprocess_categorical_and_datetime(df, cat_columns=None, datetime_columns=None):
    """
    Build a model-ready pandas DataFrame from the raw feature table.

    Steps, in order:
    - Materialise the input as a pandas DataFrame (the caller's object is never mutated).
    - Drop the columns that describe the future (``views_plus_1`` .. ``views_plus_7``
      and ``min_views_future``) so they cannot leak into the features.
    - Replace every datetime column by numeric calendar features.
    - Label encode the categorical columns.

    Parameters:
        df (pyspark.sql.DataFrame or pd.DataFrame): Original dataframe. Any object
            exposing ``toPandas()`` is collected through it; a pandas DataFrame is copied.
        cat_columns (list): List of categorical columns to encode (optional).
            When None, every object-dtype column that is not listed in
            ``datetime_columns`` is encoded.
        datetime_columns (list): List of datetime columns to process (optional).
            Each column ``c`` is replaced by ``c_year``, ``c_month``, ``c_day``,
            ``c_dayofweek``, ``c_quarter`` and ``c_is_weekend``.

    Returns:
        df_processed (pd.DataFrame): DataFrame with processed features
        encoders (dict): Dictionary of fitted LabelEncoders, keyed by column name
        domain_mapping (dict): Mapping from encoded values back to original domain_code
            values; empty when ``domain_code`` is not among the encoded columns
    """
    # Accept both Spark and pandas inputs; copy pandas input so the caller's frame stays intact.
    df_processed = df.toPandas() if hasattr(df, 'toPandas') else df.copy()
    encoders = {}
    domain_mapping = {}

    # Remove future columns (only those actually present, so a table that was
    # already stripped of them does not raise a KeyError).
    future_columns = [f'views_plus_{offset}' for offset in range(1, 8)] + ['min_views_future']
    present_future_columns = [col for col in future_columns if col in df_processed.columns]
    df_processed = df_processed.drop(columns=present_future_columns)

    if datetime_columns is None:
        datetime_columns = []

    # Detect categorical columns automatically if not provided; this must happen
    # before the datetime expansion below because raw date strings are object-dtype too.
    if cat_columns is None:
        object_columns = df_processed.select_dtypes(include=['object']).columns.tolist()
        cat_columns = [col for col in object_columns if col not in datetime_columns]

    # Process datetime columns
    for col in datetime_columns:
        # Unparseable values become NaT instead of raising.
        parsed = pd.to_datetime(df_processed[col], errors='coerce')
        df_processed[f'{col}_year'] = parsed.dt.year
        df_processed[f'{col}_month'] = parsed.dt.month
        df_processed[f'{col}_day'] = parsed.dt.day
        df_processed[f'{col}_dayofweek'] = parsed.dt.dayofweek
        df_processed[f'{col}_quarter'] = parsed.dt.quarter
        # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
        df_processed[f'{col}_is_weekend'] = (parsed.dt.dayofweek >= 5).astype(int)
        df_processed = df_processed.drop(columns=[col])

    # Process categorical columns with Label Encoding
    for col in cat_columns:
        le = LabelEncoder()
        # Cast to str so missing values and mixed types are encoded as ordinary labels.
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        encoders[col] = le

        # If it's domain_code, create inverse mapping. The encoded value of a
        # class is its position in le.classes_, so enumerate() is sufficient.
        if col == 'domain_code':
            domain_mapping = dict(enumerate(le.classes_))

    return df_processed, encoders, domain_mapping

In [0]:
# Build the pandas feature frame: 'domain_code' is label encoded and
# 'event_date' is expanded into calendar features.
df_featured, encoders, domain_mapping = preprocess_categorical_and_datetime(
    df,
    cat_columns=['domain_code'],
    datetime_columns=['event_date'],
)

In [0]:
# Class balance of the binary target: negatives per positive.
y = df_featured['churn'].astype(int)
pos = int(y.eq(1).sum())
neg = int(y.eq(0).sum())
# max(pos, 1) guards against a division by zero when there are no positives.
scale_pos_weight = neg / max(pos, 1)
print(f"Class ratio -> pos={pos}, neg={neg}, scale_pos_weight={scale_pos_weight:.2f}")

## Model

In [0]:
def run_grid_search_without_balance(X, y, param_grid, n_splits=5):
    """
    Exhaustive LightGBM grid search scored by mean ROC AUC over a shuffled
    Stratified K-Fold split. No class-imbalance handling is applied.

    Parameters:
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series
        Binary target aligned with ``X`` (rows are selected with ``.iloc``).
    param_grid : dict
        Maps each hyperparameter name to the list of values to try; every
        combination is evaluated.
    n_splits : int, default=5
        Number of cross-validation folds.

    Returns:
    best_params_full : dict
        The fixed parameters merged with the best combination found, or None
        when the grid yields no combination.
    """
    # Parameters shared by every candidate; grid values are merged on top.
    fixed_params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'force_col_wise': True,
    }
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _fold_auc(candidate, train_idx, valid_idx):
        # Fit on the training part of the fold and score its validation part.
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_set = lgb.Dataset(X_train, label=y_train)
        valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)
        booster = lgb.train(candidate, train_set, valid_sets=[valid_set])
        return roc_auc_score(y_valid, booster.predict(X_valid))

    best_score, best_params_full = -np.inf, None
    for params in ParameterGrid(param_grid):
        params_full = {**fixed_params, **params}
        scores = [
            _fold_auc(params_full, train_idx, valid_idx)
            for train_idx, valid_idx in splitter.split(X, y)
        ]
        mean_auc = float(np.mean(scores))
        print(f"Params {params} | AUC={mean_auc:.4f}")
        # Strict comparison: on ties the first combination wins.
        if mean_auc > best_score:
            best_score, best_params_full = mean_auc, params_full

    print("Best Parameters:", best_params_full)
    print("Best AUC-CV:", best_score)
    return best_params_full

In [0]:
# Target and feature matrix: every column except 'churn' is used as a feature.
y = df_featured['churn'].astype(int)
X = df_featured.drop(columns=['churn'])

# Hyperparameter values to combine; single-element lists are held fixed.
param_grid = dict(
    learning_rate=[0.05, 0.1],
    num_leaves=[31, 63],
    max_depth=[4, 6],
    min_data_in_leaf=[20],
    feature_fraction=[0.8],
    bagging_fraction=[0.8],
    bagging_freq=[1],
)

def run_grid_search(X, y, param_grid, n_splits=5, use_scale_pos_weight=True):
    """
    Exhaustive LightGBM grid search with class-imbalance handling, evaluated
    on a shuffled Stratified K-Fold split with early stopping.

    Both ROC AUC and average precision (AP) are reported for every
    combination; the winner is the combination with the highest mean AP.

    Parameters:
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series
        Binary target aligned with ``X`` (rows are selected with ``.iloc``).
    param_grid : dict
        Maps each hyperparameter name to the list of values to try; every
        combination is evaluated.
    n_splits : int, default=5
        Number of cross-validation folds.
    use_scale_pos_weight : bool, default=True
        When True, ``scale_pos_weight`` is set to negatives / positives computed
        on the full ``y``. When False, ``is_unbalance=True`` is set instead.

    Returns:
    best_params_full : dict
        The fixed parameters, the imbalance setting and the best combination
        found, or None when the grid yields no combination.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Class ratio over the whole target; max(..., 1) avoids dividing by zero.
    n_pos = int((y == 1).sum())
    n_neg = int((y == 0).sum())
    if use_scale_pos_weight:
        balance_params = {'scale_pos_weight': n_neg / max(n_pos, 1)}
    else:
        balance_params = {'is_unbalance': True}

    # Parameters shared by every candidate; AUC drives early stopping,
    # AP is computed manually per fold.
    fixed_params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'force_col_wise': True,
    }

    def _fold_scores(candidate, train_idx, valid_idx):
        # Returns (AUC, AP) of one fold, predicting with the early-stopped iteration.
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_set = lgb.Dataset(X_train, label=y_train)
        valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)
        booster = lgb.train(
            candidate,
            train_set,
            valid_sets=[valid_set],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )
        scores = booster.predict(X_valid, num_iteration=booster.best_iteration)
        return roc_auc_score(y_valid, scores), average_precision_score(y_valid, scores)

    best_score, best_params_full = -np.inf, None
    for params in ParameterGrid(param_grid):
        params_full = {**fixed_params, **params, **balance_params}

        per_fold = [
            _fold_scores(params_full, train_idx, valid_idx)
            for train_idx, valid_idx in splitter.split(X, y)
        ]
        mean_auc = float(np.mean([auc_value for auc_value, _ in per_fold]))
        mean_ap = float(np.mean([ap_value for _, ap_value in per_fold]))
        print(f"Params {params} | AUC={mean_auc:.4f} | AP={mean_ap:.4f}")

        # AP is the selection criterion; strict comparison keeps the first
        # combination on ties.
        if mean_ap > best_score:
            best_score, best_params_full = mean_ap, params_full

    print("Best Parameters:", best_params_full)
    print("Best CV (AP):", best_score)
    return best_params_full

# Run the grid search with scale_pos_weight balancing (the function's default).
best_params = run_grid_search(
    X,
    y,
    param_grid,
    n_splits=5,
    use_scale_pos_weight=True,
)

In [0]:
## Final Model
# Refit on the full dataset with the selected hyperparameters. The four
# explicit keys override whatever best_params holds for them.
# NOTE(review): no num_boost_round is passed, so the number of boosting rounds
# is LightGBM's default rather than the early-stopped iteration found during CV.
final_params = {
    **best_params,
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'random_state': 42,
}
final_train = lgb.Dataset(X, label=y)
final_model = lgb.train(final_params, final_train)

## Prediction

In [0]:
# Score every row of X (the same rows the final model was fitted on) and
# attach the predicted probability as 'risk_score' on a copy of df_featured.
preds = final_model.predict(X)
df_results = df_featured.assign(risk_score=preds)

display(df_results.head(10))

## Model Explainability

In [0]:
## Shap
# Explain the final model on the full feature matrix.
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X)

# First the bar summary, then the default summary plot (plot_type=None).
for summary_plot_type in ("bar", None):
    shap.summary_plot(shap_values, X, plot_type=summary_plot_type)

## Model Evaluation Metrics

In [0]:
# NOTE: df_results['risk_score'] was produced by scoring the same rows the final
# model was trained on, so every metric and plot in this cell is an in-sample
# (optimistic) estimate, not a hold-out evaluation.

# 1) Prepare data
THR = 0.5  # default threshold
y_true = df_results['churn'].astype(int).values # ground truth (0/1)
y_prob = df_results['risk_score'].astype(float).values # predicted probabilities in [0, 1]

# Keep only valid probabilities (NaN fails both comparisons and is dropped too)
mask = (y_prob >= 0.0) & (y_prob <= 1.0)
y_true = y_true[mask]
y_prob = y_prob[mask]

# 2) Choose a better threshold (maximize F1)
prec, rec, thr_grid = precision_recall_curve(y_true, y_prob)
# precision_recall_curve returns one more precision/recall value than thresholds:
# prec[i] / rec[i] describe the rule "score >= thr_grid[i]", and the extra trailing
# point (precision=1, recall=0) has no threshold. Drop that point so that the
# position of the best F1 indexes thr_grid directly.
f1_scores = (2 * prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
if len(thr_grid) > 0:
    best_idx = int(np.nanargmax(f1_scores))
    best_thr = thr_grid[best_idx]
else:
    best_thr = THR
print(f"Suggested threshold by max-F1: {best_thr:.3f}")
# Use either the suggested threshold or stick to 0.5
THR = float(best_thr)  # or keep THR = 0.5

# 3) Hard predictions (same ">=" rule the precision/recall values were computed with)
y_pred = (y_prob >= THR).astype(int)

# 4) Score distribution
plt.figure()
plt.hist(y_prob, bins=np.linspace(0, 1, 51))
plt.axvline(THR, linestyle='--')
plt.xlabel('risk_score')
plt.ylabel('count')
plt.title('Risk score distribution')
plt.show()

# 5) Confusion Matrix
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1]).plot(values_format='d')
plt.title(f'Confusion Matrix (thr={THR:.3f})')
plt.show()

# 6) ROC curve & AUC
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend()
plt.show()

# 7) Precision–Recall curve & Average Precision
# prec / rec from step 2 are reused; they were computed on the same y_true / y_prob.
ap = average_precision_score(y_true, y_prob)
plt.figure()
plt.plot(rec, prec, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision–Recall curve')
plt.legend()
plt.show()

# 8) Detailed metrics
print(classification_report(y_true, y_pred, digits=4))
print(f"AUC: {roc_auc:.4f} | AP: {ap:.4f} | Threshold used: {THR:.3f}")


## Output Table

In [0]:
# Threshold for classification
# NOTE(review): this resets THR to 0.5, so 'predicted_churn' written below does not
# use the max-F1 threshold selected in the evaluation cell. Confirm this is intended.
THR = 0.5

# If a non-empty mapping is available, add original domain_code.
# (An empty dict means domain_code was not encoded; mapping through it would
# only produce an all-null column.)
if domain_mapping and 'domain_code' in df_results.columns:
    df_results['domain_code_original'] = df_results['domain_code'].map(domain_mapping)
    print("Original domain_code values recovered")
    print(f"Mapping example: {dict(list(domain_mapping.items())[:5])}")

# Convert the pandas results back to a Spark DataFrame
vw_model_prediction = spark.createDataFrame(df_results)

# Build event_date and predicted churn
vw_model_prediction = (
    vw_model_prediction
    # Rebuild the date from the year/month/day features created during preprocessing.
    .withColumn(
        "event_date",
        F.make_date(
            F.col("event_date_year").cast("int"),
            F.col("event_date_month").cast("int"),
            F.col("event_date_day").cast("int")
        )
    )
    .withColumn("risk_score", F.col("risk_score").cast("double"))
    .withColumn("predicted_churn", (F.col("risk_score") >= F.lit(THR)).cast("int"))
    # The derived calendar features are not part of the output table.
    .drop("event_date_year", "event_date_month", "event_date_day", "event_date_dayofweek", "event_date_quarter", "event_date_is_weekend")
)

# Save as Delta table (replaces the existing contents of the table)
(vw_model_prediction
    .write
    .format("delta")
    .option("mergeSchema", True)
    .mode("overwrite")
    .saveAsTable("projectviews.default.vw_model_prediction")
)

print("vw_model_prediction table successfully created")

In [0]:
# Show the column names of the DataFrame that was written to the output table.
vw_model_prediction.columns