In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
import shap
from pyspark.sql import functions as F
from pyspark.sql import Window
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve, average_precision_score


## Data Collection

In [0]:
# Read the gold-layer daily summary table; `spark` is not defined in this notebook
# (presumably the session object injected by the notebook runtime — confirm).
df_gold = spark.read.table("projectviews.gold.projectviews_daily_summary")

In [0]:
# Print the column names and types of the source table before building labels.
df_gold.printSchema()

## Define Churn Conditions

In [0]:

def detect_churn(df_gold, n=7, threshold_factor=0.3):
    """
    Label churn for every (domain, day) row of a daily views time series.

    A row is flagged as churn when the lowest view count observed over the
    following ``n`` rows of the same domain is at or below
    ``threshold_factor`` times the average of the three preceding rows.

    Parameters:
    - df_gold: Spark DataFrame (anything exposing ``toPandas()``) or pandas
      DataFrame with columns ['domain_code', 'event_date', 'count_views'].
      The input is never modified.
    - n: int >= 1, number of future rows to consider for churn detection (default 7)
    - threshold_factor: float, factor multiplied by the past average views to
      define the churn threshold (default 0.3)

    Returns:
    - df: pandas DataFrame sorted by domain and date, with additional columns:
        - avg_views_past_3d: rolling average views of the 3 previous rows (current row excluded)
        - views_plus_i: views of the i-th following row, for i in 1..n
        - min_views_future: minimum of the available views_plus_i values
        - threshold: avg_views_past_3d * threshold_factor
        - churn: 1 if min_views_future <= threshold, else 0

    Raises:
    - ValueError: if n < 1.

    Notes:
    - Windows are row based (``shift``), not calendar based. They equal "days"
      only if each domain has exactly one row per consecutive day
      (NOTE(review): confirm against the source table).
    - Rows near the end of a domain with only part of the future window are
      kept: the minimum ignores the missing days. Only rows with no future
      value at all, or with no past average, are removed.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    # Accept a Spark DataFrame as well as an already-collected pandas frame.
    df = df_gold.toPandas() if hasattr(df_gold, 'toPandas') else df_gold

    # sort_values returns a new frame; the extra copy keeps the caller's data untouched.
    df = df.sort_values(['domain_code', 'event_date']).copy()

    # Rolling average of the 3 previous rows (shifted by 1 to exclude the current row)
    df['avg_views_past_3d'] = df.groupby('domain_code')['count_views']\
                                 .transform(lambda x: x.rolling(3, min_periods=1).mean().shift(1))

    # One column per future offset; shifting inside the group never leaks
    # values from the next domain.
    future_cols = [f'views_plus_{i}' for i in range(1, n + 1)]
    for i, col in enumerate(future_cols, start=1):
        df[col] = df.groupby('domain_code')['count_views'].shift(-i)

    # Minimum over the future window (NaN entries are skipped by pandas)
    df['min_views_future'] = df[future_cols].min(axis=1)

    # Churn threshold derived from the recent past
    df['threshold'] = df['avg_views_past_3d'] * threshold_factor

    # Comparisons involving NaN evaluate to False, so such rows get 0 here
    # and are removed by the dropna below.
    df['churn'] = np.where(df['min_views_future'] <= df['threshold'], 1, 0)

    # Remove rows without any future value or without a past average
    # (first and last row of each domain).
    df = df.dropna(subset=['min_views_future', 'avg_views_past_3d'])

    return df

# Build the labelled frame: 7-row look-ahead, churn when future minimum <= 30% of the past average.
df_fe = detect_churn(df_gold, n=7, threshold_factor=0.3)

In [0]:
# df_fe is a pandas DataFrame (detect_churn returns the pandas frame it builds), so
# count() reports the number of non-null values per column, not a single row count.
df_fe.count()

### Analysis

In [0]:
# Convert the pandas frame back to Spark and register it under a name the %sql cells can query.
spark.createDataFrame(df_fe).createOrReplaceTempView("my_temp_view")

In [0]:
%sql
-- Number of rows per churn label
SELECT churn, COUNT(*)
FROM my_temp_view
GROUP BY 1

In [0]:
%sql
-- All rows labelled as churn
SELECT *
FROM my_temp_view
WHERE churn = 1

In [0]:
# Daily views of a single domain, to eyeball what a labelled series looks like.
df_dom = df_fe.copy()

df_dom["event_date"] = pd.to_datetime(df_dom["event_date"])
df_dom_ex = df_dom.loc[df_dom["domain_code"] == 'ab.m.d']

plt.figure(figsize=(10,5))
# label= gives plt.legend() an entry to draw; without it the legend is empty
# and matplotlib warns that no labelled artists were found.
plt.plot(df_dom_ex["event_date"], df_dom_ex["count_views"], marker='o', label="count_views")
plt.title("Daily Active Views Trend for ab.m.d")
plt.xlabel("Date")
plt.ylabel("Views")
plt.legend()
plt.show()

In [0]:
# Daily views of a second example domain.
df_dom = df_fe.copy()

df_dom["event_date"] = pd.to_datetime(df_dom["event_date"])
df_dom_ex = df_dom.loc[df_dom["domain_code"] == 'ace']

plt.figure(figsize=(10,5))
# label= gives plt.legend() an entry to draw; without it the legend is empty
# and matplotlib warns that no labelled artists were found.
plt.plot(df_dom_ex["event_date"], df_dom_ex["count_views"], marker='o', label="count_views")
plt.title("Daily Active Views Trend for ace")
plt.xlabel("Date")
plt.ylabel("Views")
plt.legend()
plt.show()

### Report

In [0]:
### 1) Churn and Retention Funnel (overall churn)

# All rows of one domain form a single window, so the windowed minimum below
# is the earliest event_date present for that domain.
window_spec = Window.partitionBy("domain_code")

df_report = (
    spark.createDataFrame(df_fe)
    .select("domain_code", "event_date", "count_views", "churn")
    .withColumn("join_date", F.min("event_date").over(window_spec))
)

# Replace the contents of the report table with the freshly computed rows.
(
    df_report.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("projectviews.gold.vw_churn_retention")
)


In [0]:
# Preview the report rows that were written to the table.
df_report.show()

## Model

In [0]:

def preprocess_categorical_and_datetime(df, cat_columns=None, datetime_columns=None):
    """
    Preprocess categorical and datetime columns.

    - Drop rows whose future-view columns (``views_plus_2`` and later) contain nulls
    - Expand each datetime column into calendar features and drop the original
    - Label encode categorical columns

    Parameters:
        df (pd.DataFrame): Original dataframe (not modified)
        cat_columns (list): Categorical columns to encode. When None, every
            object-dtype column is used. Columns listed in datetime_columns
            are always excluded, because they are replaced by calendar features.
        datetime_columns (list): Datetime columns to process (optional)

    Returns:
        df_processed (pd.DataFrame): DataFrame with processed features
        encoders (dict): Column name -> fitted LabelEncoder
        domain_mapping (dict): Encoded integer -> original domain_code value
            (empty when 'domain_code' is not among the encoded columns)
    """
    df_processed = df.copy()
    encoders = {}
    domain_mapping = {}  # Store domain_code mapping

    datetime_columns = list(datetime_columns) if datetime_columns is not None else []

    # Remove rows with an incomplete future window. The columns are discovered
    # from the frame instead of being hard-coded to 2..7, so frames built with
    # a different look-ahead (or without future columns) no longer raise KeyError.
    # views_plus_1 stays out of the check, as it always has.
    prefix = 'views_plus_'
    cols_to_check = [
        col for col in df_processed.columns
        if isinstance(col, str)
        and col.startswith(prefix)
        and col[len(prefix):].isdecimal()
        and int(col[len(prefix):]) >= 2
    ]
    if cols_to_check:
        df_processed = df_processed.dropna(subset=cols_to_check)

    # Detect categorical columns automatically if not provided
    if cat_columns is None:
        cat_columns = df_processed.select_dtypes(include=['object']).columns.tolist()
    # Datetime columns are dropped below, so encoding them afterwards would fail.
    cat_columns = [col for col in cat_columns if col not in datetime_columns]

    # Process datetime columns
    for col in datetime_columns:
        # Unparseable values become NaT; their derived features are NaN.
        df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce')
        df_processed[f'{col}_year'] = df_processed[col].dt.year
        df_processed[f'{col}_month'] = df_processed[col].dt.month
        df_processed[f'{col}_day'] = df_processed[col].dt.day
        df_processed[f'{col}_dayofweek'] = df_processed[col].dt.dayofweek
        # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday
        df_processed[f'{col}_is_weekend'] = (df_processed[col].dt.dayofweek >= 5).astype(int)
        df_processed[f'{col}_quarter'] = df_processed[col].dt.quarter
        df_processed = df_processed.drop(columns=[col])

    # Process categorical columns with Label Encoding
    for col in cat_columns:
        le = LabelEncoder()
        # astype(str) also turns missing values into a regular string category
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        encoders[col] = le

        # LabelEncoder assigns code i to classes_[i], so enumerating the
        # classes yields the inverse mapping directly.
        if col == 'domain_code':
            domain_mapping = dict(enumerate(le.classes_))

    return df_processed, encoders, domain_mapping

def cross_validate_lgb_explain(X, y, params, domain_mapping=None, n_splits=5):
    """
    Cross-validate a LightGBM model and collect out-of-fold SHAP explanations.

    Every row receives its risk score and SHAP values from the fold in which
    it was held out, so the returned scores are out-of-fold predictions.

    Parameters:
        X (pd.DataFrame): Features
        y (pd.Series): Binary target, aligned with X
        params (dict): LightGBM parameters
        domain_mapping (dict): Mapping from encoded domain_code to original values
        n_splits (int): Number of CV folds

    Returns:
        results_df (pd.DataFrame): X plus 'risk_score', 'true_label', one
            'shap_<feature>' column per feature and, when a mapping is given
            and X has a 'domain_code' column, 'domain_code_original'.

    Side effects:
        Prints mean ROC-AUC / PR-AUC and draws two SHAP summary figures
        (bar and beeswarm) without showing them.
    """
    # NOTE(review): folds are shuffled and stratified on the label only, so
    # rows of the same domain and neighbouring dates can land in both train
    # and validation — confirm this is acceptable for a time series.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    roc_auc_scores = []
    pr_auc_scores = []
    all_preds = pd.Series(index=X.index, dtype=float)  # store risk scores

    # store shap values from all folds, one row per sample of X
    shap_values_all = np.zeros(X.shape)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            callbacks=[lgb.log_evaluation(10)]
        )

        # Risk score predictions
        y_pred = model.predict(X_valid)
        all_preds.iloc[valid_idx] = y_pred

        # Metrics
        roc_auc = roc_auc_score(y_valid, y_pred)
        pr_auc = average_precision_score(y_valid, y_pred)
        roc_auc_scores.append(roc_auc)
        pr_auc_scores.append(pr_auc)

        # SHAP by fold
        explainer = shap.TreeExplainer(model)
        shap_values_fold = explainer.shap_values(X_valid)
        # Depending on the shap version, a binary LightGBM model yields a
        # single (rows, features) array, a [class0, class1] list, or a
        # (rows, features, classes) array. Keep the positive class so the
        # assignment below always receives a 2-D array.
        if isinstance(shap_values_fold, list):
            shap_values_fold = shap_values_fold[-1]
        shap_values_fold = np.asarray(shap_values_fold)
        if shap_values_fold.ndim == 3:
            shap_values_fold = shap_values_fold[:, :, -1]
        shap_values_all[valid_idx, :] = shap_values_fold

    print(f"Mean ROC-AUC: {np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}")
    print(f"Mean PR-AUC: {np.mean(pr_auc_scores):.4f} ± {np.std(pr_auc_scores):.4f}")

    # Create results DataFrame
    shap_df = pd.DataFrame(shap_values_all, columns=X.columns, index=X.index)
    results_df = X.copy()
    results_df['risk_score'] = all_preds
    results_df['true_label'] = y
    results_df = pd.concat([results_df, shap_df.add_prefix("shap_")], axis=1)

    # Recover original domain_code values if mapping exists
    if domain_mapping is not None and 'domain_code' in results_df.columns:
        results_df['domain_code_original'] = results_df['domain_code'].map(domain_mapping)
        print("Original domain_code values recovered")
        print(f"Mapping example: {dict(list(domain_mapping.items())[:5])}")

    # Global plots. With show=False summary_plot draws on the current figure,
    # so each plot gets its own figure; otherwise the beeswarm is painted on
    # top of the bar chart.
    plt.figure()
    shap.summary_plot(shap_values_all, X, plot_type="bar", show=False)
    plt.figure()
    shap.summary_plot(shap_values_all, X, show=False)

    return results_df

# Complete usage example:
def run_complete_pipeline(df_fe):
    """
    Run preprocessing, cross-validated training and SHAP explanation end to end.

    Parameters:
        df_fe (pd.DataFrame): Labelled frame containing a 'churn' column, an
            'event_date' column and the feature columns.

    Returns:
        results_df (pd.DataFrame): Features, risk scores, true labels and SHAP values
        encoders (dict): Column name -> fitted LabelEncoder
        domain_mapping (dict): Encoded domain_code -> original domain_code
    """
    print("Starting complete pipeline...")

    # 1. Preprocessing
    df_processed, encoders, domain_mapping = preprocess_categorical_and_datetime(
        df_fe,
        datetime_columns=['event_date']  # Specify datetime columns
    )

    print("Preprocessing completed")
    print(f"Shape after preprocessing: {df_processed.shape}")

    # 2. Prepare data for model
    # views_plus_* and min_views_future are the future observations the churn
    # label is computed from. They are unknown at prediction time, and keeping
    # them as features lets the model read the answer (target leakage).
    leakage_cols = [
        col for col in df_processed.columns
        if isinstance(col, str)
        and (col.startswith('views_plus_') or col == 'min_views_future')
    ]
    X = df_processed.drop(columns=['churn'] + leakage_cols)
    y = df_processed['churn']

    # 3. Model parameters (you can use Optuna-optimized ones)
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 100,
        'max_depth': 6,
        'min_data_in_leaf': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'random_state': 42,
        'force_col_wise': True
    }

    # 4. Cross-validation with SHAP and domain_code recovery
    results_df = cross_validate_lgb_explain(X, y, params, domain_mapping)

    print(f"\n Final results:")
    print(f"Columns in results: {results_df.columns.tolist()}")

    # Show examples with original domain_code
    if 'domain_code_original' in results_df.columns:
        print(f"\n Sample results with original domain_code:")
        sample_cols = ['domain_code', 'domain_code_original', 'risk_score', 'true_label']
        available_cols = [col for col in sample_cols if col in results_df.columns]
        print(results_df[available_cols].head(10))

    return results_df, encoders, domain_mapping

# Run the complete pipeline on the labelled frame.
results_df, encoders, domain_mapping = run_complete_pipeline(df_fe)

In [0]:
# Columns returned by the pipeline: features, risk_score, true_label and the shap_* values.
results_df.columns

In [0]:
# Inspect the per-row results.
results_df

In [0]:
### 3) Model outputs

THR = 0.5  # risk_score at or above this value is flagged as predicted churn

# The pipeline replaced event_date with year/month/day features; rebuild the date from them.
event_date_col = F.make_date(
    F.col("event_date_year").cast("int"),
    F.col("event_date_month").cast("int"),
    F.col("event_date_day").cast("int"),
)
predicted_churn_col = (F.col("risk_score") >= F.lit(THR)).cast("int")

# No projection is applied: every column of results_df (features and shap_* values) is kept.
vw_model_prediction = (
    spark.createDataFrame(results_df)
    .withColumn("event_date", event_date_col)
    .withColumn("risk_score", F.col("risk_score").cast("double"))
    .withColumn("predicted_churn", predicted_churn_col)
)


In [0]:
# Persist the predictions as a Delta table, replacing the previous contents.
# mergeSchema is set so a changed column set can be merged into the table schema
# (NOTE(review): confirm this is the intended schema-evolution behaviour for overwrite).
(vw_model_prediction
.write
.format("delta")
.option("mergeSchema", True)
.mode("overwrite")
.saveAsTable("projectviews.gold.vw_model_prediction")
)

In [0]:
%sql
-- Read back the persisted predictions
SELECT *
FROM projectviews.gold.vw_model_prediction

## Metrics

In [0]:

THR = 0.5  # decision threshold applied to risk_score

# Labels and scores as plain arrays; the mask keeps only scores inside [0, 1]
# (comparisons with NaN are False, so missing scores are removed as well).
y_true = results_df['true_label'].astype(int).values
y_prob = results_df['risk_score'].astype(float).values
mask = (y_prob >= 0.0) & (y_prob <= 1.0)
y_true, y_prob = y_true[mask], y_prob[mask]
y_pred = (y_prob >= THR).astype(int)

# Score distribution with the decision threshold marked
fig, ax = plt.subplots()
ax.hist(y_prob, bins=np.linspace(0, 1, 51))
ax.axvline(THR, linestyle='--')
ax.set_xlabel('risk_score')
ax.set_ylabel('count')
ax.set_title('Distribución de risk_score')
plt.show()

# Confusion matrix at the chosen threshold
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
cm_display.plot(values_format='d')
cm_display.ax_.set_title(f'Matriz de confusión (thr={THR})')
plt.show()

# ROC curve, with the diagonal as the chance-level reference
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC curve')
ax.legend()
plt.show()

# Precision-recall curve and average precision
prec, rec, _ = precision_recall_curve(y_true, y_prob)
ap = average_precision_score(y_true, y_prob)
fig, ax = plt.subplots()
ax.plot(rec, prec, label=f'AP = {ap:.3f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision–Recall curve')
ax.legend()
plt.show()