In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import optuna
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
import shap
from pyspark.sql import functions as F
from pyspark.sql import Window

## Data Collection

In [0]:
# Read the daily summary table into a Spark DataFrame.
# NOTE(review): `spark` is not defined in this notebook; presumably the session injected by the runtime.
df_gold = spark.read.table("projectviews.gold.projectviews_daily_summary")

In [0]:
# Print the column names and types of the loaded table.
df_gold.printSchema()

## Define Churn Conditions

In [0]:

def detect_churn(df_gold, n=7, threshold_factor=0.3):
    """
    Label churn on a per-domain daily views time series.

    A row is labeled churn when the lowest view count observed over the next
    `n` rows of the same domain is at or below `threshold_factor` times the
    average of the (up to) 3 rows preceding it.

    Parameters:
    - df_gold: Spark DataFrame (anything exposing `.toPandas()`) or pandas
      DataFrame with columns ['domain_code', 'event_date', 'count_views'].
      The input is not modified.
    - n: int >= 1, number of future rows to consider for churn detection (default 7)
    - threshold_factor: float, factor to multiply by past average views to define churn threshold (default 0.3)

    Returns:
    - df: pandas DataFrame sorted by domain and date, with additional columns:
        - avg_views_past_3d: rolling average views of past 3 rows (excluding the current one)
        - views_plus_i: views for each of the next n rows (i = 1..n)
        - min_views_future: minimum views over the next n rows
        - threshold: threshold value for churn detection
        - churn: 1 if minimum future views <= threshold, else 0
      Rows without a past average or without a COMPLETE n-row future window
      are removed, since their label would be based on partial information.

    Raises:
    - ValueError: if n < 1.

    Note: "past" and "future" are positional (shift by rows within a domain).
    They only correspond to calendar days if every domain has exactly one row
    per day with no gaps -- TODO confirm against the source table.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    # Accept both Spark and pandas input; sort_values returns a new frame, so
    # a pandas input is never mutated.
    source = df_gold.toPandas() if hasattr(df_gold, 'toPandas') else df_gold
    df = source.sort_values(['domain_code', 'event_date'])

    views_by_domain = df.groupby('domain_code')['count_views']

    # Rolling average of the previous 3 rows (shifted by 1 to exclude the current row)
    df['avg_views_past_3d'] = views_by_domain.transform(
        lambda x: x.rolling(3, min_periods=1).mean().shift(1)
    )

    # One column per look-ahead step 1..n
    future_cols = []
    for i in range(1, n + 1):
        col = f'views_plus_{i}'
        df[col] = views_by_domain.shift(-i)
        future_cols.append(col)

    # skipna=False: if any of the n future values is missing the minimum is NaN,
    # so rows with a truncated window are dropped below instead of being labeled
    # from fewer than n days.
    df['min_views_future'] = df[future_cols].min(axis=1, skipna=False)

    # Threshold for churn based on past average views and threshold_factor
    df['threshold'] = df['avg_views_past_3d'] * threshold_factor

    # Label churn = 1 if minimum future views <= threshold, else 0
    df['churn'] = np.where(df['min_views_future'] <= df['threshold'], 1, 0)

    # Remove rows where the future window is incomplete or the past average is missing
    df = df.dropna(subset=['min_views_future', 'avg_views_past_3d'])

    return df

# Build the labeled pandas frame with a 7-step look-ahead and a 0.3 threshold factor.
df_fe = detect_churn(df_gold, n=7, threshold_factor=0.3)

In [0]:
# df_fe is a pandas DataFrame here, so .count() returns the number of non-null
# values per column (not a single row count as it would on a Spark DataFrame).
df_fe.count()

### Analysis

In [0]:
# Convert the labeled pandas frame back to Spark and register it as a temp view for the SQL cells below.
spark.createDataFrame(df_fe).createOrReplaceTempView("my_temp_view")

In [0]:
%sql
-- Number of rows per churn label (class balance of the labeled data).
SELECT churn, count(*) FROM my_temp_view GROUP by 1

In [0]:
%sql
-- Inspect the rows that were labeled as churn.
SELECT * FROM my_temp_view where churn =1 

In [0]:
# Plot the daily view counts of one domain to eyeball its trend.
df_dom = df_fe.copy()

df_dom["event_date"] = pd.to_datetime(df_dom["event_date"])
df_dom_ex = df_dom.loc[df_dom["domain_code"] == 'ab.m.d']

plt.figure(figsize=(10,5))
# The line needs a label: plt.legend() with no labeled artists only emits a
# warning and draws no legend.
plt.plot(df_dom_ex["event_date"], df_dom_ex["count_views"], marker='o', label='ab.m.d')
plt.title("Daily Active Views Trend for ab.m.d")
plt.xlabel("Date")
plt.ylabel("Views")
plt.legend()
plt.show()

In [0]:
# Plot the daily view counts of one domain to eyeball its trend.
df_dom = df_fe.copy()

df_dom["event_date"] = pd.to_datetime(df_dom["event_date"])
df_dom_ex = df_dom.loc[df_dom["domain_code"] == 'ace']

plt.figure(figsize=(10,5))
# The line needs a label: plt.legend() with no labeled artists only emits a
# warning and draws no legend.
plt.plot(df_dom_ex["event_date"], df_dom_ex["count_views"], marker='o', label='ace')
plt.title("Daily Active Views Trend for ace")
plt.xlabel("Date")
plt.ylabel("Views")
plt.legend()
plt.show()

### Report

In [0]:
### 1) Churn and Retention Funnel (overall churn)

# One window per domain: used to attach each domain's earliest event_date to all of its rows.
window_spec = Window.partitionBy("domain_code")

# Keep only the reporting columns and add the per-domain first date as `join_date`.
df_report = (
    spark.createDataFrame(df_fe)
    .select("domain_code", "event_date", "count_views", "churn")
    .withColumn("join_date", F.min("event_date").over(window_spec))
)

# Replace the contents of the reporting table with the current result.
(
    df_report.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("projectviews.gold.vw_churn_retention")
)


In [0]:
# Print a preview of the report DataFrame.
df_report.show()

## Processing

In [0]:
def preprocess_categorical_and_datetime(df, cat_columns=None, datetime_columns=None, dropna_columns=None):
    """
    Preprocess categorical and datetime columns.

    - Drop rows with nulls in `dropna_columns` (only columns that exist in `df`)
    - Convert datetime columns to datetime type and extract calendar features
    - Label encode categorical columns

    The input frame is never modified; all work happens on a copy.

    Parameters:
        df (pd.DataFrame): Original dataframe
        cat_columns (list): Categorical columns to encode. If None, every
            object-dtype column that is not listed in `datetime_columns` is used.
        datetime_columns (list): Datetime columns to process (optional). The
            converted column itself is kept alongside the derived features.
        dropna_columns (list): Columns that must be non-null. Defaults to
            'views_plus_2' .. 'views_plus_7'. Names missing from `df` are ignored.

    Returns:
        df_processed (pd.DataFrame): DataFrame with processed features
        encoders (dict): Dictionary of LabelEncoders for categorical columns
    """
    encoders = {}

    # Remove nulls. The filtered frame is the one that gets processed and returned
    # (filtering a throwaway variable would leave the null rows in the output).
    if dropna_columns is None:
        dropna_columns = [f'views_plus_{i}' for i in range(2, 8)]
    # Restricting to existing columns avoids a KeyError on frames that do not
    # carry the look-ahead columns.
    cols_to_check = [col for col in dropna_columns if col in df.columns]
    if cols_to_check:
        df_processed = df.dropna(subset=cols_to_check).copy()
    else:
        df_processed = df.copy()

    if datetime_columns is None:
        datetime_columns = []

    # Detect categorical columns automatically if not provided
    if cat_columns is None:
        cat_columns = [
            col
            for col in df_processed.select_dtypes(include=['object']).columns
            if col not in datetime_columns
        ]

    # Process datetime columns; unparseable values become NaT (errors='coerce')
    for col in datetime_columns:
        df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce')
        parts = df_processed[col].dt
        df_processed[f'{col}_year'] = parts.year
        df_processed[f'{col}_month'] = parts.month
        df_processed[f'{col}_day'] = parts.day
        df_processed[f'{col}_dayofweek'] = parts.dayofweek
        df_processed[f'{col}_quarter'] = parts.quarter
        # dayofweek: Monday=0 ... Sunday=6, so >= 5 means Saturday/Sunday
        df_processed[f'{col}_is_weekend'] = (parts.dayofweek >= 5).astype(int)

    # Process categorical columns with Label Encoding (values are stringified first)
    for col in cat_columns:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        encoders[col] = le

    return df_processed, encoders


In [0]:
cat_cols = ['domain_code']          # column to label-encode
datetime_cols = ['event_date']      # column to expand into year/month/day/... features

# Returns the processed frame plus the fitted encoders keyed by column name.
df_processed, encoders = preprocess_categorical_and_datetime(df_fe, cat_columns=cat_cols, datetime_columns=datetime_cols)


In [0]:
# Display the processed frame as the cell output.
df_processed

## Train/Test Split

In [0]:
def train_test_split(df, date_column='event_date', target_column='churn', test_size=0.2):
    """
    Split the dataframe into train and test sets based on date order.

    Rows are sorted by `date_column`; roughly the latest `test_size` fraction
    of rows goes to test and the rest to train. The cut is then moved to the
    nearest earlier date boundary so that no date appears in both sets (a
    plain row-index cut can leave the same day on both sides when many rows
    share a date). If the boundary date is the earliest date, the cut is moved
    forward instead so the train set is not empty. When the frame holds a
    single distinct date the plain row-based cut is kept.

    Parameters:
        df (pd.DataFrame): Frame containing features, `date_column` and `target_column`.
        date_column (str): Column used for ordering; dropped from the features.
        target_column (str): Label column; dropped from the features.
        test_size (float): Fraction in [0, 1] of rows targeted for the test set.

    Returns:
        X_train, X_test, y_train, y_test

    Raises:
        ValueError: if test_size is outside [0, 1].

    NOTE(review): this function does not leave a gap between train and test.
    If the target is computed from rows later than the feature row, the last
    train rows may still carry labels derived from the test period.
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    # Stable sort keeps the original relative order of rows sharing a date,
    # which makes the split reproducible.
    ordered = df.sort_values(date_column, kind='stable')
    dates = ordered[date_column]

    cutoff_idx = int(len(ordered) * (1 - test_size))

    # Align the cut with a date boundary (only meaningful for an interior cut
    # over at least two distinct, non-null dates).
    if 0 < cutoff_idx < len(ordered) and dates.nunique() > 1:
        boundary = dates.iloc[cutoff_idx]
        if not pd.isna(boundary):
            if boundary == dates.iloc[0]:
                # Boundary is the earliest date: test starts at the next date.
                cutoff_idx = int((dates <= boundary).sum())
            else:
                # The whole boundary date goes to test.
                cutoff_idx = int((dates < boundary).sum())

    train_df = ordered.iloc[:cutoff_idx]
    test_df = ordered.iloc[cutoff_idx:]

    non_feature_cols = [target_column, date_column]

    X_train = train_df.drop(columns=non_feature_cols)
    y_train = train_df[target_column]

    X_test = test_df.drop(columns=non_feature_cols)
    y_test = test_df[target_column]

    return X_train, X_test, y_train, y_test

In [0]:
# `views_plus_*` and `min_views_future` are read from the rows AFTER each observation
# and are exactly what `churn` is computed from. Leaving them in the feature matrix
# lets the model reconstruct the label directly (target leakage), so drop them first.
leakage_cols = [
    col for col in df_processed.columns
    if str(col).startswith('views_plus_') or col == 'min_views_future'
]

X_train, X_test, y_train, y_test = train_test_split(
    df_processed.drop(columns=leakage_cols),
    date_column='event_date',
    target_column='churn',
    test_size=0.2,
)


## Model

In [0]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score

# Build the LightGBM datasets. The validation set is created with
# reference=train_data so that it is binned with the training set's feature bins.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Negative/positive ratio used to re-weight the positive class.
# Falls back to 1.0 (no re-weighting) when the training set has no positives.
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos

# Basic LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'scale_pos_weight': n_neg / n_pos if n_pos else 1.0,  # compensates class imbalance
    'learning_rate': 0.1,
    'num_leaves': 31
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    callbacks=[lgb.log_evaluation(10)]  # log the validation metric every 10 iterations
)

# Predict on the test set
y_pred_proba = model.predict(X_test)

# Evaluate
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")

In [0]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

def cross_validate_lgb(X, y, params, n_splits=5):
    """
    Run stratified K-fold cross-validation of a LightGBM model and plot SHAP summaries.

    For each fold a model is trained on the training part and scored on the
    held-out part with ROC-AUC and PR-AUC (average precision). Mean and
    standard deviation of both metrics are printed. SHAP values are then
    computed on the full `X` using the model trained in the LAST fold only,
    and two summary plots (bar and default) are drawn.

    Parameters:
        X (pd.DataFrame): Feature matrix (indexed positionally via .iloc).
        y (pd.Series): Binary target aligned with X.
        params (dict): Parameters passed to lgb.train.
        n_splits (int): Number of folds (default 5).

    Returns:
        dict: {'roc_auc': [...], 'pr_auc': [...]} with one score per fold.

    NOTE(review): folds are shuffled (shuffle=True, random_state=42), so rows
    from the same period end up in both train and validation parts; if the
    data is a time series, these scores are likely optimistic compared with a
    chronological split.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    roc_auc_scores = []
    pr_auc_scores = []
    model = None

    for train_idx, valid_idx in cv.split(X, y):
        X_fold_train, X_fold_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_fold_train, y_fold_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
        valid_data = lgb.Dataset(X_fold_valid, label=y_fold_valid, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            callbacks=[lgb.log_evaluation(10)]
        )

        y_pred = model.predict(X_fold_valid)
        roc_auc_scores.append(roc_auc_score(y_fold_valid, y_pred))
        pr_auc_scores.append(average_precision_score(y_fold_valid, y_pred))

    print(f"Mean ROC-AUC: {np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}")
    print(f"Mean PR-AUC: {np.mean(pr_auc_scores):.4f} ± {np.std(pr_auc_scores):.4f}")

    # SHAP values from the last fold's model, evaluated on all rows of X
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    # Depending on the shap version, a binary classifier may yield one array per
    # class (list) or a 3-D array with a trailing class axis instead of a single
    # (n_samples, n_features) array. Keep the positive class so both plots below
    # describe the same quantity. TODO confirm against the installed shap version.
    if isinstance(shap_values, list):
        shap_values = shap_values[-1]
    elif getattr(shap_values, 'ndim', 2) == 3:
        shap_values = shap_values[:, :, -1]

    # Bar summary: mean absolute SHAP value per feature
    shap.summary_plot(shap_values, X, plot_type="bar")

    # Default summary plot: per-row SHAP values for each feature
    shap.summary_plot(shap_values, X)

    return {'roc_auc': roc_auc_scores, 'pr_auc': pr_auc_scores}

# Ejemplo de parámetros básicos
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbosity': -1,
    'seed': 42,
    'scale_pos_weight': (len(y) - sum(y)) / sum(y)  # balance class imbalance
}


X = df_processed.drop(columns=['churn', 'event_date'])
y = df_processed['churn']
cross_validate_lgb(X, y, params)