In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Directory holding the input parquet files. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
data_path = Path("../data")

In [54]:
# Load the three raw inputs in one pass: withdrawal requests, the advertiser
# dimension table, and the monthly advertiser snapshot (in that order).
df_withdrawals, df_advertiser, df_monthly = (
    pd.read_parquet(data_path / parquet_name)
    for parquet_name in (
        "zrive_advertiser_withdrawals.parquet",
        "zrive_dim_advertiser.parquet",
        "zrive_fct_monthly_snapshot_advertiser.parquet",
    )
)

In [55]:
# Withdrawals 
def add_churn(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a binary ``churn`` column (1 = churn, 0 = not).

    A row is labelled as churn only when all three conditions hold:

    * ``withdrawal_type`` equals ``"TOTAL"``;
    * ``withdrawal_status`` is not ``"Denegada"``;
    * ``withdrawal_reason`` is not one of the excluded reasons listed below
      (presumably contract changes / upsells rather than real exits —
      TODO confirm with the data owners).

    Missing values are not treated specially: a NaN status compares as
    "not Denegada" and a NaN reason is "not excluded", so such rows can still
    be labelled as churn when the type is ``"TOTAL"``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``withdrawal_type``, ``withdrawal_status`` and
        ``withdrawal_reason``.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is left untouched so re-running a cell never
        sees a half-processed frame.
    """
    CHURN_REASONS_EXCLUDED = [
        'Upselling-cambio de contrato',
        'Cambio a Bundle Online',
        'Cambio de Contrato/propuesta/producto'
    ]
    is_churn = (
        (df["withdrawal_type"] == "TOTAL") &
        (df["withdrawal_status"] != "Denegada") &
        (~df["withdrawal_reason"].isin(CHURN_REASONS_EXCLUDED))
    )
    # assign() builds a new DataFrame instead of writing into the caller's object.
    return df.assign(churn=is_churn.astype(int))

def convert_datetime_to_month_period(df, datetime_col, new_col, drop_original=True):
    """Return a copy of ``df`` with ``datetime_col`` converted to a monthly period.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    datetime_col : str
        Column holding datetimes (or values ``pd.to_datetime`` can parse).
    new_col : str
        Name of the column that receives the monthly ``Period`` values.
    drop_original : bool, default True
        Drop ``datetime_col`` from the result. Ignored when
        ``new_col == datetime_col``, because dropping would delete the
        freshly converted column.

    Returns
    -------
    pd.DataFrame
        A new frame with ``new_col`` added (and ``datetime_col`` removed when
        requested).
    """
    # Work on a copy so the caller's frame does not silently gain new_col.
    out = df.copy()
    out[new_col] = pd.to_datetime(out[datetime_col]).dt.to_period('M')
    # In-place conversion (same column name) must keep the converted column.
    if drop_original and new_col != datetime_col:
        out = out.drop(columns=[datetime_col])
    return out

def add_predict_month(df: pd.DataFrame, predict_col: str = "predict_month", withdrawal_col: str = "withdrawal_month", n: int = 1) -> pd.DataFrame:
    """Return a copy of ``df`` with a prediction month ``n`` periods before the withdrawal.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    predict_col : str
        Name of the column to create.
    withdrawal_col : str
        Column to shift back. The subtraction ``column - n`` must be defined
        for its dtype (the notebook passes a monthly period column, so ``n``
        is a number of months).
    n : int, default 1
        How many periods before the withdrawal the prediction is made.

    Returns
    -------
    pd.DataFrame
        A new frame with ``predict_col`` added.
    """
    # Copy first: writing into the argument would leak state into the caller's frame.
    out = df.copy()
    out[predict_col] = out[withdrawal_col] - n
    return out


In [56]:
# Columns that are only needed to derive `predict_month` / `churn` and are
# removed once those two columns exist.
WITHDRAWAL_COLS_TO_DROP = ["withdrawal_id", "withdrawal_status", "withdrawal_type", "withdrawal_reason", "withdrawal_month"]

# Pipeline: creation date -> withdrawal month -> prediction month (one month
# earlier) -> churn label -> drop helper columns.
# NOTE: the name `df_withdrawals` is rebound to the processed frame, so this
# cell cannot be re-run without re-running the loading cell first.
df_withdrawals = (
    df_withdrawals
    .pipe(
        convert_datetime_to_month_period,
        datetime_col='withdrawal_creation_date',
        new_col='withdrawal_month',
        drop_original=True,
    )
    .pipe(add_predict_month, n=1)
    .pipe(add_churn)
    .drop(columns=WITHDRAWAL_COLS_TO_DROP)
)

In [57]:
# Preview the prepared withdrawals frame: advertiser id, prediction month and churn label.
df_withdrawals

Unnamed: 0,advertiser_zrive_id,predict_month,churn
0,257,2012-05,1
1,219,2012-05,1
2,487,2012-05,1
3,476,2012-05,1
4,452,2012-05,1
...,...,...,...
20674,5441,2024-11,1
20675,5439,2024-11,1
20676,154,2024-11,1
20677,1352,2024-11,1


In [58]:
# Advertiser
def add_months_active(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``months_active`` column.

    ``months_active`` is the calendar-month difference between ``updated_at``
    and ``min_start_contrato_date``: ``(year diff) * 12 + (month diff)``.
    Day-of-month is ignored, so 31 Jan -> 1 Feb counts as one month.

    In the returned frame both date columns are parsed with
    ``pd.to_datetime(..., errors='coerce')``; values that cannot be parsed
    become ``NaT`` and produce a missing ``months_active``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``updated_at`` and ``min_start_contrato_date``. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        A new frame with parsed date columns plus ``months_active``.
    """
    # Copy first so the caller's raw frame keeps its original columns and dtypes.
    out = df.copy()
    out['updated_at'] = pd.to_datetime(out['updated_at'], errors='coerce')
    out['min_start_contrato_date'] = pd.to_datetime(out['min_start_contrato_date'], errors='coerce')

    last_update = out['updated_at'].dt
    first_contract = out['min_start_contrato_date'].dt
    out['months_active'] = (
        (last_update.year - first_contract.year) * 12 +
        (last_update.month - first_contract.month)
    )
    return out

In [59]:
# Columns removed after the derived features (`months_active`,
# `advertiser_group`, `predict_month`) have been computed from them.
ADVERTISER_COLS_TO_DROP = ["province_id", "advertiser_province", "advertiser_group_id", "min_start_contrato_date", "max_start_contrato_nuevo_date", "contrato_churn_date"]

# Pipeline: months active -> group-membership flag -> prediction month taken
# from `updated_at` -> drop source columns.
# NOTE: `df_advertiser` is rebound to the processed frame, so this cell cannot
# be re-run without re-running the loading cell first.
df_advertiser = (
    df_advertiser
    .pipe(add_months_active)
    # True when the advertiser has a group id at all.
    .assign(advertiser_group=lambda frame: frame['advertiser_group_id'].notna())
    .pipe(
        convert_datetime_to_month_period,
        datetime_col='updated_at',
        new_col='predict_month',
        drop_original=True,
    )
    .drop(columns=ADVERTISER_COLS_TO_DROP)
)

In [60]:
# Preview the prepared advertiser frame: id, months_active, advertiser_group flag and predict_month.
df_advertiser

Unnamed: 0,advertiser_zrive_id,months_active,advertiser_group,predict_month
0,6732,1,False,2025-02
1,4841,18,True,2024-08
2,2487,3,False,2025-01
3,1771,36,True,2024-11
4,3396,8,False,2023-11
...,...,...,...,...
6829,6079,3,False,2025-02
6830,4775,3,False,2025-03
6831,4712,17,True,2024-06
6832,5686,13,False,2025-03


In [67]:
# Repeat preview of the withdrawals frame; the output matches the earlier preview of the same frame.
df_withdrawals

Unnamed: 0,advertiser_zrive_id,predict_month,churn
0,257,2012-05,1
1,219,2012-05,1
2,487,2012-05,1
3,476,2012-05,1
4,452,2012-05,1
...,...,...,...
20674,5441,2024-11,1
20675,5439,2024-11,1
20676,154,2024-11,1
20677,1352,2024-11,1


In [91]:
def add_churn_column(df_advertiser: pd.DataFrame, df_withdrawals: pd.DataFrame) -> pd.DataFrame:
    """
    Combine advertiser features with withdrawal (churn) labels.

    The result has one entry for every distinct
    ('advertiser_zrive_id', 'predict_month') pair that appears in either input,
    so pairs present only in df_withdrawals are kept as well.

    Parameters:
    - df_advertiser (pd.DataFrame): Advertiser features; needs 'advertiser_zrive_id' and 'predict_month'.
    - df_withdrawals (pd.DataFrame): Needs the same two key columns plus 'churn'; may carry extra columns.

    Returns:
    - pd.DataFrame: Keys, any extra withdrawal columns, the advertiser features
      (NaN where a pair exists only in df_withdrawals) and 'churn' as the last
      column. 'churn' is 0 for pairs without a withdrawal row.

    Note: both joins are left joins on the key pair, so a pair that occurs
    several times in either input yields several rows in the result.
    Neither input frame is modified.
    """
    key_cols = ["advertiser_zrive_id", "predict_month"]

    # Every distinct key pair seen in either frame, advertiser rows first.
    key_universe = pd.concat(
        [df_advertiser[key_cols], df_withdrawals[key_cols]]
    ).drop_duplicates()

    # Attach the churn label; pairs without a withdrawal row count as not churned.
    labelled = key_universe.merge(df_withdrawals, how="left", on=key_cols)
    labelled["churn"] = labelled["churn"].fillna(0).astype(int)

    # Attach advertiser features; withdrawal-only pairs get NaN features.
    combined = labelled.merge(df_advertiser, how="left", on=key_cols)

    # Move the label to the end: pop removes it, the assignment appends it again.
    combined["churn"] = combined.pop("churn")

    return combined

def fill_advertiser_group(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing 'advertiser_group' values from other rows of the same advertiser_zrive_id.

    Within each advertiser, a missing value takes the nearest earlier non-missing
    value (forward fill); leading gaps take the next later one (backward fill).
    Values that are still missing afterwards (the advertiser has no known value)
    are set to False, and the column is returned as plain bool.

    Parameters:
    - df (pd.DataFrame): Needs 'advertiser_zrive_id' and 'advertiser_group'. It is not modified.

    Returns:
    - pd.DataFrame: Copy of df with a fully populated boolean 'advertiser_group'.
    """
    df = df.copy()

    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form,
    # which raises a FutureWarning on current pandas.
    df["advertiser_group"] = df.groupby("advertiser_zrive_id")["advertiser_group"].transform(
        lambda group_values: group_values.ffill().bfill()
    )

    # Advertisers with no known value at all default to "not in a group".
    df["advertiser_group"] = df["advertiser_group"].fillna(False).astype(bool)

    return df

In [95]:
# Combine features with churn labels, complete the group flag, then drop
# `months_active` from the resulting frame.
df_adv = (
    add_churn_column(df_advertiser, df_withdrawals)
    .pipe(fill_advertiser_group)
    .drop(columns=["months_active"])
)

  lambda x: x.fillna(method="ffill").fillna(method="bfill")
  lambda x: x.fillna(method="ffill").fillna(method="bfill")


In [96]:
# Preview the combined frame: advertiser id, predict_month, advertiser_group and the churn label.
df_adv

Unnamed: 0,advertiser_zrive_id,predict_month,advertiser_group,churn
0,6732,2025-02,False,0
1,4841,2024-08,True,0
2,2487,2025-01,False,0
3,1771,2024-11,True,0
4,3396,2023-11,False,0
...,...,...,...,...
27430,5441,2024-11,False,1
27431,5439,2024-11,False,1
27432,154,2024-11,False,1
27433,1352,2024-11,False,1
