In [83]:
import os
import pandas as pd
from pathlib import Path
import numpy as np

In [None]:
# Read the three raw parquet tables from the relative ``../data`` folder.
# NOTE(review): the snapshot file is spelled "monthly" here, while the other
# loading cell in this notebook spells it "montly" — confirm which name
# actually exists on disk.
data_path = Path("../data")
df_withdrawals = pd.read_parquet(data_path.joinpath("zrive_advertiser_withdrawals.parquet"))
df_advertiser = pd.read_parquet(data_path.joinpath("zrive_dim_advertiser.parquet"))
df_monthly = pd.read_parquet(data_path.joinpath("zrive_fct_monthly_snapshot_advertiser.parquet"))

In [85]:
try:
    # Plain .py execution: anchor on the parent of this file's directory.
    PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
except NameError:
    # Jupyter: __file__ is not defined (NameError), so use the parent of the
    # current working directory instead.
    PATH = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Normalise Windows backslashes so the "/"-separated suffixes below join cleanly.
PATH = PATH.replace("\\", "/")

df_withdrawals = pd.read_parquet(f"{PATH}/data/zrive_advertiser_withdrawals.parquet")
df_advertiser = pd.read_parquet(f"{PATH}/data/zrive_dim_advertiser.parquet")
df_monthly = pd.read_parquet(f"{PATH}/data/zrive_fct_montly_snapshot_advertiser.parquet")

In [86]:
# Withdrawals 
def add_churn(df: pd.DataFrame):
    """Add a 0/1 ``churn`` column to ``df`` in place.

    A row is flagged 1 when all three conditions hold, otherwise 0:
    ``withdrawal_type`` equals ``"TOTAL"``, ``withdrawal_status`` differs from
    ``"Denegada"``, and ``withdrawal_reason`` is not one of the
    contract-change reasons listed in ``non_churn_reasons``.

    Args:
        df: Frame with ``withdrawal_type``, ``withdrawal_status`` and
            ``withdrawal_reason`` columns. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the ``churn`` column.
    """
    # Withdrawal reasons that are excluded from the churn flag.
    non_churn_reasons = [
        'Upselling-cambio de contrato',
        'Cambio a Bundle Online',
        'Cambio de Contrato/propuesta/producto'
    ]

    is_total = df["withdrawal_type"].eq("TOTAL")
    not_denied = df["withdrawal_status"].ne("Denegada")
    reason_allowed = ~df["withdrawal_reason"].isin(non_churn_reasons)

    df["churn"] = (is_total & not_denied & reason_allowed).astype(int)
    return df

def convert_datetime_to_month_period(df, datetime_col, new_col, drop_original=True,n_month:int = 1):
    """Derive a monthly Period column and the month to predict from.

    ``new_col`` receives the calendar month (pandas ``Period`` with frequency
    ``'M'``) of ``datetime_col``. ``predict_month`` is that month shifted back
    by ``n_month`` months.

    Args:
        df: Input frame. ``new_col`` is written onto this object; when
            ``drop_original`` is False, ``predict_month`` is written onto it
            too, because the same object is returned.
        datetime_col: Name of the column holding the datetimes (anything
            ``pd.to_datetime`` accepts).
        new_col: Name of the monthly Period column to create.
        drop_original: If True, return a frame without ``datetime_col``.
        n_month: Number of months subtracted from ``new_col`` to obtain
            ``predict_month``.

    Returns:
        Frame containing ``new_col`` and ``predict_month``; ``datetime_col``
        is absent when ``drop_original`` is True.
    """
    df[new_col] = pd.to_datetime(df[datetime_col]).dt.to_period('M')
    if drop_original:
        df = df.drop(columns=[datetime_col])
    # predict_month must not depend on drop_original: it used to be assigned
    # inside the branch above, so drop_original=False returned no predict_month.
    df["predict_month"] = df[new_col] - n_month
    return df

In [87]:
#df_monthly
def convert_period_int_to_month_period(df, period_col='period_int', new_col='month_period'):
    """Convert a YYYYMM column into a pandas monthly Period column.

    Args:
        df: Input frame. ``new_col`` is written onto this object before the
            returned copy without ``period_col`` is produced.
        period_col: Name of the column holding YYYYMM values.
        new_col: Name of the monthly Period column to create.

    Returns:
        A frame with ``new_col`` added and ``period_col`` removed.
    """
    # Append a day ("01") so each value parses as a full %Y%m%d date.
    first_of_month = pd.to_datetime(df[period_col].astype(str) + '01', format='%Y%m%d')
    df[new_col] = first_of_month.dt.to_period('M')
    return df.drop(columns=[period_col])

def add_churndf_target(df_monthly, df_withdrawals):
    """Attach the churn target computed on ``df_withdrawals`` to ``df_monthly``.

    Each monthly row is matched on ``advertiser_zrive_id`` and on
    ``month_period`` == ``predict_month``. Rows without a matching withdrawal
    get ``churn`` = 0.

    Withdrawals are first collapsed to one label per advertiser and
    ``predict_month`` (the maximum ``churn``, i.e. 1 if any withdrawal in that
    month counts as churn). Without this step the left join would emit one
    copy of the monthly row per matching withdrawal.

    Args:
        df_monthly: Monthly advertiser frame with ``advertiser_zrive_id`` and
            ``month_period`` columns.
        df_withdrawals: Processed withdrawals with ``advertiser_zrive_id``,
            ``predict_month`` and ``churn`` columns.

    Returns:
        A new frame with the rows of ``df_monthly`` (same order, same count)
        plus an integer ``churn`` column.
    """
    join_keys = ['advertiser_zrive_id', 'month_period']

    # One label per (advertiser, month); the key is renamed so the merge needs
    # no redundant right-hand column that must be dropped afterwards.
    churn_labels = (
        df_withdrawals[['advertiser_zrive_id', 'predict_month', 'churn']]
        .rename(columns={'predict_month': 'month_period'})
        .groupby(join_keys, as_index=False)['churn']
        .max()
    )

    df_target = df_monthly.merge(churn_labels, on=join_keys, how='left')

    # Unmatched months mean "no churn"; the NaNs introduced by the left join
    # upcast the column to float, so cast back to int after filling.
    df_target['churn'] = df_target['churn'].fillna(0).astype(int)

    return df_target

Se unió la función `add_predict_month` a la función `convert_datetime_to_month_period`.  
En la función `convert_period_int_to_month_period` se agregó el borrado de la columna del mes `period_int` (formato YYYYMM).

In [88]:
# Bucket each withdrawal into its calendar month (which also derives
# predict_month), then flag which withdrawals count as churn.
# NOTE: this cell rebinds df_withdrawals to a frame without
# 'withdrawal_creation_date', so running it a second time without reloading
# the data raises a KeyError.
df_withdrawals = convert_datetime_to_month_period(
    df_withdrawals, 
    datetime_col='withdrawal_creation_date',
    new_col='withdrawal_month'
)
df_withdrawals = add_churn(df_withdrawals)

In [89]:
# Convert period_int (YYYYMM) into the month_period column, then left-join the
# churn label from the processed withdrawals.
# NOTE: the conversion drops period_int, so re-running this cell on the
# already-converted df_monthly raises a KeyError.
df_monthly = convert_period_int_to_month_period(df_monthly)
df_target = add_churndf_target(df_monthly, df_withdrawals)

In [90]:
# Scratch check of the leads / published-ads ratio: zero denominators are
# turned into NaN before dividing, and NaN results are then shown as 0.
(df_target["monthly_leads"] / df_target["monthly_published_ads"].replace(0, np.nan)).replace(np.nan,0)

0        0.382979
1        0.129032
2        0.000000
3        0.177215
4        0.800000
           ...   
80559    0.000000
80560    0.176471
80561    3.333333
80562    0.400000
80563    0.000000
Length: 80564, dtype: float64

In [91]:
def calculate_ratios(df):
    """Add per-ad, per-visit and per-lead ratio columns to ``df`` in place.

    Every ratio is ``numerator / denominator``. A denominator of 0 is replaced
    by NaN before dividing, and any NaN in the result is replaced by 0, so
    rows with a zero (or missing) denominator end up with a ratio of 0.

    Args:
        df: Frame holding the ``monthly_*`` columns named in ``ratio_specs``.
            It is modified in place.

    Returns:
        The same ``df`` object with the ten ratio columns added.
    """
    # (output column, numerator column, denominator column), in creation order.
    ratio_specs = [
        ("leads_per_published_ad", "monthly_leads", "monthly_published_ads"),
        ("leads_per_visit", "monthly_leads", "monthly_visits"),
        ("visits_per_published_ad", "monthly_visits", "monthly_published_ads"),
        ("calls_per_published_ad", "monthly_total_calls", "monthly_published_ads"),
        ("emails_per_published_ad", "monthly_total_emails", "monthly_published_ads"),
        ("invoice_per_published_ad", "monthly_total_invoice", "monthly_published_ads"),
        ("invoice_per_lead", "monthly_total_invoice", "monthly_leads"),
        ("oro_ratio", "monthly_oro_ads", "monthly_published_ads"),
        ("plata_ratio", "monthly_plata_ads", "monthly_published_ads"),
        ("destacados_ratio", "monthly_destacados_ads", "monthly_published_ads"),
    ]

    for ratio_col, numerator_col, denominator_col in ratio_specs:
        numerator = df[numerator_col]
        # 0 -> NaN so the division yields NaN rather than +/-inf.
        denominator = df[denominator_col].replace(0, np.nan)
        df[ratio_col] = (numerator / denominator).replace(np.nan, 0)

    # Month-over-month deltas, currently disabled:
    #df["delta_leads"] = df.groupby("advertiser_zrive_id")["monthly_leads"].diff()
    #df["delta_invoice"] = df.groupby("advertiser_zrive_id")["monthly_total_invoice"].diff()

    return df

In [92]:
# Add the ratio features; calculate_ratios writes the new columns onto
# df_target itself and returns that same object.
df_target = calculate_ratios(df_target)

In [96]:
# Columns selected from df_target for modelling, in display order.
# NOTE(review): "churn" is the label produced by add_churndf_target and is
# listed here together with the features — make sure it is split off before
# fitting a model.
columns_to_train = [
    # Raw monthly counters
    "monthly_published_ads",
    "monthly_unique_published_ads",
    "monthly_contracted_ads",
    "monthly_leads",
    "monthly_visits",
    "monthly_oro_ads",
    "monthly_plata_ads",
    "monthly_destacados_ads",
    "monthly_pepitas_ads",
    "monthly_shows",
    "monthly_total_phone_views",
    "monthly_total_calls",
    "monthly_total_emails",
    "monthly_total_invoice",
    "monthly_unique_calls",
    "monthly_unique_emails",
    "monthly_unique_leads",
    "has_active_contract",
    "churn",

    # Ratio features added by calculate_ratios
    "leads_per_published_ad",
    "leads_per_visit",
    "visits_per_published_ad",
    "calls_per_published_ad",
    "emails_per_published_ad",
    "invoice_per_published_ad",
    "invoice_per_lead",
    "oro_ratio",
    "plata_ratio",
    "destacados_ratio",

    # Left out because they contain missing data:
    # "monthly_avg_ad_price",
    # "monthly_distinct_ads",
]

In [97]:
# Preview the selected columns of df_target.
df_target[columns_to_train]

Unnamed: 0,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,...,leads_per_published_ad,leads_per_visit,visits_per_published_ad,calls_per_published_ad,emails_per_published_ad,invoice_per_published_ad,invoice_per_lead,oro_ratio,plata_ratio,destacados_ratio
0,47,47,75,18,40890.0,6,6,6,0,2051941.5,...,0.382979,0.000440,870.000000,0.319149,0.000000,9.378723,24.488889,0.127660,0.127660,0.127660
1,31,31,150,4,17970.0,10,10,4,0,1250403.0,...,0.129032,0.000223,579.677419,0.064516,0.064516,2.432258,18.850000,0.322581,0.322581,0.129032
2,0,0,0,0,0.0,0,0,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,79,79,85,14,27157.5,3,3,1,0,1142673.0,...,0.177215,0.000516,343.765823,0.101266,0.025316,3.792405,21.400000,0.037975,0.037975,0.012658
4,20,20,20,16,79492.5,0,0,1,0,1773345.0,...,0.800000,0.000201,3974.625000,0.200000,0.150000,4.325000,5.406250,0.000000,0.000000,0.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80559,7,7,10,0,2857.5,1,1,4,0,80052.0,...,0.000000,0.000000,408.214286,0.000000,0.000000,0.000000,0.000000,0.142857,0.142857,0.571429
80560,17,17,20,3,12967.5,0,0,0,0,245994.0,...,0.176471,0.000231,762.794118,0.117647,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
80561,18,18,35,60,74362.5,5,6,15,0,1404081.0,...,3.333333,0.000807,4131.250000,1.722222,0.388889,0.000000,0.000000,0.277778,0.333333,0.833333
80562,10,10,10,4,23857.5,0,0,1,0,302799.0,...,0.400000,0.000168,2385.750000,0.100000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000
