In [52]:
from pathlib import Path
from typing import List, Optional

import pandas as pd

In [53]:
# Lift the column cap so wide frames are rendered without truncation.
pd.options.display.max_columns = None

In [54]:
# Directory holding the input data; the path is relative, so it resolves
# against the working directory the notebook is run from.
data_path = Path("../data")

In [55]:
# Load the processed dataset from parquet into the working frame.
df_target = pd.read_parquet(path=data_path / "processed_data.parquet")

In [56]:
# Preview the first five rows.
df_target.head(n=5)

Unnamed: 0,advertiser_zrive_id,period_int,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,month_period,churn
0,1,202301,47,47,75,18,40890.0,6,6,6,0,2051941.5,14,15,0,440.8,12,3,15,,,True,2023-01,0.0
1,2,202301,31,31,150,4,17970.0,10,10,4,0,1250403.0,16,2,2,75.4,2,2,4,,,True,2023-01,0.0
3,4,202301,79,79,85,14,27157.5,3,3,1,0,1142673.0,10,8,2,299.6,6,6,12,,,True,2023-01,0.0
4,6,202301,20,20,20,16,79492.5,0,0,1,0,1773345.0,10,4,3,86.5,4,11,15,,,True,2023-01,0.0
5,7,202301,42,42,50,15,42217.5,1,1,1,0,1633705.5,18,5,3,196.3,4,7,11,14700.0,5.0,True,2023-01,0.0


In [57]:
# Number of missing values in every column.
nans_per_column = df_target.isna().sum()

# Keep only the columns that actually contain missing values.
nans_in_affected_columns = nans_per_column.loc[nans_per_column.gt(0)]
columnas_con_nan = nans_in_affected_columns.index.tolist()

print(nans_in_affected_columns)
print(f"Total number of rows: {len(df_target)}")

monthly_avg_ad_price    15922
monthly_distinct_ads    16537
dtype: int64
Total number of rows: 56335


In [58]:
# Columns removed from the frame. The null check earlier in this notebook
# reported 15,922 and 16,537 missing values for them out of 56,335 rows;
# presumably that is why they are dropped rather than imputed (confirm).
MONTHLY_COLUMNS_TO_REMOVE = ['monthly_avg_ad_price', 'monthly_distinct_ads']

# Rebind instead of using `inplace=True`, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df_target = df_target.drop(columns=MONTHLY_COLUMNS_TO_REMOVE, errors="ignore")

In [59]:
# Correlation of every numeric column with the churn target, strongest
# positive correlation first.
(
    df_target
    .corr(numeric_only=True)
    .loc[:, 'churn']
    .sort_values(ascending=False)
)

churn                           1.000000
advertiser_zrive_id             0.080848
has_active_contract             0.040652
monthly_oro_ads                 0.002606
monthly_pepitas_ads            -0.003414
monthly_destacados_ads         -0.005066
monthly_plata_ads              -0.013256
monthly_unique_published_ads   -0.026339
monthly_shows                  -0.032365
monthly_total_calls            -0.034665
period_int                     -0.036360
monthly_total_phone_views      -0.037909
monthly_visits                 -0.038025
monthly_unique_calls           -0.038269
monthly_published_ads          -0.039445
monthly_total_emails           -0.039575
monthly_leads                  -0.040225
monthly_contracted_ads         -0.042105
monthly_unique_leads           -0.042779
monthly_unique_emails          -0.044587
monthly_total_invoice          -0.053997
Name: churn, dtype: float64

In [60]:
# Derived ratio features: new column -> (numerator column, denominator column).
# Insertion order is the order in which the columns are added to the frame.
ratio_feature_specs = {
    'visits_per_lead': ('monthly_visits', 'monthly_leads'),
    'leads_per_call': ('monthly_leads', 'monthly_total_calls'),
    'visits_per_call': ('monthly_visits', 'monthly_total_calls'),
    'visits_per_email': ('monthly_visits', 'monthly_total_emails'),
    'invoice_per_lead': ('monthly_total_invoice', 'monthly_leads'),
    'invoice_per_visit': ('monthly_total_invoice', 'monthly_visits'),
    'invoice_per_call': ('monthly_total_invoice', 'monthly_total_calls'),
    'contracted_vs_real': ('monthly_contracted_ads', 'monthly_published_ads'),
}

for feature_name, (numerator_col, denominator_col) in ratio_feature_specs.items():
    # A zero denominator is swapped for 1, so the division never hits zero and
    # the feature then equals the raw numerator.
    safe_denominator = df_target[denominator_col].replace(0, 1)
    df_target[feature_name] = df_target[numerator_col] / safe_denominator


In [None]:
def _active_contract_streak(active: pd.Series) -> pd.Series:
    """Length of the current run of consecutive True flags; 0 where the flag is False."""
    # A new run starts whenever the flag differs from the previous row.
    run_id = (active != active.shift()).cumsum()
    # 1-based position of each row inside its run.
    position_in_run = active.groupby(run_id).cumcount() + 1
    # Multiplying by the boolean flag zeroes out rows without an active contract.
    return active * position_in_run


# The streak is counted in row order, so order rows by advertiser and period first.
df_target = df_target.sort_values(by=['advertiser_zrive_id', 'period_int'])

contract_flag_by_advertiser = df_target.groupby('advertiser_zrive_id')['has_active_contract']
df_target['n_months_contract_active'] = contract_flag_by_advertiser.transform(
    _active_contract_streak
)

In [None]:
# Expanding mean of each advertiser's visits over the preceding rows only:
# the shift excludes the current row, so an advertiser's first row is NaN.
visits_by_advertiser = df_target.groupby('advertiser_zrive_id')['monthly_visits']
df_target['monthly_visits_mean_past'] = visits_by_advertiser.transform(
    lambda visits: visits.shift().expanding().mean()
)

# Current visits minus that historical mean.
df_target['visits_trend'] = df_target['monthly_visits'].sub(
    df_target['monthly_visits_mean_past']
)

In [None]:
def add_ratios(
    df: pd.DataFrame,
    metrics: List[str],
    n_months: int,
    ratios: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Add lagged rolling-window statistics of each metric, per advertiser.

    For every metric in ``metrics`` and every aggregation name in ``ratios``,
    a column named ``{ratio}_{metric}_{n_months}m`` is written. Values are
    grouped by ``advertiser_zrive_id``, shifted by one row so the current row
    is excluded, and aggregated over a rolling window of ``n_months`` rows.

    Notes:
        * ``df`` is modified in place and the same object is returned.
        * Windows are counted in rows, in the frame's current row order. The
          function does not sort; it assumes each advertiser's rows are
          already in chronological order with one row per month (verify
          against the caller).
        * The rolling window uses pandas' default ``min_periods``, so a value
          is produced only once the shifted series has a full window; an
          advertiser's first ``n_months`` rows are therefore NaN.

    Args:
        df: Frame containing ``advertiser_zrive_id`` and every column listed
            in ``metrics``.
        metrics: Names of the columns to aggregate.
        n_months: Rolling window size, in rows.
        ratios: Aggregation names passed to ``Rolling.agg``. Defaults to
            ``["max", "min", "mean", "std"]`` when omitted or ``None``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    # A None sentinel replaces the mutable list default, which would be one
    # shared object across all calls.
    if ratios is None:
        ratios = ["max", "min", "mean", "std"]

    for metric in metrics:
        # The grouping does not depend on the aggregation, so build it once per metric.
        metric_by_advertiser = df.groupby('advertiser_zrive_id')[metric]
        for ratio in ratios:
            df[f'{ratio}_{metric}_{n_months}m'] = metric_by_advertiser.transform(
                lambda x: x.shift().rolling(n_months).agg(ratio)
            )

    return df

# Lagged rolling statistics over a 3-row window for leads and visits.
df_target = add_ratios(
    df=df_target,
    metrics=["monthly_leads", "monthly_visits"],
    n_months=3,
)