In [1]:
import pandas as pd
from pathlib import Path

### Withdrawals

In [2]:
# Read the withdrawals parquet file from the relative ../data directory.
data_path = Path("../data")
df_withdrawals = pd.read_parquet(data_path.joinpath("zrive_advertiser_withdrawals.parquet"))


In [3]:
# Withdrawal reasons that are not counted as churn (matched exactly by isin below).
CHURN_REASONS_EXCLUDED = [
    "Upselling-cambio de contrato",
    "Cambio a Bundle Online",
    "Cambio de Contrato/propuesta/producto",
]

# A withdrawal is flagged as churn (1) only when all three conditions hold:
# its type is "TOTAL", its status is not "Denegada", and its reason is not in
# the excluded list. Rows failing any condition get 0.
# NOTE(review): isin is an exact, case-sensitive match; confirm the excluded
# reasons use the same casing as the values stored in withdrawal_reason.
is_total_withdrawal = df_withdrawals["withdrawal_type"].eq("TOTAL")
is_not_denied = df_withdrawals["withdrawal_status"].ne("Denegada")
has_counted_reason = ~df_withdrawals["withdrawal_reason"].isin(CHURN_REASONS_EXCLUDED)

df_withdrawals["churn"] = (is_total_withdrawal & is_not_denied & has_counted_reason).astype(int)

In [4]:
# Display the withdrawals frame, which includes the churn column assigned above.
df_withdrawals

Unnamed: 0,withdrawal_id,advertiser_zrive_id,withdrawal_status,withdrawal_type,withdrawal_creation_date,withdrawal_reason,churn
0,0,257,Cerrada,TOTAL,2012-06-19 07:12:34,RESULTADOS,1
1,1,219,Cerrada,TOTAL,2012-06-19 07:16:34,RESULTADOS,1
2,7,487,Cerrada,TOTAL,2012-06-20 07:10:16,RESULTADOS,1
3,12,476,Cerrada,TOTAL,2012-06-20 11:59:36,RESULTADOS,1
4,16,452,Cerrada,TOTAL,2012-06-20 15:41:39,FALTA DE USO/TIEMPO,1
...,...,...,...,...,...,...,...
20674,53126,5441,Cerrada,TOTAL,2024-12-30 21:46:47,FALTA DE USO/TIEMPO,1
20675,53127,5439,Cerrada,TOTAL,2024-12-30 21:47:47,FALTA DE USO/TIEMPO,1
20676,53128,154,Cerrada,TOTAL,2024-12-30 22:08:19,RAZONES ECONOMICAS,1
20677,53129,1352,Cerrada,TOTAL,2024-12-30 22:13:53,RAZONES ECONOMICAS,1


This data will be used to define churn: it is joined to the other tables by advertiser and provides the withdrawal date.

### Advertisers

In [5]:
# Read the advertiser parquet file from the relative ../data directory.
data_path = Path("../data")
df_advertiser = pd.read_parquet(data_path.joinpath("zrive_dim_advertiser.parquet"))

In [6]:
# Preview the first 20 rows of the advertiser table.
df_advertiser.head(20)

Unnamed: 0,advertiser_zrive_id,province_id,updated_at,advertiser_province,advertiser_group_id,min_start_contrato_date,max_start_contrato_nuevo_date,contrato_churn_date
0,6732,1,2025-02-05 01:02:08,Álava,,2025-01-24,,2025-02-04
1,4841,2,2024-08-09 13:38:43,Albacete,133.0,2023-02-24,,2023-06-03
2,2487,3,2025-01-13 17:41:02,Alicante,,2024-10-02,2024-10-02,2025-01-13
3,1771,3,2024-11-07 13:26:50,Alicante,185.0,2021-11-15,,2024-11-06
4,3396,3,2023-11-02 13:51:07,Alicante,,2023-03-17,2023-03-17,2023-11-01
5,3008,3,2024-04-25 14:37:44,Alicante,,2024-02-02,,2024-04-09
6,4836,3,2024-08-09 13:40:33,Alicante,133.0,2023-02-24,,2023-06-03
7,4972,3,2024-12-04 11:55:31,Alicante,,2023-04-03,,2023-07-02
8,542,3,2024-02-05 13:49:02,Alicante,,2023-05-03,,2023-12-29
9,4595,3,2023-03-07 01:13:14,Alicante,,2022-12-19,,2023-03-06


- Initially ignore the region (after a first iteration, study whether it adds value).
- Add a boolean flag indicating whether the advertiser is new or a returning customer.
- Convert the advertiser group id into a boolean column (whether the advertiser belongs to a group).
- A possible engineered feature: the percentage of time elapsed until contract expiry.

### Monthly

In [7]:
# Read the monthly advertiser snapshot parquet file from the relative ../data directory.
data_path = Path("../data")
df_monthly = pd.read_parquet(data_path.joinpath("zrive_fct_monthly_snapshot_advertiser.parquet"))

In [8]:
# Display the monthly snapshot frame (one row per advertiser_zrive_id and period_int in the rendered sample).
df_monthly

Unnamed: 0,advertiser_zrive_id,period_int,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,...,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract
0,1,202301,47,47,75,18,40890.0,6,6,6,...,14,15,0,440.8,12,3,15,,,True
1,2,202301,31,31,150,4,17970.0,10,10,4,...,16,2,2,75.4,2,2,4,,,True
2,3,202301,0,0,0,0,0.0,0,0,0,...,0,0,0,0.0,0,0,0,,,False
3,4,202301,79,79,85,14,27157.5,3,3,1,...,10,8,2,299.6,6,6,12,,,True
4,6,202301,20,20,20,16,79492.5,0,0,1,...,10,4,3,86.5,4,11,15,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80122,6800,202412,7,7,10,0,2857.5,1,1,4,...,0,0,0,0.0,0,0,0,8840.00,8.0,False
80123,6808,202412,17,17,20,3,12967.5,0,0,0,...,3,2,0,0.0,2,1,3,36487.70,28.0,False
80124,6822,202412,18,18,35,60,74362.5,5,6,15,...,21,31,7,0.0,26,26,52,7595.83,48.0,False
80125,6823,202412,10,10,10,4,23857.5,0,0,1,...,1,1,0,0.0,1,3,4,49590.00,15.0,False


In [9]:
# List every column name, since the truncated frame display above hides the middle columns.
df_monthly.columns

Index(['advertiser_zrive_id', 'period_int', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'has_active_contract'],
      dtype='object')

### Feature Engineering

Use the monthly snapshot as the base table: clean the other datasets and join them into that dataframe.

In [10]:
# Keep only the advertiser id, the withdrawal timestamp and the churn flag.
# .copy() makes the subset an independent frame, so later column assignments on
# df_withdrawals_ do not trigger SettingWithCopyWarning or depend on df_withdrawals.
df_withdrawals_ = df_withdrawals[["advertiser_zrive_id", "withdrawal_creation_date", "churn"]].copy()

In [18]:
# Inspect the three-column withdrawals subset (advertiser id, withdrawal date, churn flag).
df_withdrawals_

Unnamed: 0,advertiser_zrive_id,withdrawal_creation_date,churn
0,257,2012-06-19 07:12:34,1
1,219,2012-06-19 07:16:34,1
2,487,2012-06-20 07:10:16,1
3,476,2012-06-20 11:59:36,1
4,452,2012-06-20 15:41:39,1
...,...,...,...
20674,5441,2024-12-30 21:46:47,1
20675,5439,2024-12-30 21:47:47,1
20676,154,2024-12-30 22:08:19,1
20677,1352,2024-12-30 22:13:53,1


In [11]:
# Boolean flags derived from nullable columns: each flag is True when the
# source column holds a value and False when it is missing.
presence_flags = {
    "advertiser_group": "advertiser_group_id",
    "customer_before": "max_start_contrato_nuevo_date",
}
for flag_name, source_column in presence_flags.items():
    df_advertiser[flag_name] = df_advertiser[source_column].notna()

In [12]:
# Parse the date columns used by the contract features below; values that
# cannot be parsed become NaT because of errors="coerce".
for date_column in ("updated_at", "min_start_contrato_date", "contrato_churn_date"):
    df_advertiser[date_column] = pd.to_datetime(df_advertiser[date_column], errors="coerce")

In [13]:
# Whole days from min_start_contrato_date to updated_at, and from
# min_start_contrato_date to contrato_churn_date.
elapsed_days = (df_advertiser["updated_at"] - df_advertiser["min_start_contrato_date"]).dt.days
contract_days = (df_advertiser["contrato_churn_date"] - df_advertiser["min_start_contrato_date"]).dt.days

# Percentage of the start-to-churn span that has elapsed at updated_at.
# Zero-length spans (churn date equal to start date) are masked to NaN so the
# division cannot produce +/-inf. Values above 100 are still possible when
# updated_at is later than contrato_churn_date.
df_advertiser["contrato_percentage"] = 100 * elapsed_days / contract_days.where(contract_days != 0)

# Calendar-month difference between min_start_contrato_date and updated_at
# (the day of the month is ignored).
df_advertiser["months_active"] = (
    (df_advertiser["updated_at"].dt.year - df_advertiser["min_start_contrato_date"].dt.year) * 12
    + (df_advertiser["updated_at"].dt.month - df_advertiser["min_start_contrato_date"].dt.month)
)

In [14]:
# Display the advertiser frame with the derived columns added above
# (advertiser_group, customer_before, contrato_percentage, months_active).
df_advertiser

Unnamed: 0,advertiser_zrive_id,province_id,updated_at,advertiser_province,advertiser_group_id,min_start_contrato_date,max_start_contrato_nuevo_date,contrato_churn_date,advertiser_group,customer_before,contrato_percentage,months_active
0,6732,1,2025-02-05 01:02:08,Álava,,2025-01-24,,2025-02-04,False,False,109.090909,1
1,4841,2,2024-08-09 13:38:43,Albacete,133.0,2023-02-24,,2023-06-03,True,False,537.373737,18
2,2487,3,2025-01-13 17:41:02,Alicante,,2024-10-02,2024-10-02,2025-01-13,False,True,100.000000,3
3,1771,3,2024-11-07 13:26:50,Alicante,185.0,2021-11-15,,2024-11-06,True,False,100.091996,36
4,3396,3,2023-11-02 13:51:07,Alicante,,2023-03-17,2023-03-17,2023-11-01,False,True,100.436681,8
...,...,...,...,...,...,...,...,...,...,...,...,...
6829,6079,46,2025-02-19 14:13:04,Valencia,,2024-11-19,2024-11-19,2025-03-31,False,True,69.696970,3
6830,4775,47,2025-03-10 13:32:11,Valladolid,,2024-12-12,2024-12-12,2025-03-31,False,True,80.733945,3
6831,4712,48,2024-06-27 18:05:17,Vizcaya,41.0,2023-01-30,2023-01-30,2025-03-31,True,True,64.981037,17
6832,5686,50,2025-03-17 17:29:04,Zaragoza,,2024-02-07,2024-02-07,2025-03-31,False,True,96.650718,13


There seem to be some contracts that have not been updated (note the `contrato_percentage` values above 100%).

In [16]:
# Build a reduced advertiser frame: drop the raw province, group-id and
# contract-date columns, keeping the id, updated_at and the derived features.
columns_to_drop = [
    "province_id",
    "advertiser_province",
    "advertiser_group_id",
    "min_start_contrato_date",
    "max_start_contrato_nuevo_date",
    "contrato_churn_date",
]
df_advertiser_ = df_advertiser.drop(columns=columns_to_drop)

In [17]:
# Inspect the reduced advertiser frame produced by the column drop above.
df_advertiser_

Unnamed: 0,advertiser_zrive_id,updated_at,advertiser_group,customer_before,contrato_percentage,months_active
0,6732,2025-02-05 01:02:08,False,False,109.090909,1
1,4841,2024-08-09 13:38:43,True,False,537.373737,18
2,2487,2025-01-13 17:41:02,False,True,100.000000,3
3,1771,2024-11-07 13:26:50,True,False,100.091996,36
4,3396,2023-11-02 13:51:07,False,True,100.436681,8
...,...,...,...,...,...,...
6829,6079,2025-02-19 14:13:04,False,True,69.696970,3
6830,4775,2025-03-10 13:32:11,False,True,80.733945,3
6831,4712,2024-06-27 18:05:17,True,True,64.981037,17
6832,5686,2025-03-17 17:29:04,False,True,96.650718,13
