In [1223]:
import pandas as pd
from pathlib import Path
import numpy as np

In [1224]:
pd.set_option('display.max_columns', None)

In [1225]:
data_path = Path("../data")

In [1226]:
df_withdrawals = pd.read_parquet(data_path / "zrive_advertiser_withdrawals.parquet")
df_advertiser = pd.read_parquet(data_path / "zrive_dim_advertiser.parquet")
df_monthly = pd.read_parquet(data_path / "zrive_fct_monthly_snapshot_advertiser.parquet")

In [1227]:
# Withdrawals 
def add_churn(df: pd.DataFrame):
    """Add a binary ``churn`` column to a withdrawals frame.

    A withdrawal counts as churn (1) when all of the following hold:
    its ``withdrawal_type`` is "TOTAL", its ``withdrawal_status`` is not
    "Denegada", and its ``withdrawal_reason`` is not one of the
    contract-change reasons listed below. Every other row gets 0.

    The column is written into ``df`` itself (in place) and the same
    object is returned.
    """
    # Reasons that describe a change of contract/product rather than a real exit.
    contract_change_reasons = [
        'Upselling-cambio de contrato',
        'Cambio a Bundle Online',
        'Cambio de Contrato/propuesta/producto'
    ]
    is_total = df["withdrawal_type"].eq("TOTAL")
    not_denied = df["withdrawal_status"].ne("Denegada")
    is_contract_change = df["withdrawal_reason"].isin(contract_change_reasons)

    churn_flag = is_total & not_denied & ~is_contract_change
    df["churn"] = churn_flag.astype(int)
    return df

def convert_datetime_to_month_period(df, datetime_col, new_col, drop_original=True):
    """Return a copy of ``df`` with ``datetime_col`` converted to a monthly period.

    Args:
        df: Input frame; it is not modified.
        datetime_col: Column holding datetimes (or values ``pd.to_datetime``
            can parse).
        new_col: Name of the column that receives the ``Period[M]`` values.
        drop_original: When True, ``datetime_col`` is removed from the result.
            It is kept when ``new_col`` and ``datetime_col`` are the same
            name, because dropping it would discard the converted values.

    Returns:
        A new DataFrame with ``new_col`` added.
    """
    # Work on a copy so the caller's frame does not silently gain ``new_col``
    # while the returned frame is a different object.
    result = df.copy()
    result[new_col] = pd.to_datetime(df[datetime_col]).dt.to_period('M')
    if drop_original and new_col != datetime_col:
        result = result.drop(columns=[datetime_col])
    return result

def add_predict_month(df: pd.DataFrame, predict_col = "predict_month", withdrawal_col="withdrawal_month", n: int = 1):
    """Add ``predict_col`` as ``withdrawal_col`` shifted back by ``n``.

    The subtraction is whatever ``df[withdrawal_col] - n`` means for that
    column's dtype. The new column is written into ``df`` in place and the
    same object is returned.
    """
    shifted_back = df[withdrawal_col] - n
    df[predict_col] = shifted_back
    return df

In [1228]:
# Columns only needed to derive `predict_month` and `churn`; removed afterwards.
WITHDRAWAL_COLS_TO_DROP = [
    "withdrawal_id",
    "withdrawal_status",
    "withdrawal_type",
    "withdrawal_reason",
    "withdrawal_month",
]

# withdrawal date -> withdrawal month -> month to predict (one month earlier)
# -> churn flag -> keep only the id, the prediction month and the flag.
df_withdrawals = (
    df_withdrawals
    .pipe(
        convert_datetime_to_month_period,
        datetime_col='withdrawal_creation_date',
        new_col='withdrawal_month',
        drop_original=True,
    )
    .pipe(add_predict_month, n=1)
    .pipe(add_churn)
    .drop(columns=WITHDRAWAL_COLS_TO_DROP)
)

In [1229]:
df_withdrawals.head()

Unnamed: 0,advertiser_zrive_id,predict_month,churn
0,257,2012-05,1
1,219,2012-05,1
2,487,2012-05,1
3,476,2012-05,1
4,452,2012-05,1


In [1230]:
def convert_period_int_to_month_period(df, period_col='period_int', new_col='month_period'):
    """Convert a YYYYMM column into a pandas monthly period column.

    ``new_col`` is written into ``df`` in place and the same object is
    returned.
    """
    # Append a day so the value parses as a full date (first of the month).
    first_day_text = df[period_col].astype(str) + '01'
    first_day = pd.to_datetime(first_day_text, format='%Y%m%d')
    df[new_col] = first_day.dt.to_period('M')
    return df

In [1231]:
df_monthly = convert_period_int_to_month_period(df_monthly)

# Place `month_period` in third position (index 2) of the column order.
cols = [name for name in df_monthly.columns if name != 'month_period']
cols[2:2] = ['month_period']
df_monthly = df_monthly[cols]

In [1232]:
df_monthly

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract
0,1,202301,2023-01,47,47,75,18,40890.0,6,6,6,0,2051941.5,14,15,0,440.8,12,3,15,,,True
1,2,202301,2023-01,31,31,150,4,17970.0,10,10,4,0,1250403.0,16,2,2,75.4,2,2,4,,,True
2,3,202301,2023-01,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,,,False
3,4,202301,2023-01,79,79,85,14,27157.5,3,3,1,0,1142673.0,10,8,2,299.6,6,6,12,,,True
4,6,202301,2023-01,20,20,20,16,79492.5,0,0,1,0,1773345.0,10,4,3,86.5,4,11,15,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80122,6800,202412,2024-12,7,7,10,0,2857.5,1,1,4,0,80052.0,0,0,0,0.0,0,0,0,8840.00,8.0,False
80123,6808,202412,2024-12,17,17,20,3,12967.5,0,0,0,0,245994.0,3,2,0,0.0,2,1,3,36487.70,28.0,False
80124,6822,202412,2024-12,18,18,35,60,74362.5,5,6,15,0,1404081.0,21,31,7,0.0,26,26,52,7595.83,48.0,False
80125,6823,202412,2024-12,10,10,10,4,23857.5,0,0,1,0,302799.0,1,1,0,0.0,1,3,4,49590.00,15.0,False


In [1233]:
def add_churn_target(df_monthly, df_withdrawals):
    """
    Attach the churn target computed in ``df_withdrawals`` to ``df_monthly``.

    Args:
        df_monthly: Monthly advertiser rows; must contain
            ``advertiser_zrive_id`` and ``month_period``.
        df_withdrawals: Processed withdrawals with ``advertiser_zrive_id``,
            ``predict_month`` and ``churn``.

    Returns:
        A new DataFrame with the rows of ``df_monthly`` plus a ``churn``
        column. Rows with no matching withdrawal get 0.
    """
    withdrawal_keys = ['advertiser_zrive_id', 'predict_month']

    # Collapse to one row per advertiser and month BEFORE merging so the left
    # join cannot multiply monthly rows. Take the max so that a churn=1
    # withdrawal is never hidden by a churn=0 withdrawal from the same month,
    # which keeping only the first duplicate could do.
    churn_per_month = (
        df_withdrawals
        .groupby(withdrawal_keys, as_index=False)['churn']
        .max()
    )

    # Match each monthly row with the withdrawal whose prediction month it is.
    df_target = df_monthly.merge(
        churn_per_month,
        left_on=['advertiser_zrive_id', 'month_period'],
        right_on=withdrawal_keys,
        how='left'
    )

    # No matching withdrawal means no churn for that month.
    df_target['churn'] = df_target['churn'].fillna(0)

    # `predict_month` duplicates `month_period` on matched rows; drop it.
    if 'predict_month' in df_target.columns:
        df_target = df_target.drop(columns=['predict_month'])

    return df_target

In [1234]:
# Build the labelled monthly table and store the churn flag as an integer.
df_target = add_churn_target(df_monthly, df_withdrawals).astype({'churn': int})

In [1235]:
df_target[df_target['advertiser_zrive_id'] == 1]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
0,1,202301,2023-01,47,47,75,18,40890.0,6,6,6,0,2051941.5,14,15,0,440.8,12,3,15,,,True,0
3542,1,202302,2023-02,44,44,50,10,34740.0,3,3,1,0,1761028.5,18,6,3,464.8,5,4,9,,,True,1
6905,1,202303,2023-03,39,39,50,7,11850.0,3,3,1,0,561550.5,5,4,2,464.8,3,3,6,,,True,0
10328,1,202304,2023-04,39,39,50,6,15930.0,3,3,1,0,867489.0,7,3,2,464.8,3,3,6,,,True,0
13747,1,202305,2023-05,0,0,150,8,19140.0,0,3,1,0,1038817.5,13,4,2,464.8,4,4,8,,,True,0


En el id 1 el churn ocurre en 202303, por lo tanto va a tener churn = 1 en la fila 202302.

La actividad post churn no nos interesa, la eliminaremos.

Para usuarios que tienen varios contratos y varios churns, haremos lo mismo. (posible mejora para el futuro --> detectar varias churns)

In [1236]:
def remove_activity_after_first_churn(df):
    """Drop every row that comes after an advertiser's first churn row.

    Rows are sorted by advertiser and ``period_int``. For advertisers with at
    least one ``churn == 1`` row, only rows up to and including the earliest
    such period are kept. Advertisers that never churn keep all their rows.
    The result carries the positional index of the sorted, merged frame.
    """
    ordered = df.sort_values(['advertiser_zrive_id', 'period_int'])

    # Earliest churn period per advertiser (absent for those who never churn).
    churn_rows = ordered.loc[ordered['churn'] == 1]
    first_churn = (
        churn_rows
        .groupby('advertiser_zrive_id')['period_int']
        .min()
        .reset_index()
        .rename(columns={'period_int': 'first_churn_period'})
    )

    merged = ordered.merge(first_churn, how='left', on='advertiser_zrive_id')

    cutoff = merged['first_churn_period']
    keep = cutoff.isna() | (merged['period_int'] <= cutoff)

    return merged.loc[keep].drop(columns=['first_churn_period'])

In [1237]:
df_target = remove_activity_after_first_churn(df_target)

Existen usuarios que dejan de tener actividad pero no tienen churn confirmada en df_withdrawals entre 2023 y 2024 (caso 2 en el notebook data.ipynb). A continuación vemos cuántos usuarios hay en cada mes.

In [1238]:
# Summary of advertisers that never get churn == 1 in df_target: how many reach
# the latest period and how many stop earlier. The same code is repeated in a
# later cell.
user_churn_status = df_target.groupby('advertiser_zrive_id')['churn'].max().reset_index()
# 1. Identify users without churn (their max churn flag is 0)
users_without_churn_ids = user_churn_status[user_churn_status['churn'] == 0]['advertiser_zrive_id'].tolist()

# 2. Last recorded period for each user without churn
last_period_by_user = df_target.groupby('advertiser_zrive_id')['period_int'].max().reset_index()
last_period_no_churn = last_period_by_user[last_period_by_user['advertiser_zrive_id'].isin(users_without_churn_ids)]

# 3. Split users without churn into those present in the latest period and those ending earlier
latest_period = df_target['period_int'].max()
users_active_until_end = last_period_no_churn[last_period_no_churn['period_int'] == latest_period].shape[0]
early_ending_users = last_period_no_churn[last_period_no_churn['period_int'] < latest_period].shape[0]

print(f"Usuarios sin churn activos hasta el último período ({latest_period}): {users_active_until_end}")
print(f"Usuarios sin churn que terminaron antes del último período: {early_ending_users}")

# 4. Contract status on the last record of a sample (first 5) of the early-ending users
early_ending_sample = last_period_no_churn[last_period_no_churn['period_int'] < latest_period]['advertiser_zrive_id'].head(5).tolist()
early_ending_details = []

for user_id in early_ending_sample:
    last_record = df_target[df_target['advertiser_zrive_id'] == user_id].sort_values('period_int').iloc[-1]
    early_ending_details.append({
        'advertiser_zrive_id': user_id,
        'last_period': last_record['period_int'],
        'has_active_contract': last_record['has_active_contract']
    })

# NOTE(review): the only consumer of `early_ending_details` in this cell is the
# disabled output below, so step 4 currently has no visible effect here.
#print("\nEjemplos de usuarios sin churn que terminaron antes:")
#print(pd.DataFrame(early_ending_details))

# 5. Full distribution of last periods for users without churn
print("\n\n")
last_period_distribution = last_period_no_churn['period_int'].value_counts().sort_index()

print("Distribución de últimos períodos para usuarios sin churn:")
print(last_period_distribution)

Usuarios sin churn activos hasta el último período (202412): 2128
Usuarios sin churn que terminaron antes del último período: 585



Distribución de últimos períodos para usuarios sin churn:
period_int
202301     237
202302      85
202303      61
202304      20
202305      20
202306      11
202307      20
202308      12
202309       6
202310       8
202311      10
202312       8
202401      13
202402       6
202403       5
202404       4
202405       8
202406      10
202407       4
202408      10
202409      13
202410       8
202411       6
202412    2128
Name: count, dtype: int64


- Caso 1: Usuarios que han tenido churn a finales de 2022 y su contrato duraba hasta primer/primeros meses de 2023, ejemplo claro de esto son los 237 ids que dejan de tener actividad en 202301

- Caso 2: Usuarios que no cumplen esa condicion, ya que ocurre en meses muy posteriores a finales de 2022. Para muchos de estos casos hay una forma de saber la fecha de churn gracias a la tabla df_advertiser

In [1239]:
#Ejemplo de caso 1
df_target[df_target['advertiser_zrive_id'] == 17]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
230,17,202301,2023-01,15,15,35,3,17025.0,5,5,4,0,1265271.0,4,0,1,0.0,0,3,3,,,True,0
231,17,202302,2023-02,14,14,35,6,18750.0,5,5,4,0,1492879.5,4,1,0,0.0,1,4,5,,,True,0
232,17,202303,2023-03,0,0,35,4,4080.0,0,5,4,0,290041.5,2,1,2,0.0,1,3,4,,,True,0


In [1240]:
df_withdrawals[df_withdrawals['advertiser_zrive_id'] == 17]

Unnamed: 0,advertiser_zrive_id,predict_month,churn
11367,17,2022-11,1


Tuvo churn en diciembre de 2022; la actividad de 202301, 202302 y 202303 corresponde a lo que ya estaba contratado. Estos casos deberíamos eliminarlos, ya que siguen el mismo criterio que remove_activity_after_first_churn: simplemente el churn fue a finales de 2022 y no tenemos esa fila.

In [1241]:
#Ejemplo de caso 2
df_target[df_target['advertiser_zrive_id'] == 3]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
29,3,202301,2023-01,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,,,False,0
30,3,202405,2024-05,17,17,35,6,2167.5,0,0,0,0,130840.5,2,1,3,0.0,1,2,3,42864.59,17.0,True,0
31,3,202406,2024-06,17,17,35,0,1882.5,0,0,0,0,88095.0,0,0,0,0.0,0,0,0,42864.59,17.0,True,0
32,3,202407,2024-07,17,17,35,0,315.0,0,0,0,0,20601.0,0,0,0,0.0,0,0,0,42864.59,17.0,True,0


In [1242]:
df_withdrawals[df_withdrawals['advertiser_zrive_id'] == 3]

Unnamed: 0,advertiser_zrive_id,predict_month,churn
11127,3,2022-10,1


En este caso el churn es en noviembre de 2022. El usuario luego tiene un nuevo contrato que empieza en 202405 y deja de tener actividad en 202407, pero no tiene churn registrada. Sin embargo, en la tabla df_advertiser sí tenemos esta información para estos casos, en la columna contrato_churn_date.

In [1243]:
df_advertiser[df_advertiser['advertiser_zrive_id'] == 3]

Unnamed: 0,advertiser_zrive_id,province_id,updated_at,advertiser_province,advertiser_group_id,min_start_contrato_date,max_start_contrato_nuevo_date,contrato_churn_date
21,3,8,2024-08-07 13:37:09,Barcelona,41.0,2024-05-06,,2024-07-15


In [1244]:
# otro ejemplo de caso 2
df_target[df_target['advertiser_zrive_id'] == 6288]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
78770,6288,202409,2024-09,10,10,10,1,5812.5,3,3,0,0,191163.0,3,1,0,19.6,1,0,1,17863.64,11.0,True,0
78771,6288,202410,2024-10,10,10,10,5,11242.5,3,3,0,0,382504.5,4,2,1,28.3,2,2,4,17366.67,12.0,True,0
78772,6288,202411,2024-11,10,10,10,7,9840.0,0,3,0,0,260473.5,9,1,5,28.3,1,5,6,17228.57,13.0,True,0


In [1245]:
df_withdrawals[df_withdrawals['advertiser_zrive_id'] == 6288]

Unnamed: 0,advertiser_zrive_id,predict_month,churn
19286,6288,2024-08,1
19423,6288,2024-08,1


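El fin de actividad (202411) no queda registrado como churn en df_withdrawals: las únicas bajas registradas para este id son de 2024-09 (predict_month 2024-08), el mes en que empieza el contrato. En cambio, la columna contrato_churn_date de df_advertiser sí recoge la fecha de churn (2024-11-30).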

In [1246]:
df_advertiser[df_advertiser['advertiser_zrive_id'] == 6288]

Unnamed: 0,advertiser_zrive_id,province_id,updated_at,advertiser_province,advertiser_group_id,min_start_contrato_date,max_start_contrato_nuevo_date,contrato_churn_date
6092,6288,7,2024-12-01 05:36:10,Badajoz,,2024-09-06,2024-09-06,2024-11-30


La idea es: 
- Para estos id que no tienen churn pero dejan de tener actividad, buscar en la columna contrato_churn_date de la tabla df_advertiser y añadir esta info (mismo formato de antes, tiene que tener churn = 1 el mes previo a la fecha real)

- Una vez hecho este enfoque, lo que sigan teniendo churn=0 y dejen de tener actividad los eliminaremos

In [1247]:
def add_churn_from_advertiser_data(df_target, df_advertiser):
    """
    Backfill churn labels from ``df_advertiser['contrato_churn_date']``.

    Only advertisers with no ``churn == 1`` row in ``df_target`` are
    considered. For each of them that has a non-null ``contrato_churn_date``,
    the row for the month before the churn month is set to ``churn = 1``,
    provided that row exists. Rows after the first churn are then removed
    with ``remove_activity_after_first_churn``.

    Args:
        df_target: Monthly rows with ``advertiser_zrive_id``, ``period_int``
            (YYYYMM) and ``churn``. It is not modified.
        df_advertiser: Advertiser table with ``advertiser_zrive_id`` and
            ``contrato_churn_date``.

    Returns:
        The updated DataFrame, or ``df_target`` itself when there are no
        usable churn dates.
    """
    keys = ['advertiser_zrive_id', 'period_int']

    max_churn = df_target.groupby('advertiser_zrive_id')['churn'].max()
    users_without_churn = max_churn[max_churn == 0].index

    churn_info = df_advertiser.loc[
        df_advertiser['advertiser_zrive_id'].isin(users_without_churn)
        & df_advertiser['contrato_churn_date'].notna(),
        ['advertiser_zrive_id', 'contrato_churn_date'],
    ]

    if churn_info.empty:
        print("No se encontraron fechas de churn adicionales en df_advertiser")
        return df_target

    # Month before the churn date, encoded as YYYYMM like `period_int`.
    previous_month = pd.to_datetime(churn_info['contrato_churn_date']).dt.to_period('M') - 1
    candidates = pd.DataFrame({
        'advertiser_zrive_id': churn_info['advertiser_zrive_id'].to_numpy(),
        'period_int': (previous_month.dt.year * 100 + previous_month.dt.month).to_numpy(),
    })

    # Keep only candidates whose (advertiser, month) row exists in df_target.
    # One join replaces a per-advertiser scan of the whole frame.
    existing_rows = df_target[keys].drop_duplicates()
    updates = candidates.merge(existing_rows, on=keys, how='inner')

    # Boolean mask, aligned by position, of the df_target rows to flag.
    to_flag = (
        df_target[keys]
        .merge(updates.drop_duplicates(), on=keys, how='left', indicator=True)['_merge']
        .eq('both')
        .to_numpy()
    )

    # Write on a copy so the caller's frame is left untouched.
    df_target = df_target.copy()
    df_target.loc[to_flag, 'churn'] = 1

    print(f"Se actualizaron {len(updates)} registros con información de churn desde df_advertiser")

    df_target = remove_activity_after_first_churn(df_target)

    return df_target

In [1248]:
df_target = add_churn_from_advertiser_data(df_target, df_advertiser)

Se actualizaron 402 registros con información de churn desde df_advertiser


In [1249]:
# Same summary as the earlier cell with identical code, run again on the
# current df_target: advertisers that never get churn == 1, split by whether
# they reach the latest period.
user_churn_status = df_target.groupby('advertiser_zrive_id')['churn'].max().reset_index()
# 1. Identify users without churn (their max churn flag is 0)
users_without_churn_ids = user_churn_status[user_churn_status['churn'] == 0]['advertiser_zrive_id'].tolist()

# 2. Last recorded period for each user without churn
last_period_by_user = df_target.groupby('advertiser_zrive_id')['period_int'].max().reset_index()
last_period_no_churn = last_period_by_user[last_period_by_user['advertiser_zrive_id'].isin(users_without_churn_ids)]

# 3. Split users without churn into those present in the latest period and those ending earlier
latest_period = df_target['period_int'].max()
users_active_until_end = last_period_no_churn[last_period_no_churn['period_int'] == latest_period].shape[0]
early_ending_users = last_period_no_churn[last_period_no_churn['period_int'] < latest_period].shape[0]

print(f"Usuarios sin churn activos hasta el último período ({latest_period}): {users_active_until_end}")
print(f"Usuarios sin churn que terminaron antes del último período: {early_ending_users}")

# 4. Contract status on the last record of a sample (first 5) of the early-ending users
early_ending_sample = last_period_no_churn[last_period_no_churn['period_int'] < latest_period]['advertiser_zrive_id'].head(5).tolist()
early_ending_details = []

for user_id in early_ending_sample:
    last_record = df_target[df_target['advertiser_zrive_id'] == user_id].sort_values('period_int').iloc[-1]
    early_ending_details.append({
        'advertiser_zrive_id': user_id,
        'last_period': last_record['period_int'],
        'has_active_contract': last_record['has_active_contract']
    })

# NOTE(review): the only consumer of `early_ending_details` in this cell is the
# disabled output below, so step 4 currently has no visible effect here.
#print("\nEjemplos de usuarios sin churn que terminaron antes:")
#print(pd.DataFrame(early_ending_details))

# 5. Full distribution of last periods for users without churn
print("\n\n")
last_period_distribution = last_period_no_churn['period_int'].value_counts().sort_index()

print("Distribución de últimos períodos para usuarios sin churn:")
print(last_period_distribution)

Usuarios sin churn activos hasta el último período (202412): 2038
Usuarios sin churn que terminaron antes del último período: 273



Distribución de últimos períodos para usuarios sin churn:
period_int
202301     235
202302      10
202303       1
202304       1
202306       3
202307       3
202310       1
202312       1
202401       4
202402       2
202405       3
202406       3
202408       2
202409       3
202410       1
202412    2038
Name: count, dtype: int64


La mayoría de los casos que quedan corresponden a la hipótesis del caso 1. Vamos a eliminar esos usuarios y también los restantes (suponemos que son fallos en los datos).

In [1250]:
def remove_incomplete_users(df_target, latest_period=None):
    """
    Drop advertisers that have no churn row and stop before the latest period.

    Args:
        df_target: Monthly rows with ``advertiser_zrive_id``, ``period_int``
            and ``churn``.
        latest_period: Period that counts as "still present". Defaults to the
            maximum ``period_int`` in ``df_target``.

    Returns:
        The rows of ``df_target`` belonging to the remaining advertisers.
    """
    if latest_period is None:
        latest_period = df_target['period_int'].max()

    # One row per advertiser: whether it ever churned and its last period.
    per_user = df_target.groupby('advertiser_zrive_id').agg(
        max_churn=('churn', 'max'),
        last_period=('period_int', 'max'),
    )

    never_churned = per_user['max_churn'] == 0
    ends_early = per_user['last_period'] < latest_period
    users_to_remove = per_user.index[never_churned & ends_early].tolist()

    is_removed = df_target['advertiser_zrive_id'].isin(users_to_remove)
    df_filtered = df_target[~is_removed]

    print(f"Se eliminaron {len(users_to_remove)} usuarios sin churn completo")

    return df_filtered

In [1251]:
df_target = remove_incomplete_users(df_target)

Se eliminaron 273 usuarios sin churn completo


In [1252]:
df_target[df_target['advertiser_zrive_id'] == 3]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
26,3,202301,2023-01,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,,,False,0
27,3,202405,2024-05,17,17,35,6,2167.5,0,0,0,0,130840.5,2,1,3,0.0,1,2,3,42864.59,17.0,True,0
28,3,202406,2024-06,17,17,35,0,1882.5,0,0,0,0,88095.0,0,0,0,0.0,0,0,0,42864.59,17.0,True,1


Ya tenemos info de churn en estos usuarios.

Siguiente limpieza --> Eliminar filas con has_active_contract en False y (monthly_published_ads y monthly_unique_published_ads) en 0. Las que tengan has_active_contract en False pero actividad de momento las dejamos. Mensaje de Andres --> ***priorizar como anunciantes activos aquellos que tengan actividad en determinados meses aunque figure que no tienen ningún contrato activo***

In [1253]:
def remove_inactive_periods_without_contract(df):
    """
    Drop rows with ``has_active_contract == False`` and no published ads.

    A row is removed only when all three hold: no active contract,
    ``monthly_published_ads == 0`` and ``monthly_unique_published_ads == 0``.

    Args:
        df: Monthly advertiser rows.

    Returns:
        The filtered DataFrame. The number of removed rows is printed.
    """
    no_contract = df['has_active_contract'].eq(False)
    nothing_published = df['monthly_published_ads'].eq(0)
    nothing_unique = df['monthly_unique_published_ads'].eq(0)

    is_dead_row = no_contract & nothing_published & nothing_unique
    df_filtered = df.loc[~is_dead_row]

    rows_removed = len(df) - len(df_filtered)
    print(f"Se eliminaron {rows_removed} filas sin contrato activo y sin actividad")

    return df_filtered

In [1254]:
df_target = remove_inactive_periods_without_contract(df_target)

Se eliminaron 48 filas sin contrato activo y sin actividad


In [1255]:
df_target[df_target['advertiser_zrive_id'] == 3]

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,churn
27,3,202405,2024-05,17,17,35,6,2167.5,0,0,0,0,130840.5,2,1,3,0.0,1,2,3,42864.59,17.0,True,0
28,3,202406,2024-06,17,17,35,0,1882.5,0,0,0,0,88095.0,0,0,0,0.0,0,0,0,42864.59,17.0,True,1


In [1256]:
df_target.shape

(56352, 24)

### Parte 2: Nuevas columnas

El objetivo es analizar los datos y añadir nuevas features

1. Calcular media y mediana de periodos por id 

In [1257]:
# One row per advertiser: 1 if any of its rows is flagged as churn, else 0.
user_churn_status = (
    df_target
    .groupby('advertiser_zrive_id')['churn']
    .max()
    .reset_index()
)
users_with_churn = user_churn_status.loc[lambda status: status['churn'] == 1]
users_without_churn = user_churn_status.loc[lambda status: status['churn'] == 0]

In [1258]:
# Number of monthly rows each advertiser has in df_target.
user_periods = (
    df_target
    .groupby('advertiser_zrive_id')
    .size()
    .reset_index(name='num_periods')
)
avg_periods_all = user_periods['num_periods'].mean()
median_periods_all = user_periods['num_periods'].median()
print(f"numero medio de periodos por id: {avg_periods_all}")
print(f"mediana de periodos por id: {median_periods_all}")

numero medio de periodos por id: 9.000479156684236
mediana de periodos por id: 5.0


In [1259]:
# Same statistics restricted to advertisers that churn.
churn_users_ids = users_with_churn['advertiser_zrive_id'].tolist()
avg_periods_churn = user_periods.loc[
    user_periods['advertiser_zrive_id'].isin(churn_users_ids), 'num_periods'
].mean()
median_periods_churn = user_periods.loc[
    user_periods['advertiser_zrive_id'].isin(churn_users_ids), 'num_periods'
].median()
print(f"numero medio de periodos por id que churnea: {avg_periods_churn}")
print(f"mediana de periodos por id que churnea: {median_periods_churn}")

numero medio de periodos por id que churnea: 5.739048070092352
mediana de periodos por id que churnea: 4.0


In [1260]:
# Same statistics restricted to advertisers that never churn.
no_churn_users_ids = users_without_churn['advertiser_zrive_id'].tolist()
avg_periods_no_churn = user_periods.loc[
    user_periods['advertiser_zrive_id'].isin(no_churn_users_ids), 'num_periods'
].mean()
median_periods_no_churn = user_periods.loc[
    user_periods['advertiser_zrive_id'].isin(no_churn_users_ids), 'num_periods'
].median()
print(f"numero medio de periodos por id que no churnea: {avg_periods_no_churn}")
print(f"mediana de periodos por id que no churnea: {median_periods_no_churn}")

numero medio de periodos por id que no churnea: 15.758586849852797
mediana de periodos por id que no churnea: 24.0


2. Tasa de churn por mes

In [1261]:
# Mean of the 0/1 churn flag per period, i.e. the share of rows flagged as churn.
churn_by_period = df_target.groupby('period_int')['churn'].mean().reset_index()
churn_by_period = churn_by_period.sort_values('period_int')

print("\nTasa de churn por período:")
# itertuples keeps each column's own dtype. iterrows builds one Series per row,
# which upcast the integer period to float and printed it as "202301.0".
for row in churn_by_period.itertuples(index=False):
    period = row.period_int
    churn_rate = row.churn
    print(f"- {period}: {churn_rate:.2%}")


Tasa de churn por período:
- 202301.0: 11.09%
- 202302.0: 10.89%
- 202303.0: 9.54%
- 202304.0: 9.10%
- 202305.0: 8.07%
- 202306.0: 6.75%
- 202307.0: 6.87%
- 202308.0: 7.29%
- 202309.0: 8.21%
- 202310.0: 7.38%
- 202311.0: 6.13%
- 202312.0: 6.10%
- 202401.0: 8.20%
- 202402.0: 5.54%
- 202403.0: 7.48%
- 202404.0: 6.28%
- 202405.0: 8.01%
- 202406.0: 5.52%
- 202407.0: 4.56%
- 202408.0: 8.85%
- 202409.0: 5.98%
- 202410.0: 7.25%
- 202411.0: 9.09%
- 202412.0: 1.45%


3. Nuevas features

In [1262]:
def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator``; 0 wherever ``denominator`` is not > 0."""
    return numerator.div(denominator).where(denominator > 0, 0)


def create_derived_features(df):
    """
    Create ratio features derived from the monthly activity columns.

    Every feature is ``numerator / denominator`` computed column-wise, and is
    0 on rows where the denominator is not strictly positive (including a
    missing denominator). A missing numerator with a positive denominator
    stays missing.

    Args:
        df: Monthly advertiser rows containing the ``monthly_*`` columns used
            below. It is not modified.

    Returns:
        A copy of ``df`` with 18 feature columns appended.
    """
    df_features = df.copy()

    published = df_features['monthly_published_ads']
    leads = df_features['monthly_leads']
    visits = df_features['monthly_visits']
    invoice = df_features['monthly_total_invoice']
    oro = df_features['monthly_oro_ads']
    plata = df_features['monthly_plata_ads']
    destacados = df_features['monthly_destacados_ads']
    pepitas = df_features['monthly_pepitas_ads']

    # (feature name, numerator, denominator), in output column order.
    feature_specs = [
        # publication ratios
        ('ratio_published_contracted', published, df_features['monthly_contracted_ads']),
        ('ratio_unique_published', df_features['monthly_unique_published_ads'], published),
        ('ratio_distinct_published', df_features['monthly_distinct_ads'], published),
        # engagement ratios
        ('leads_per_published_ad', leads, published),
        ('visits_per_published_ad', visits, published),
        ('leads_per_visit', leads, visits),
        ('leads_per_shows', leads, df_features['monthly_shows']),
        ('calls_per_published_ads', df_features['monthly_total_calls'], published),
        # share of premium ads
        ('pct_oro_ads', oro, published),
        ('pct_plata_ads', plata, published),
        ('pct_destacados_ads', destacados, published),
        ('pct_pepitas_ads', pepitas, published),
        ('pct_premium_ads', oro + plata + destacados + pepitas, published),
        # economic ratios
        ('invoice_per_published_ad', invoice, published),
        ('invoice_per_lead', invoice, leads),
        # efficiency of premium ads
        ('leads_per_oro_ad', leads, oro),
        ('leads_per_plata_ad', leads, plata),
        ('leads_per_destacados_ad', leads, destacados),
    ]

    for feature_name, numerator, denominator in feature_specs:
        df_features[feature_name] = _ratio_or_zero(numerator, denominator)

    return df_features



In [1263]:
#añadir features a df_target
# Add the derived features to df_target.
df_target = create_derived_features(df_target)

# Reorder the columns so that `churn` comes last.
cols = [name for name in df_target.columns if name != 'churn'] + ['churn']
df_target = df_target[cols]

In [1264]:
# Display the frame after adding the derived features (pandas truncates the middle rows).
df_target

Unnamed: 0,advertiser_zrive_id,period_int,month_period,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,ratio_published_contracted,ratio_unique_published,ratio_distinct_published,leads_per_published_ad,visits_per_published_ad,leads_per_visit,leads_per_shows,calls_per_published_ads,pct_oro_ads,pct_plata_ads,pct_destacados_ads,pct_pepitas_ads,pct_premium_ads,invoice_per_published_ad,invoice_per_lead,leads_per_oro_ad,leads_per_plata_ad,leads_per_destacados_ad,churn
0,1,202301,2023-01,47,47,75,18,40890.0,6,6,6,0,2051941.5,14,15,0,440.8,12,3,15,,,True,0.626667,1.0,,0.382979,870.000000,0.000440,0.000009,0.319149,0.127660,0.127660,0.127660,0.0,0.382979,9.378723,24.488889,3.000000,3.000000,3.0,0
1,1,202302,2023-02,44,44,50,10,34740.0,3,3,1,0,1761028.5,18,6,3,464.8,5,4,9,,,True,0.880000,1.0,,0.227273,789.545455,0.000288,0.000006,0.136364,0.068182,0.068182,0.022727,0.0,0.159091,10.563636,46.480000,3.333333,3.333333,10.0,1
2,2,202301,2023-01,31,31,150,4,17970.0,10,10,4,0,1250403.0,16,2,2,75.4,2,2,4,,,True,0.206667,1.0,,0.129032,579.677419,0.000223,0.000003,0.064516,0.322581,0.322581,0.129032,0.0,0.774194,2.432258,18.850000,0.400000,0.400000,1.0,0
3,2,202302,2023-02,31,31,150,4,10695.0,10,10,4,0,1026553.5,10,2,1,54.9,2,2,4,,,True,0.206667,1.0,,0.129032,345.000000,0.000374,0.000004,0.064516,0.322581,0.322581,0.129032,0.0,0.774194,1.770968,13.725000,0.400000,0.400000,1.0,0
4,2,202303,2023-03,47,47,150,6,7845.0,10,10,4,0,688947.0,16,5,1,34.3,4,1,5,,,True,0.313333,1.0,,0.127660,166.914894,0.000765,0.000009,0.106383,0.212766,0.212766,0.085106,0.0,0.510638,0.729787,5.716667,0.600000,0.600000,1.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57157,6824,202408,2024-08,8,8,10,0,3232.5,0,0,0,0,30628.5,0,0,0,0.0,0,0,0,29962.00,10.0,False,0.800000,1.0,1.250000,0.000000,404.062500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
57158,6824,202409,2024-09,7,7,10,0,1980.0,0,0,0,0,20244.0,0,0,0,0.0,0,0,0,30990.00,8.0,False,0.700000,1.0,1.142857,0.000000,282.857143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
57159,6824,202410,2024-10,6,6,10,0,2070.0,0,0,0,0,19299.0,0,0,0,0.0,0,0,0,33002.86,7.0,False,0.600000,1.0,1.166667,0.000000,345.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
57160,6824,202411,2024-11,6,6,10,1,2055.0,0,0,0,0,20506.5,0,0,0,0.0,0,1,1,38160.00,7.0,False,0.600000,1.0,1.166667,0.166667,342.500000,0.000487,0.000049,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0


4. Añadir información de meses previos: meses activos (`months_active`) y gestión provisional de NaNs

In [1265]:
# Tenure feature: months_active numbers each advertiser's rows 1, 2, 3, ...
# The counter follows row order, so sort chronologically within each advertiser
# here instead of relying on whatever order earlier cells left behind.
df_target = df_target.sort_values(['advertiser_zrive_id', 'month_period'])

# cumcount() is the 0-based position of the row inside its advertiser group;
# +1 gives the same result as a cumulative sum over a column of ones.
df_target['months_active'] = df_target.groupby('advertiser_zrive_id').cumcount() + 1

# Show the new column right after month_period. The position is looked up by
# name rather than hard-coded, and excluding the column first keeps the cell
# re-runnable.
cols = [name for name in df_target.columns if name != 'months_active']
cols.insert(cols.index('month_period') + 1, 'months_active')
df_target = df_target[cols]
df_target.head(10)

Unnamed: 0,advertiser_zrive_id,period_int,month_period,months_active,monthly_published_ads,monthly_unique_published_ads,monthly_contracted_ads,monthly_leads,monthly_visits,monthly_oro_ads,monthly_plata_ads,monthly_destacados_ads,monthly_pepitas_ads,monthly_shows,monthly_total_phone_views,monthly_total_calls,monthly_total_emails,monthly_total_invoice,monthly_unique_calls,monthly_unique_emails,monthly_unique_leads,monthly_avg_ad_price,monthly_distinct_ads,has_active_contract,ratio_published_contracted,ratio_unique_published,ratio_distinct_published,leads_per_published_ad,visits_per_published_ad,leads_per_visit,leads_per_shows,calls_per_published_ads,pct_oro_ads,pct_plata_ads,pct_destacados_ads,pct_pepitas_ads,pct_premium_ads,invoice_per_published_ad,invoice_per_lead,leads_per_oro_ad,leads_per_plata_ad,leads_per_destacados_ad,churn
0,1,202301,2023-01,1,47,47,75,18,40890.0,6,6,6,0,2051941.5,14,15,0,440.8,12,3,15,,,True,0.626667,1.0,,0.382979,870.0,0.00044,9e-06,0.319149,0.12766,0.12766,0.12766,0.0,0.382979,9.378723,24.488889,3.0,3.0,3.0,0
1,1,202302,2023-02,2,44,44,50,10,34740.0,3,3,1,0,1761028.5,18,6,3,464.8,5,4,9,,,True,0.88,1.0,,0.227273,789.545455,0.000288,6e-06,0.136364,0.068182,0.068182,0.022727,0.0,0.159091,10.563636,46.48,3.333333,3.333333,10.0,1
2,2,202301,2023-01,1,31,31,150,4,17970.0,10,10,4,0,1250403.0,16,2,2,75.4,2,2,4,,,True,0.206667,1.0,,0.129032,579.677419,0.000223,3e-06,0.064516,0.322581,0.322581,0.129032,0.0,0.774194,2.432258,18.85,0.4,0.4,1.0,0
3,2,202302,2023-02,2,31,31,150,4,10695.0,10,10,4,0,1026553.5,10,2,1,54.9,2,2,4,,,True,0.206667,1.0,,0.129032,345.0,0.000374,4e-06,0.064516,0.322581,0.322581,0.129032,0.0,0.774194,1.770968,13.725,0.4,0.4,1.0,0
4,2,202303,2023-03,3,47,47,150,6,7845.0,10,10,4,0,688947.0,16,5,1,34.3,4,1,5,,,True,0.313333,1.0,,0.12766,166.914894,0.000765,9e-06,0.106383,0.212766,0.212766,0.085106,0.0,0.510638,0.729787,5.716667,0.6,0.6,1.5,0
5,2,202304,2023-04,4,64,64,150,4,13275.0,10,10,4,0,1104043.5,17,1,2,61.7,1,3,4,,,True,0.426667,1.0,,0.0625,207.421875,0.000301,4e-06,0.015625,0.15625,0.15625,0.0625,0.0,0.375,0.964063,15.425,0.4,0.4,1.0,0
6,2,202305,2023-05,5,79,79,150,16,16342.5,10,10,4,0,1044193.5,23,11,3,34.3,6,5,11,,1.0,True,0.526667,1.0,0.012658,0.202532,206.867089,0.000979,1.5e-05,0.139241,0.126582,0.126582,0.050633,0.0,0.303797,0.434177,2.14375,1.6,1.6,4.0,0
7,2,202306,2023-06,6,76,76,150,12,13867.5,10,10,4,0,781903.5,19,5,7,0.0,5,3,8,15990.0,1.0,True,0.506667,1.0,0.013158,0.157895,182.467105,0.000865,1.5e-05,0.065789,0.131579,0.131579,0.052632,0.0,0.315789,0.0,0.0,1.2,1.2,3.0,0
8,2,202307,2023-07,7,71,71,150,7,11325.0,10,10,4,0,640752.0,15,2,5,6.9,2,4,6,19740.0,2.0,True,0.473333,1.0,0.028169,0.098592,159.507042,0.000618,1.1e-05,0.028169,0.140845,0.140845,0.056338,0.0,0.338028,0.097183,0.985714,0.7,0.7,1.75,0
9,2,202308,2023-08,8,78,78,150,6,8640.0,10,10,4,0,538965.0,21,4,2,6.9,4,2,6,19990.0,5.0,True,0.52,1.0,0.064103,0.076923,110.769231,0.000694,1.1e-05,0.051282,0.128205,0.128205,0.051282,0.0,0.307692,0.088462,1.15,0.6,0.6,1.5,0


In [1266]:
# Provisional NaN handling.
# monthly_avg_ad_price is dropped. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df_target = df_target.drop(columns=['monthly_avg_ad_price'], errors='ignore')

# Missing values in the distinct-ads count and in its ratio are filled with 0.
# TODO: check whether ratio_distinct_published is a meaningful ratio at all.
df_target = df_target.fillna({'monthly_distinct_ads': 0, 'ratio_distinct_published': 0})

In [1267]:
# Count the missing values per column.
df_target.isnull().sum()

advertiser_zrive_id             0
period_int                      0
month_period                    0
months_active                   0
monthly_published_ads           0
monthly_unique_published_ads    0
monthly_contracted_ads          0
monthly_leads                   0
monthly_visits                  0
monthly_oro_ads                 0
monthly_plata_ads               0
monthly_destacados_ads          0
monthly_pepitas_ads             0
monthly_shows                   0
monthly_total_phone_views       0
monthly_total_calls             0
monthly_total_emails            0
monthly_total_invoice           0
monthly_unique_calls            0
monthly_unique_emails           0
monthly_unique_leads            0
monthly_distinct_ads            0
has_active_contract             0
ratio_published_contracted      0
ratio_unique_published          0
ratio_distinct_published        0
leads_per_published_ad          0
visits_per_published_ad         0
leads_per_visit                 0
leads_per_show

5. Añadir información de meses previos, lo que nos permitirá aprender tendencias de subida o bajada de las features que dan lugar a churn.


    - Se añaden nuevas columnas con la media de los 2 meses anteriores para las variables que queramos.

    - El problema es que en el primer mes de información no hay información de meses previos y no podemos calcular la media.

    -   Una opción es añadir una columna has_2m_history, que sea = 1 cuando tiene 2 meses previos y 0 cuando no; así el modelo tiene esta información disponible.

        -   Si no tiene información de meses previos, la media será el valor de ese mes.

        -   Si solo tiene un mes de información previa, será el valor del mes previo.

        -   Si tiene más de un mes de información previa, se hace la media de los 2 meses previos.

Añadir una columna que nos dice, para cada id y en cada mes, si tiene información de 2 meses previos

In [1268]:
# Sort chronologically within each advertiser so positional counters follow time order.
df_target = df_target.sort_values(['advertiser_zrive_id', 'month_period'])

# has_2m_history = 1 when the advertiser already has at least 2 earlier rows, else 0.
# cumcount() is the number of previous rows of the same advertiser in the sorted
# frame. It replaces a per-group expanding().count() lambda and gives the same
# flag as long as month_period has no missing values.
df_target['has_2m_history'] = (
    df_target.groupby('advertiser_zrive_id').cumcount() >= 2
).astype(int)

Ejemplo

In [1269]:
# Show has_2m_history next to months_active for the first rows.
df_target[['advertiser_zrive_id', 'month_period', 'months_active', 'has_2m_history']].head(10)

Unnamed: 0,advertiser_zrive_id,month_period,months_active,has_2m_history
0,1,2023-01,1,0
1,1,2023-02,2,0
2,2,2023-01,1,0
3,2,2023-02,2,0
4,2,2023-03,3,1
5,2,2023-04,4,1
6,2,2023-05,5,1
7,2,2023-06,6,1
8,2,2023-07,7,1
9,2,2023-08,8,1


Añadir 2 columnas: la media de los 2 meses previos y la tendencia (1 si el valor del mes actual es > que el valor medio de los 2 meses previos, -1 si es < y 0 si es el mismo)

In [1271]:
def add_rolling_avg_columns(df, columns_to_roll, window_size=2,
                            id_col='advertiser_zrive_id', period_col='month_period'):
    """Add a previous-months average and a trend flag for each requested column.

    For every ``col`` in ``columns_to_roll`` the returned frame gets:

    * ``col`` itself rounded to 2 decimals;
    * ``avg_previous_{window_size}m_{col}``: mean (rounded to 2 decimals) of the
      up to ``window_size`` rows that precede the current one for the same
      ``id_col``. The first row of each id has no history and receives its own
      value. Missing values are skipped by the mean;
    * ``trend_{col}``: 1 if the current value is above that average, -1 if it
      is below, 0 otherwise (including when either side is NaN).

    The computation uses group-wise shifts instead of one ``.loc`` write per
    row, so it does not depend on the index labels being unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col``, ``period_col`` and every column listed in
        ``columns_to_roll``. It is not modified.
    columns_to_roll : iterable of str
        Numeric columns to process.
    window_size : int, default 2
        Maximum number of previous rows averaged.
    id_col : str, default 'advertiser_zrive_id'
        Column identifying the entity whose history is followed.
    period_col : str, default 'month_period'
        Column giving the chronological order inside each entity.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``[id_col, period_col]`` with the new columns.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    # sort_values returns a new frame, so the caller's object is left untouched.
    df = df.sort_values([id_col, period_col])

    # True on the first row of every id: there is nothing to average there.
    is_first_row = (df.groupby(id_col).cumcount() == 0).to_numpy()

    for col in columns_to_roll:
        # Round the source column so values and averages share the same precision.
        df[col] = df[col].round(2)
        current = df[col].to_numpy()

        # One column per lag (oldest first). Rows with less history than the
        # window hold NaN in the older lags, and mean() skips those.
        by_id = df.groupby(id_col)[col]
        lagged = pd.DataFrame(
            {lag: by_id.shift(lag).to_numpy() for lag in range(window_size, 0, -1)},
            index=df.index,
        )
        prev_avg = lagged.mean(axis=1).round(2).to_numpy()

        # No previous rows: fall back to the row's own (rounded) value.
        prev_avg = np.where(is_first_row, current, prev_avg)

        df[f'avg_previous_{window_size}m_{col}'] = prev_avg
        # Comparisons against NaN are False on both sides, which yields trend 0.
        df[f'trend_{col}'] = (current > prev_avg).astype(int) - (current < prev_avg).astype(int)

    return df

Ejemplo: Calculamos esta nueva info para la columna leads_per_published_ad

In [1272]:
# Columns that will get a previous-months average and a trend flag.
columns_to_calculate = [
    'leads_per_published_ad'
]

In [1273]:
# Adds avg_previous_2m_<col> and trend_<col> for each listed column (window_size defaults to 2).
df_target = add_rolling_avg_columns(df_target, columns_to_calculate)

Visualizar exactamente estas nuevas columnas

In [1274]:
# Inspect the source column next to the history flag, the previous-months average, the trend and the target.
df_target[['advertiser_zrive_id', 'month_period', 'months_active', 'leads_per_published_ad', 'has_2m_history', 'avg_previous_2m_leads_per_published_ad', 'trend_leads_per_published_ad','churn']].head(20)

Unnamed: 0,advertiser_zrive_id,month_period,months_active,leads_per_published_ad,has_2m_history,avg_previous_2m_leads_per_published_ad,trend_leads_per_published_ad,churn
0,1,2023-01,1,0.38,0,0.38,0,0
1,1,2023-02,2,0.23,0,0.38,-1,1
2,2,2023-01,1,0.13,0,0.13,0,0
3,2,2023-02,2,0.13,0,0.13,0,0
4,2,2023-03,3,0.13,1,0.13,0,0
5,2,2023-04,4,0.06,1,0.13,-1,0
6,2,2023-05,5,0.2,1,0.1,1,0
7,2,2023-06,6,0.16,1,0.13,1,0
8,2,2023-07,7,0.1,1,0.18,-1,0
9,2,2023-08,8,0.08,1,0.13,-1,0


Explicación:

- advertiser_zrive_id == 1 
    -   en la fila 2023-01 no tiene 2 meses previos por lo que has_2m_history es = 0 y avg_previous_2m_leads_per_published_ad es el valor del mes actual, trend_leads_per_published_ad = 0 porque la tendencia no es ni positiva ni negativa

    -   en la fila 2023-02 no tiene 2 meses previos, por lo que has_2m_history = 0 y avg_previous_2m_leads_per_published_ad es el valor del mes previo; trend_leads_per_published_ad = -1 porque la tendencia es negativa, ya que en este mes tenemos menos leads por anuncio publicado que en el anterior


- advertiser_zrive_id == 2 

    -   filas 2023-01 y 2023-02 igual que advertiser_zrive_id == 1 

    -   en la fila 2023-03 tiene 2 meses previos, por lo que has_2m_history = 1 y avg_previous_2m_leads_per_published_ad es la media de leads_per_published_ad en los 2 meses previos ((0.13 + 0.13) / 2); trend_leads_per_published_ad = 0 porque leads_per_published_ad este mes es igual a la media de los 2 meses previos

    - el resto de filas igual

Podemos calcular estos valores para las columnas que indiquemos en `columns_to_calculate`:

In [1275]:
# TODO: placeholder list. Replace the empty strings with real column names
# before using it; presumably it is meant for add_rolling_avg_columns (confirm),
# where '' would fail unless df_target has a column with that name.
columns_to_calculate = [
    '',
    '',
    '',
    ''
]