In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from operator import attrgetter

# Data folder, relative to the notebook's working directory: the input parquet
# files are read from here and the engineered dataset is written back here.
path = Path("../data")

FEATURE ENGINEERING

- create_time_features(): creates tenure, months_since_last_contract and has_renewed. A possible extension is a feature with the duration of the current contract.

- create_ratios(): creates ratios of various features.

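- create_agg_stats(): creates rolling means, stds, mins and maxes over each advertiser's most recent rows. On an advertiser's first row the window contains only that row, so the mean, min and max equal its value (and the std is undefined). As there are many possible feature/statistic combinations, it is probably better to compute only a few of them.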


In [2]:
# Advertiser dimension table (province, group and contract start/churn dates).
advertisers = pd.read_parquet(path / "zrive_dim_advertiser.parquet")
# Monthly metrics per advertiser; this is the table the features are built on.
df = pd.read_parquet(path / "processed_data.parquet")

In [3]:
# Columns available in the monthly metrics table.
df.columns

Index(['advertiser_zrive_id', 'period_int', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'has_active_contract', 'month_period', 'churn'],
      dtype='object')

In [4]:
# Columns available in the advertiser dimension table.
advertisers.columns

Index(['advertiser_zrive_id', 'province_id', 'updated_at',
       'advertiser_province', 'advertiser_group_id', 'min_start_contrato_date',
       'max_start_contrato_nuevo_date', 'contrato_churn_date'],
      dtype='object')

TIME FEATURES

In [5]:
# Order rows chronologically within each advertiser so that shift(1) returns
# the advertiser's previous row.
df_sorted = df.sort_values(by=['advertiser_zrive_id', 'month_period'])
# Contract flag of the previous row (missing on each advertiser's first row).
df_sorted['prev_active'] = df_sorted.groupby('advertiser_zrive_id')['has_active_contract'].shift(1)
# Rows where the flag goes from True on the previous row to False on this one.
condition = (df_sorted['has_active_contract'] == False) & (df_sorted['prev_active'] == True)
advertisers_to_fix = df_sorted.loc[condition, 'advertiser_zrive_id'].unique()

# Show the ids of the advertisers with at least one such True -> False transition.
advertisers_to_fix

array([ 311,  801, 1844, 2289, 2657, 2819, 2874, 3082, 3732, 4201, 4804,
       5480])

In [6]:
# Contract-flag history of one flagged advertiser, in chronological order.
(
    df[df['advertiser_zrive_id'] == 3732]
    .sort_values(by='month_period')
    .loc[:, ['advertiser_zrive_id', 'period_int', 'has_active_contract']]
)

Unnamed: 0,advertiser_zrive_id,period_int,has_active_contract
2748,3732,202301,True
5929,3732,202302,True
8856,3732,202303,True
11656,3732,202304,False
35134,3732,202402,True
37317,3732,202403,True
39491,3732,202404,True
41634,3732,202405,True
43764,3732,202406,True
45946,3732,202407,True


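A `False` that appears after months with an active contract (as in 2023-04 above) is most likely a data error rather than a real lapse, so the flag is corrected to `True` for these advertisers.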

In [7]:
# Correct only the suspicious values: a False that shows up after the advertiser
# has already had an active contract. Rows that are False *before* the
# advertiser's first active month are left untouched, because they describe
# genuine pre-contract activity rather than a data error.
# df_sorted is ordered by advertiser and month, so a running maximum of the flag
# tells, for every row, whether an active contract has been seen so far.
is_active = df_sorted['has_active_contract'].astype(bool)
had_contract_so_far = (
    is_active.astype(int)
    .groupby(df_sorted['advertiser_zrive_id'])
    .cummax()
    .astype(bool)
)
df_sorted.loc[had_contract_so_far & ~is_active, 'has_active_contract'] = True

# The helper column was only needed to detect the transitions.
df_sorted = df_sorted.drop(columns='prev_active')
# From here on `df` is the sorted, corrected table.
df = df_sorted

In [None]:
def create_time_features(df, advertisers):
    '''
    Add contract-timing features to the monthly advertiser table.

    Parameters
    ----------
    df : pd.DataFrame
        Monthly rows per advertiser. Must contain 'advertiser_zrive_id',
        'has_active_contract' and 'month_period'; the latter has to be a monthly
        Period column because it is subtracted from month-converted dates.
    advertisers : pd.DataFrame
        Advertiser dimension table. Must contain 'advertiser_zrive_id',
        'min_start_contrato_date' and 'max_start_contrato_nuevo_date'.
        Unparseable or missing dates are treated as missing.

    Returns
    -------
    pd.DataFrame
        A new frame (inputs are not modified) with the columns of `df` except
        'has_active_contract', plus:

        - tenure: months between the row's month and the advertiser's start
          month. The start month is the earlier of the first contract month and
          the first month the advertiser appears in `df`, so activity recorded
          before the first contract counts towards tenure.
        - months_since_last_contract: months since the latest new contract
          started; falls back to `tenure` when there is no such contract or it
          starts after the row's month.
        - has_renewed: 1 when the latest new contract has already started in
          the row's month, else 0.
    '''

    # Only the two date columns are needed, so nothing else from the dimension
    # table has to be merged in and dropped again afterwards.
    merged = df.merge(
        advertisers[['advertiser_zrive_id',
                     'min_start_contrato_date',
                     'max_start_contrato_nuevo_date']],
        on='advertiser_zrive_id',
        how='left',
    )

    def _to_month(dates):
        # Invalid / missing dates become NaT instead of raising.
        return pd.to_datetime(dates, errors='coerce').dt.to_period('M')

    def _months_elapsed(current, reference):
        # Subtracting Period columns yields month offsets (NaT where a side is
        # missing); `.n` is the number of months. Negative gaps mean the
        # reference lies in the future and are reported as missing.
        offsets = current - reference
        months = pd.to_numeric(
            offsets.map(lambda off: off.n if pd.notna(off) else np.nan)
        )
        return months.where(months >= 0)

    current_month = merged['month_period']
    contract_start = _to_month(merged['min_start_contrato_date'])
    renewal_start = _to_month(merged['max_start_contrato_nuevo_date'])

    # Earliest month with any recorded activity for the advertiser.
    first_seen = merged.groupby('advertiser_zrive_id')['month_period'].transform('min')
    # Keep the contract start only when it is known and precedes the first
    # observed month; comparisons against NaT are False, so a missing contract
    # date falls back to the first observed month.
    start_month = contract_start.where(contract_start < first_seen, first_seen)

    tenure = _months_elapsed(current_month, start_month)
    since_renewal = _months_elapsed(current_month, renewal_start)

    merged['tenure'] = tenure
    merged['months_since_last_contract'] = since_renewal.fillna(tenure)
    merged['has_renewed'] = since_renewal.notna().astype(int)

    return merged.drop(columns=['min_start_contrato_date',
                                'max_start_contrato_nuevo_date',
                                'has_active_contract'])


In [9]:
# Monthly table enriched with tenure, months_since_last_contract and has_renewed.
df2 = create_time_features(df,advertisers)
df2.columns

Index(['advertiser_zrive_id', 'period_int', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'month_period', 'churn', 'tenure',
       'months_since_last_contract', 'has_renewed'],
      dtype='object')

Example: time features for advertiser 5

In [10]:
# Contract dates of advertiser 5, to compare against the computed features below.
advertisers[advertisers.advertiser_zrive_id == 5][['advertiser_zrive_id','min_start_contrato_date','max_start_contrato_nuevo_date']]

Unnamed: 0,advertiser_zrive_id,min_start_contrato_date,max_start_contrato_nuevo_date
3234,5,2024-07-30,2024-09-30


In [11]:
# Time features computed for the same advertiser, one row per month.
df2[df2.advertiser_zrive_id == 5][['advertiser_zrive_id', 'month_period', 'churn', 'tenure', 'months_since_last_contract', 'has_renewed']]

Unnamed: 0,advertiser_zrive_id,month_period,churn,tenure,months_since_last_contract,has_renewed
52,5,2024-07,0.0,0,0.0,0
53,5,2024-08,0.0,1,1.0,0
54,5,2024-09,0.0,2,0.0,1
55,5,2024-10,1.0,3,1.0,1


RATIOS

In [19]:
def create_ratios(df):
    '''
    Return a copy of `df` extended with the premium-ad total and ratio features.

    The premium total is the sum of the oro, plata, destacados and pepitas ad
    counts. Every ratio is numerator / denominator where the denominator is
    strictly positive and 0 everywhere else, so a zero denominator never
    produces inf or NaN in the new columns.
    '''

    premium_columns = (
        'monthly_oro_ads',
        'monthly_plata_ads',
        'monthly_destacados_ads',
        'monthly_pepitas_ads',
    )

    # new column -> (numerator column, denominator column), in output order
    ratio_definitions = {
        # Ads ratios
        'ratio_published_contracted': ('monthly_published_ads', 'monthly_contracted_ads'),
        'ratio_unique_published': ('monthly_unique_published_ads', 'monthly_published_ads'),
        'ratio_premium_ads': ('monthly_total_premium_ads', 'monthly_published_ads'),
        # Engagement ratios
        'leads_per_published_ad': ('monthly_leads', 'monthly_published_ads'),
        'leads_per_premium_ad': ('monthly_leads', 'monthly_total_premium_ads'),
        'visits_per_published_ad': ('monthly_visits', 'monthly_published_ads'),
        'leads_per_visit': ('monthly_leads', 'monthly_visits'),
        'leads_per_shows': ('monthly_leads', 'monthly_shows'),
        # Economic ratios
        'invoice_per_published_ad': ('monthly_total_invoice', 'monthly_published_ads'),
        'invoice_per_lead': ('monthly_total_invoice', 'monthly_leads'),
        # Deliberately left out: return_on_investment and expenditure_per_lead,
        # both based on monthly_avg_ad_price * monthly_contracted_ads. They are
        # not useful if monthly_avg_ad_price refers to car prices.
    }

    enriched = df.copy()

    premium_total = enriched[premium_columns[0]]
    for tier in premium_columns[1:]:
        premium_total = premium_total + enriched[tier]
    enriched['monthly_total_premium_ads'] = premium_total

    for ratio_name, (top, bottom) in ratio_definitions.items():
        numerator = enriched[top]
        denominator = enriched[bottom]
        # 0 wherever the denominator is not strictly positive
        enriched[ratio_name] = np.where(denominator > 0, numerator / denominator, 0)

    return enriched

In [20]:
# Time-feature table extended with the premium-ad total and the ratio columns.
df3 = create_ratios(df2)
df3.columns

Index(['advertiser_zrive_id', 'period_int', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'month_period', 'churn', 'tenure',
       'months_since_last_contract', 'has_renewed',
       'monthly_total_premium_ads', 'ratio_published_contracted',
       'ratio_unique_published', 'ratio_premium_ads', 'leads_per_published_ad',
       'leads_per_premium_ad', 'visits_per_published_ad', 'leads_per_visit',
       'leads_per_shows', 'invoice_per_published_ad', 'invoice_per_lead'],
      dtype='object')

AGGREGATE STATS

In [14]:
def create_agg_stats(df, features, months=3, agg_funcs=('mean', 'std', 'min', 'max'), add_deltas=True):
    '''
    Add rolling aggregate columns, computed per advertiser, for `features`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'advertiser_zrive_id', 'month_period' and every column
        named in `features`. It is not modified.
    features : iterable of str
        Columns to aggregate.
    months : int, default 3
        Size of the rolling window, counted in rows of the advertiser ordered
        by 'month_period'. This equals calendar months only when the
        advertiser has a row for every month. Shorter histories use the rows
        available so far (min_periods=1).
    agg_funcs : str or iterable of str
        Rolling statistics to compute; a single name may be passed as a string.
    add_deltas : bool, default True
        When 'mean' is among `agg_funcs`, also add the difference between the
        current value and its rolling mean.

    Returns
    -------
    pd.DataFrame
        A new frame sorted by advertiser and month with, per feature and
        statistic, a column '{feature}_{months}_months_{stat}' and, optionally,
        '{feature}_{months}_months_mean_delta' right after the mean column.
    '''

    # A bare string would otherwise be iterated character by character.
    if isinstance(agg_funcs, str):
        agg_funcs = [agg_funcs]

    # sort_values returns a new frame, so the caller's data stays untouched and
    # the rows are already in their final order (no re-sort needed at the end).
    df_agg = df.sort_values(by=['advertiser_zrive_id', 'month_period'], ascending=[True, True])
    advertiser_ids = df_agg['advertiser_zrive_id']

    for feature in features:
        for agg_func in agg_funcs:
            col_name = f'{feature}_{months}_months_{agg_func}'
            df_agg[col_name] = (
                df_agg[feature]
                .groupby(advertiser_ids)
                .transform(lambda x: x.rolling(window=months, min_periods=1).agg(agg_func))
            )

            if add_deltas and agg_func == 'mean':
                # How far the current value sits above/below its recent average.
                delta_col = f'{feature}_{months}_months_mean_delta'
                df_agg[delta_col] = df_agg[feature] - df_agg[col_name]

    return df_agg


Example: add aggregate stats by advertiser id for monthly_published_ads in the last 3 months, with mean deltas

In [15]:
# Minimal input for the demo: the grouping key, the ordering column and one feature.
published_ads_agg = df[['advertiser_zrive_id','month_period','monthly_published_ads']]
feature = ['monthly_published_ads']
# Default settings: window of 3, all four statistics and the mean delta.
create_agg_stats(published_ads_agg, feature).head(30)


Unnamed: 0,advertiser_zrive_id,month_period,monthly_published_ads,monthly_published_ads_3_months_mean,monthly_published_ads_3_months_mean_delta,monthly_published_ads_3_months_std,monthly_published_ads_3_months_min,monthly_published_ads_3_months_max
0,1,2023-01,47,47.0,0.0,,47.0,47.0
3553,1,2023-02,44,45.5,-1.5,2.12132,44.0,47.0
1,2,2023-01,31,31.0,0.0,,31.0,31.0
3554,2,2023-02,31,31.0,0.0,0.0,31.0,31.0
6657,2,2023-03,47,36.333333,10.666667,9.237604,31.0,47.0
9584,2,2023-04,64,47.333333,16.666667,16.502525,31.0,64.0
12333,2,2023-05,79,63.333333,15.666667,16.010413,47.0,79.0
14979,2,2023-06,76,73.0,3.0,7.937254,64.0,79.0
17541,2,2023-07,71,75.333333,-4.333333,4.041452,71.0,79.0
20035,2,2023-08,78,75.0,3.0,3.605551,71.0,78.0


In [21]:
# Selected raw monthly metrics plus every ratio produced by create_ratios().
features_to_aggregate = [
    'monthly_leads',
    'monthly_visits',
    'monthly_total_invoice',
    'monthly_avg_ad_price',
    'monthly_published_ads',
    'monthly_contracted_ads',
    'ratio_published_contracted',
    'ratio_unique_published',
    'ratio_premium_ads',
    'leads_per_published_ad',
    'leads_per_premium_ad',
    'visits_per_published_ad',
    'leads_per_visit',
    'leads_per_shows',
    'invoice_per_published_ad',
    'invoice_per_lead',
]
# Only the rolling mean (and its delta) is kept, to limit the number of columns.
agg_funcs = ['mean']
df4 = create_agg_stats(
    df3,
    features=features_to_aggregate,
    months=3,
    agg_funcs=agg_funcs,
    add_deltas=True,
)


In [25]:
# Persist the fully engineered table next to the input files.
df4.to_parquet(path / "engineered_data.parquet")
