In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from operator import attrgetter

# Directory that holds the parquet inputs read below (relative to the notebook's working directory).
path = Path("../data")

FEATURE ENGINEERING

- add_time_features(): adds months_since_signup, months_since_new_contract, no_new_contract

- add_ratios(): adds ratios of different features

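- add_agg_stats(): adds rolling aggregates (mean, standard deviation, minimum, maximum) of the selected features over the last few months. Because there are many possible feature/statistic combinations, it is probably better to compute only a few of them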

As a further idea, extra features could be created that compare each month's features and ratios with the aggregate features of the preceding months


In [None]:
# Read the advertiser dimension table and the processed monthly dataset from the data directory.
advertisers, df = (
    pd.read_parquet(path / file_name)
    for file_name in ("zrive_dim_advertiser.parquet", "processed_data.parquet")
)

In [3]:
# Inspect the columns available in the processed monthly dataset.
df.columns

Index(['advertiser_zrive_id', 'period_int', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'has_active_contract', 'month_period', 'churn'],
      dtype='object')

In [4]:
# Inspect the columns of the advertiser table (contract dates used for the time features).
advertisers.columns

Index(['advertiser_zrive_id', 'province_id', 'updated_at',
       'advertiser_province', 'advertiser_group_id', 'min_start_contrato_date',
       'max_start_contrato_nuevo_date', 'contrato_churn_date'],
      dtype='object')

TIME FEATURES

In [None]:
def add_time_features(df,advertisers):
    '''
    Adds the following features to the dataset: 
    months_since_signup, months_since_new_contract, no_new_contract

    Parameters
    ----------
    df : DataFrame with one row per advertiser and month. Must contain
        'advertiser_zrive_id', 'period_int' and 'month_period'.
    advertisers : DataFrame with the advertiser attributes. Must contain
        'advertiser_zrive_id' plus every advertiser column listed in the
        final drop below.

    Returns
    -------
    A new DataFrame (the inputs are not modified) with the three features
    added and the period / advertiser helper columns removed.

    Notes
    -----
    - months_since_signup is NaN when the signup date is missing or cannot
      be parsed (including advertisers absent from `advertisers`).
    - months_since_new_contract is NaN when there is no new-contract date or
      when that date lies after the row's month; no_new_contract is 1 in
      exactly those cases and 0 otherwise.
    - contrato_churn_date is deliberately not turned into a feature here.
      NOTE(review): it looks tied to the churn target, so using it would
      presumably leak the label - confirm before adding a months_left feature.
    '''

    def _month_number(dates):
        # Express each date as a running count of calendar months so that
        # month differences are plain subtractions. Missing dates (NaT)
        # become NaN and propagate through the arithmetic.
        return dates.dt.year * 12 + dates.dt.month

    merged = df.merge(advertisers, on='advertiser_zrive_id', how='left')

    # month_period is required, so parsing errors are raised; the contract
    # dates may be absent or malformed and are coerced to missing instead.
    current_month = _month_number(pd.to_datetime(merged['month_period']))
    signup_month = _month_number(
        pd.to_datetime(merged['min_start_contrato_date'], errors='coerce')
    )
    new_contract_month = _month_number(
        pd.to_datetime(merged['max_start_contrato_nuevo_date'], errors='coerce')
    )

    #Compute months_since_signup
    merged['months_since_signup'] = current_month - signup_month

    #Compute months_since_new_contract. Negative values mean the new contract
    #starts after the current month, so they are treated as "no new contract yet".
    months_since_new = current_month - new_contract_month
    merged['months_since_new_contract'] = months_since_new.where(months_since_new >= 0)
    merged['no_new_contract'] = merged['months_since_new_contract'].isna().astype(int)

    return merged.drop(['period_int','month_period', 'province_id', 'updated_at',
       'advertiser_province', 'advertiser_group_id', 'min_start_contrato_date',
       'max_start_contrato_nuevo_date', 'contrato_churn_date'], axis=1)


In [21]:
# Build the time features from the monthly data and the advertiser table, then list the resulting columns.
# NOTE(review): the output saved below lists a 'months_left' column that add_time_features as defined
# above does not create, so that output is stale - restart the kernel and re-run to refresh it.
df2 = add_time_features(df,advertisers)
df2.columns

Index(['advertiser_zrive_id', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'has_active_contract', 'churn',
       'months_since_signup', 'months_since_new_contract', 'no_new_contract',
       'months_left'],
      dtype='object')

RATIOS

In [7]:
def add_ratios(df):
    '''
    Adds the following features to the dataset: 
    published_per_contracted_ads, leads_per_visit, leads_per_ad, leads_per_premium_ad, ROI

    Parameters
    ----------
    df : DataFrame containing the monthly_* columns used below.

    Returns
    -------
    A copy of `df` with the five ratio columns added; `df` itself is left
    untouched so re-running a cell never changes an earlier frame.

    Notes
    -----
    A ratio whose denominator is zero is undefined and is returned as NaN
    (never +/-inf), so downstream models only have one kind of missing value
    to deal with.
    '''

    def _safe_ratio(numerator, denominator):
        # Mask zero denominators so the division yields NaN rather than inf.
        return numerator / denominator.where(denominator != 0)

    out = df.copy()

    # Explicit addition (not .sum) so a missing component keeps the total missing.
    premium_ads = (
        out['monthly_oro_ads']
        + out['monthly_plata_ads']
        + out['monthly_destacados_ads']
        + out['monthly_pepitas_ads']
    )
    contracted_value = out['monthly_avg_ad_price'] * out['monthly_contracted_ads']

    out['published_per_contracted_ads'] = _safe_ratio(out['monthly_published_ads'], out['monthly_contracted_ads'])
    out['leads_per_visit'] = _safe_ratio(out['monthly_leads'], out['monthly_visits'])
    out['leads_per_ad'] = _safe_ratio(out['monthly_leads'], out['monthly_published_ads'])
    out['leads_per_premium_ad'] = _safe_ratio(out['monthly_leads'], premium_ads)
    out['ROI'] = _safe_ratio(out['monthly_total_invoice'], contracted_value)

    return out

In [8]:
# Hand add_ratios a copy so df2 keeps exactly the columns add_time_features produced;
# this keeps the cell safe to re-run in any order.
# NOTE(review): the output saved below lists 'months_left' and 'permanent_contract', which none of the
# functions defined above create, so that output is stale - restart the kernel and re-run to refresh it.
df3 = add_ratios(df2.copy())
df3.columns

Index(['advertiser_zrive_id', 'monthly_published_ads',
       'monthly_unique_published_ads', 'monthly_contracted_ads',
       'monthly_leads', 'monthly_visits', 'monthly_oro_ads',
       'monthly_plata_ads', 'monthly_destacados_ads', 'monthly_pepitas_ads',
       'monthly_shows', 'monthly_total_phone_views', 'monthly_total_calls',
       'monthly_total_emails', 'monthly_total_invoice', 'monthly_unique_calls',
       'monthly_unique_emails', 'monthly_unique_leads', 'monthly_avg_ad_price',
       'monthly_distinct_ads', 'has_active_contract', 'churn',
       'months_since_signup', 'months_since_new_contract', 'no_new_contract',
       'months_left', 'permanent_contract', 'published_per_contracted_ads',
       'leads_per_visit', 'leads_per_ad', 'leads_per_premium_ad', 'ROI'],
      dtype='object')

AGGREGATE STATS

In [9]:
def add_agg_stats(df, features, months = 3, agg_funcs=('mean', 'std', 'min', 'max'), sort_by=None):
    '''
    Adds aggregate features over the last months for the features passed to the function 

    Parameters
    ----------
    df : DataFrame with an 'advertiser_zrive_id' column and the columns named
        in `features`. It is not modified.
    features : column name, or list of column names, to aggregate.
    months : size of the rolling window, counted in rows per advertiser. The
        window includes the current row and is allowed to be shorter at the
        start of each advertiser's history (min_periods=1).
    agg_funcs : aggregation name, or iterable of names, understood by
        pandas rolling `.agg` (e.g. 'mean', 'std', 'min', 'max').
    sort_by : optional column name (or list of names) that orders the rows
        in time. When given, the windows are computed in that order and the
        results are written back to the original row positions. When None
        (default) the windows follow the row order of `df`, so the caller
        must make sure each advertiser's rows are already chronological.

    Returns
    -------
    A copy of `df`, in its original row order, with one extra column per
    feature and aggregation named '<feature>_<months>_months_<agg_func>'.
    '''

    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = [features]
    if isinstance(agg_funcs, str):
        agg_funcs = [agg_funcs]

    df_agg = df.copy()

    # Work on a positional (0..n-1) index so results can be mapped back to the
    # original rows even when df's own index has duplicate labels.
    ordered = df_agg.reset_index(drop=True)
    if sort_by is not None:
        # Stable sort keeps the incoming order for rows with equal sort keys.
        ordered = ordered.sort_values(sort_by, kind='mergesort')

    for feature in features:
        per_advertiser = ordered.groupby('advertiser_zrive_id')[feature]
        for agg_func in agg_funcs:
            col_name = f'{feature}_{months}_months_{agg_func}'
            rolled = per_advertiser.transform(
                lambda x, func=agg_func: x.rolling(window=months, min_periods=1).agg(func)
            )
            # sort_index() restores the original row positions before assigning.
            df_agg[col_name] = rolled.sort_index().to_numpy()

    return df_agg


In [None]:
# Rolling statistics are computed in row order, so put each advertiser's rows in
# chronological order first; otherwise the 3-month windows would mix arbitrary months.
published_ads_agg = (
    df[['advertiser_zrive_id','month_period','monthly_published_ads']]
    .sort_values(['advertiser_zrive_id','month_period'])
)
feature = ['monthly_published_ads']
add_agg_stats(published_ads_agg, feature).head(20)

Unnamed: 0,advertiser_zrive_id,month_period,monthly_published_ads,monthly_published_ads_3_months_mean,monthly_published_ads_3_months_std,monthly_published_ads_3_months_min,monthly_published_ads_3_months_max
0,1,2023-01,47,47.0,,47.0,47.0
1,1,2023-02,44,45.5,2.12132,44.0,47.0
2,2,2023-01,31,31.0,,31.0,31.0
3,2,2023-02,31,31.0,0.0,31.0,31.0
4,2,2023-03,47,36.333333,9.237604,31.0,47.0
5,2,2023-04,64,47.333333,16.502525,31.0,64.0
6,2,2023-05,79,63.333333,16.010413,47.0,79.0
7,2,2023-06,76,73.0,7.937254,64.0,79.0
8,2,2023-07,71,75.333333,4.041452,71.0,79.0
9,2,2023-08,78,75.0,3.605551,71.0,78.0
