<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Feature-tools-on-entirety" data-toc-modified-id="Feature-tools-on-entirety-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Feature tools on entirety</a></span></li><li><span><a href="#Interactions-in-test&amp;train-dfs" data-toc-modified-id="Interactions-in-test&amp;train-dfs-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Interactions in test&amp;train dfs</a></span></li><li><span><a href="#Merge-automated-&amp;-manual-aggs" data-toc-modified-id="Merge-automated-&amp;-manual-aggs-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Merge automated &amp; manual aggs</a></span></li></ul></div>

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 4
%autosave 120

Autosaving every 120 seconds


In [2]:
from fastai.io import *
from fastai.structured import *
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from pandas_summary import DataFrameSummary
from IPython.display import display
from sklearn import metrics
import feather
import gc
import featuretools as ft

In [3]:
hist_trans = feather.read_dataframe('hist_trans_beta')
new_hist_trans = feather.read_dataframe('new_hist_trans_beta')

In [4]:
def additional_feats(hist_df):
#     hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']
    hist_df['duration'] = hist_df['purchase_amount']*hist_df['month_diff']
    hist_df['amount_month_ratio'] = hist_df['purchase_amount']/hist_df['month_diff']
    return hist_df

In [5]:
dfs = [hist_trans, new_hist_trans]

In [6]:
hist_trans, new_hist_trans = [additional_feats(df) for df in dfs]

In [7]:
PATH = 'data/elo/'
train, test = [pd.read_csv(f'{PATH}{c}') for c in ['train.csv', 'test.csv']]

In [8]:
temp_hist_trans = hist_trans.loc[hist_trans['card_id'].isin(['C_ID_92a2005557', 'C_ID_2223b33279', "C_ID_ae86f7d5fb"])]

In [9]:
temp_new_trans = new_hist_trans.loc[new_hist_trans['card_id'].isin(['C_ID_92a2005557', 'C_ID_2223b33279', "C_ID_ae86f7d5fb"])]

In [10]:
for df in [temp_hist_trans, temp_new_trans]:
    df.drop(['purchase_Year', 'purchase_Month', 'purchase_Week', 'purchase_Day',
       'purchase_Dayofweek', 'purchase_Dayofyear', 'purchase_Is_month_end', 'purchase_Is_month_start',
       'purchase_Is_quarter_end', 'purchase_Is_quarter_start', 'purchase_Is_year_end',
       'purchase_Is_year_start', 'purchase_Elapsed'], axis=1, inplace=True)

In [11]:
temp_hist_trans.shape, temp_new_trans.shape

((1028, 20), (68, 20))

In [12]:
temp_new_trans.columns.values

array(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments', 'category_3',
       'merchant_category_id', 'merchant_id', 'month_lag', 'purchase_amount', 'purchase_date', 'category_2',
       'state_id', 'subsector_id', 'purchased_on_weekend', 'purchased_on_weekday', 'month_diff',
       'purchase_date_successive_diff', 'duration', 'amount_month_ratio'], dtype=object)

In [13]:
cat_cols = ['authorized_flag', 'card_id', 'city_id', 'category_1', 'category_3',
       'merchant_category_id', 'merchant_id', 'category_2',
       'state_id', 'subsector_id', 'purchased_on_weekend']

In [14]:
cat_dict = {c:ft.variable_types.Categorical for c in cat_cols}

In [15]:
cat_dict['purchase_date'] = ft.variable_types.Datetime

In [16]:
cat_dict

{'authorized_flag': featuretools.variable_types.variable.Categorical,
 'card_id': featuretools.variable_types.variable.Categorical,
 'city_id': featuretools.variable_types.variable.Categorical,
 'category_1': featuretools.variable_types.variable.Categorical,
 'category_3': featuretools.variable_types.variable.Categorical,
 'merchant_category_id': featuretools.variable_types.variable.Categorical,
 'merchant_id': featuretools.variable_types.variable.Categorical,
 'category_2': featuretools.variable_types.variable.Categorical,
 'state_id': featuretools.variable_types.variable.Categorical,
 'subsector_id': featuretools.variable_types.variable.Categorical,
 'purchased_on_weekend': featuretools.variable_types.variable.Categorical,
 'purchase_date': featuretools.variable_types.variable.Datetime}

### Feature tools on entirety

In [17]:
es = ft.EntitySet()

In [18]:
temp_es = es.entity_from_dataframe('temp_hist_trans', temp_hist_trans, 'index', time_index='purchase_date',
                                        variable_types=cat_dict)
temp_es = es.entity_from_dataframe('temp_new_trans', temp_new_trans, 'index', time_index='purchase_date',
                                        variable_types=cat_dict)




In [19]:
temp_es

Entityset: None
  Entities:
    temp_hist_trans [Rows: 1028, Columns: 21]
    temp_new_trans [Rows: 68, Columns: 21]
  Relationships:
    No relationships

In [20]:
temp_es['temp_new_trans']

Entity: temp_new_trans
  Variables:
    index (dtype: index)
    installments (dtype: numeric)
    month_lag (dtype: numeric)
    purchase_amount (dtype: numeric)
    purchased_on_weekday (dtype: numeric)
    month_diff (dtype: numeric)
    purchase_date_successive_diff (dtype: numeric)
    duration (dtype: numeric)
    amount_month_ratio (dtype: numeric)
    authorized_flag (dtype: categorical)
    card_id (dtype: categorical)
    city_id (dtype: categorical)
    category_1 (dtype: categorical)
    category_3 (dtype: categorical)
    merchant_category_id (dtype: categorical)
    merchant_id (dtype: categorical)
    category_2 (dtype: categorical)
    state_id (dtype: categorical)
    subsector_id (dtype: categorical)
    purchased_on_weekend (dtype: categorical)
    purchase_date (dtype: datetime_time_index)
  Shape:
    (Rows: 68, Columns: 21)

In [21]:
temp_train = train[train['card_id'].isin(['C_ID_92a2005557', 'C_ID_2223b33279', "C_ID_ae86f7d5fb"])]

In [22]:
temp_test = test[test['card_id'].isin(['C_ID_92a2005557', 'C_ID_2223b33279', "C_ID_ae86f7d5fb"])]

In [23]:
temp_train.shape, temp_test.shape

((2, 6), (1, 5))

In [24]:
temp_train

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
91918,2017-06,C_ID_ae86f7d5fb,5,2,1,-0.563362


In [25]:
temp_test

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
116681,2016-12,C_ID_2223b33279,3,2,1


In [26]:
train_cat_cols = ['feature_1', 'feature_2', 'feature_3']

In [27]:
train_cat_cols = {c:ft.variable_types.Ordinal for c in train_cat_cols}

In [28]:
temp_es = es.entity_from_dataframe('train', temp_train, 'card_id', time_index='first_active_month',
                                        variable_types=train_cat_cols)

In [29]:
temp_es['train']

Entity: train
  Variables:
    card_id (dtype: index)
    first_active_month (dtype: datetime_time_index)
    target (dtype: numeric)
    feature_1 (dtype: ordinal)
    feature_2 (dtype: ordinal)
    feature_3 (dtype: ordinal)
  Shape:
    (Rows: 2, Columns: 6)

In [30]:
temp_es = es.entity_from_dataframe('test', temp_train, 'card_id', time_index='first_active_month',
                                        variable_types=train_cat_cols)

In [31]:
new_r1 = ft.Relationship( temp_es['train']['card_id'],temp_es['temp_hist_trans']['card_id'])
new_r2 = ft.Relationship( temp_es['train']['card_id'],temp_es['temp_new_trans']['card_id'])
new_r3 = ft.Relationship( temp_es['test']['card_id'],temp_es['temp_hist_trans']['card_id'])
new_r4 = ft.Relationship( temp_es['test']['card_id'],temp_es['temp_new_trans']['card_id'])

In [32]:
for r in [new_r1, new_r2, new_r3, new_r4]:
    
    temp_es = temp_es.add_relationship(r)

In [33]:
temp_es

Entityset: None
  Entities:
    temp_hist_trans [Rows: 1028, Columns: 21]
    temp_new_trans [Rows: 68, Columns: 21]
    train [Rows: 2, Columns: 6]
    test [Rows: 2, Columns: 6]
  Relationships:
    temp_hist_trans.card_id -> train.card_id
    temp_new_trans.card_id -> train.card_id
    temp_hist_trans.card_id -> test.card_id
    temp_new_trans.card_id -> test.card_id

In [34]:
default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "median",
                           "percent_true", "num_unique", "mode", "avg_time_between", "time_since_last",
                           "percent_true", "trend"]
default_trans_primitives =  ["day", "month","weekday","days_since", "percentile", "hour"]

In [35]:
temp_es['temp_hist_trans']['authorized_flag'].interesting_values = [1]
temp_es['temp_hist_trans']['purchased_on_weekend'].interesting_values = [1]
temp_es['temp_new_trans']['purchased_on_weekend'].interesting_values = [1]

In [36]:
feature_matrix, feature_names = ft.dfs(entityset = temp_es, target_entity = 'train',
                                       trans_primitives = default_trans_primitives,
                                       agg_primitives=default_agg_primitives,
                                       ignore_entities=['test'],
                                       where_primitives = ['mean', 'sum'],
                                        max_depth = 2, features_only=False, verbose = True)

pd.options.display.max_columns = 1700
pd.options.display.max_rows = 1700

Built 601 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks


In [37]:
feature_matrix.T

card_id,C_ID_92a2005557,C_ID_ae86f7d5fb
target,-0.820283,-0.563362
feature_1,5,5
feature_2,2,2
feature_3,1,1
SUM(temp_hist_trans.installments),4,4
SUM(temp_hist_trans.month_lag),-1017,-308
SUM(temp_hist_trans.purchase_amount),-165.969,-74.8775
SUM(temp_hist_trans.purchased_on_weekday),170,84
SUM(temp_hist_trans.month_diff),2951,1933
SUM(temp_hist_trans.purchase_date_successive_diff),2.0978e+07,1.45427e+07


In [39]:
feature_matrix.target

card_id
C_ID_92a2005557   -0.820283
C_ID_ae86f7d5fb   -0.563362
Name: target, dtype: float64

In [37]:
def additional_feats2(hist_df):
    hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']
    #Christmas : December 25 2017
    hist_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Mothers Day: May 14 2017
    hist_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #fathers day: August 13 2017
    hist_df['fathers_day_2017']=(pd.to_datetime('2017-08-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    hist_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Valentine's Day : 12th June, 2017
    hist_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    hist_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #2018
    #Mothers Day: May 13 2018
    hist_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    return hist_df

In [38]:
temp_hist_trans.head()

Unnamed: 0,index,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchased_on_weekend,purchased_on_weekday,month_diff,purchase_date_successive_diff,duration,amount_month_ratio
9935,0,1,C_ID_2223b33279,344,0,0.0,0,415,M_ID_d5b5d05087,-10,-0.716855,2017-01-28 19:45:48,2.0,18,18,1,0,14,17478.0,-10.035967,-0.051204
9936,1,1,C_ID_2223b33279,344,0,0.0,0,574,M_ID_59ddc41791,-8,-0.731881,2017-03-16 16:42:42,2.0,18,22,0,1,14,87040.0,-10.246338,-0.052277
9937,2,1,C_ID_2223b33279,101,0,0.0,0,307,M_ID_54718748ed,-4,-0.626696,2017-07-18 10:06:57,4.0,4,19,0,1,14,1470.0,-8.77374,-0.044764
9938,3,1,C_ID_2223b33279,107,0,0.0,0,307,M_ID_5b87a162da,-4,-0.596643,2017-07-26 19:16:39,4.0,4,19,0,1,14,2686.0,-8.352998,-0.042617
9939,4,1,C_ID_2223b33279,344,0,0.0,0,574,M_ID_59ddc41791,-2,-0.496416,2017-09-19 16:39:37,2.0,18,22,0,1,14,247222.0,-6.949822,-0.035458


In [39]:
hist_trans2, new_hist_trans2 = [additional_feats2(df) for df in [temp_hist_trans, temp_new_trans]]

In [40]:
dfs2 = [hist_trans2, new_hist_trans2]

In [41]:
def aggregate_by_card_id(df):
    aggs = {}
    aggs['Christmas_Day_2017'] = ['mean']
    aggs['Mothers_Day_2017'] = ['mean']
    aggs['fathers_day_2017'] = ['mean']
    aggs['Children_day_2017'] = ['mean']
    aggs['Valentine_Day_2017'] = ['mean']
    aggs['Black_Friday_2017'] = ['mean']
    aggs['Mothers_Day_2018'] = ['mean']
    aggs['price'] = ['sum','mean','max','min','var', 'skew']
    aggs['purchase_date'] = ['max','min']
    aggs['card_id'] = ['size']
    aggs['category_1'] = ['sum', 'mean']
    aggs['category_2'] = ['mean']
    aggs['category_3'] = ['mean']
    new_df = df.groupby(['card_id']).agg(aggs)
    new_df.columns = ['_'.join(col).strip() for col in new_df.columns.values]
    new_df.reset_index(inplace=True)
    for col in ['category_2','category_3', 'subsector_id', 'state_id']:
        df[col+'_mean'] = df.groupby([col])['purchase_amount'].transform('mean')
        df[col+'_min'] = df.groupby([col])['purchase_amount'].transform('min')
        df[col+'_max'] = df.groupby([col])['purchase_amount'].transform('max')
        df[col+'_sum'] = df.groupby([col])['purchase_amount'].transform('sum')
        aggs[col+'_mean'] = ['mean']

    other_df = (df.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))
    
    new_df = pd.merge(other_df, new_df, on='card_id', how='left')

    new_df['purchase_date_diff'] = (new_df['purchase_date_max'] - new_df['purchase_date_min']).dt.days
    new_df['purchase_date_average'] = new_df['purchase_date_diff']/new_df['card_id_size']
    new_df['purchase_date_uptonow'] = (datetime.datetime.today() - new_df['purchase_date_max']).dt.days
    new_df['purchase_date_uptomin'] = (datetime.datetime.today() - new_df['purchase_date_min']).dt.days
    new_df['inverse_avg_transactions_per_day'] = new_df['purchase_date_diff']/new_df['card_id_size']
    new_df['days_since_last_transaction'] = (datetime.datetime.today() - new_df['purchase_date_max']).dt.days
    return new_df

In [42]:
gc.collect()

14

In [43]:
%time hist_trans_agg, new_hist_trans_agg = [aggregate_by_card_id(df) for df in dfs2]

CPU times: user 72.9 ms, sys: 0 ns, total: 72.9 ms
Wall time: 73.2 ms


In [44]:
hist_trans_agg.shape, new_hist_trans_agg.shape

((3, 28), (3, 28))

In [45]:
feature_matrix.shape

(2, 601)

In [46]:
new_hist_trans_agg

Unnamed: 0,card_id,transactions_count,Christmas_Day_2017_mean,Mothers_Day_2017_mean,fathers_day_2017_mean,Children_day_2017_mean,Valentine_Day_2017_mean,Black_Friday_2017_mean,Mothers_Day_2018_mean,price_sum,price_mean,price_max,price_min,price_var,price_skew,purchase_date_max,purchase_date_min,card_id_size,category_1_sum,category_1_mean,category_2_mean,category_3_mean,purchase_date_diff,purchase_date_average,purchase_date_uptonow,purchase_date_uptomin,inverse_avg_transactions_per_day,days_since_last_transaction
0,C_ID_2223b33279,21,12.761905,0,0,0,0,0,0.0,-inf,-inf,-inf,-inf,,,2018-01-25 12:23:25,2017-12-01 12:03:54,21,0,0,1.809524,0,55,2.619048,377,432,2.619048,377
1,C_ID_92a2005557,23,0.0,0,0,0,0,0,41.73913,-inf,-inf,-inf,-inf,,,2018-04-29 11:23:05,2018-03-05 14:04:36,23,0,0,1.0,0,54,2.347826,283,338,2.347826,283
2,C_ID_ae86f7d5fb,24,4.25,0,0,0,0,0,0.0,,,inf,-inf,,,2018-01-26 10:02:57,2017-12-02 16:32:02,24,0,0,2.833333,0,54,2.25,376,431,2.25,376


In [47]:
def add_extra_cols_on_agg(df):
    df['repurchase_merchant_rate_hist'] = df['COUNT(temp_hist_trans)']/df['NUM_UNIQUE(temp_hist_trans.merchant_id)']
    df['repurchase_merchant_rate_new'] = df['COUNT(temp_new_trans)']/df['NUM_UNIQUE(temp_new_trans.merchant_id)']

    df['merchant_category_repurchase_new'] = df['NUM_UNIQUE(temp_new_trans.merchant_category_id)']/df['NUM_UNIQUE(temp_new_trans.merchant_id)']
    df['merchant_category_repurchase_hist'] = df['NUM_UNIQUE(temp_hist_trans.merchant_category_id)']/df['NUM_UNIQUE(temp_hist_trans.merchant_id)']

    df['avg_spend_per_merchant_hist'] = df['SUM(temp_hist_trans.purchase_amount)']/df['NUM_UNIQUE(temp_hist_trans.merchant_id)']
    df['avg_spend_per_merchant_new'] = df['SUM(temp_new_trans.purchase_amount)']/df['NUM_UNIQUE(temp_new_trans.merchant_id)']

    df['avg_trans_per_merchant_new'] = df['COUNT(temp_new_trans)']/df['NUM_UNIQUE(temp_new_trans.merchant_id)']
    df['avg_trans_per_merchant_hist'] = df['COUNT(temp_hist_trans)']/df['NUM_UNIQUE(temp_hist_trans.merchant_id)']

    df['avg_spend_per_transaction_hist'] = df['SUM(temp_hist_trans.purchase_amount)']/df['COUNT(temp_hist_trans)']
    df['avg_spend_per_transaction_new'] = df['SUM(temp_new_trans.purchase_amount)']/df['COUNT(temp_new_trans)']
    return df

In [48]:
df_fm = add_extra_cols_on_agg(feature_matrix)

In [49]:
df_fm.shape

(2, 611)

### Interactions in test&train dfs

In [4]:
l_es = ft.EntitySet()

In [5]:
 test['feature_3'] = test['feature_3'].map({0:1, 1:2})

In [6]:
test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,2
1,2017-01,C_ID_130fd0cbdd,2,3,1
2,2017-08,C_ID_b709037bc5,5,1,2
3,2017-12,C_ID_d27d835a9f,2,1,1
4,2015-12,C_ID_2b5e3df5c2,5,1,2


In [10]:
# lala_cols = {}
# lala_cols['feature_3'] = ft.variable_types.variable.Boolean

In [11]:
# lala_cols

{'feature_3': featuretools.variable_types.variable.Boolean}

In [27]:
lala_es = l_es.entity_from_dataframe('train', temp_train, 'card_id', time_index='first_active_month')

NameError: name 'temp_train' is not defined

In [101]:
test.shape

(123623, 5)

In [7]:
lala_es = l_es.entity_from_dataframe('test', test, 'card_id', time_index='first_active_month')

In [8]:
lala_es['test']

Entity: test
  Variables:
    card_id (dtype: index)
    first_active_month (dtype: datetime_time_index)
    feature_1 (dtype: numeric)
    feature_2 (dtype: numeric)
    feature_3 (dtype: numeric)
  Shape:
    (Rows: 123623, Columns: 5)

In [22]:
# lala_es['test']['feature_3'].interesting_values = [1]

In [11]:
lala_feature_matrix, lala_feature_names = ft.dfs(entityset = lala_es, target_entity = 'test',
                                       trans_primitives = default_trans_primitives,
                                       agg_primitives=default_agg_primitives,
                                       where_primitives = ['mean', 'sum'],
                                        max_depth = 3, features_only=False, verbose = True)


Built 1426 features
Elapsed: 00:18 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [31]:
DataFrameSummary(test).summary()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
count,,,123623,123623,123623
mean,,,3.10926,1.7418,1.56438
std,,,1.18911,0.749195,0.49584
min,,,1,1,1
25%,,,2,1,1
50%,,,3,2,2
75%,,,4,2,2
max,,,5,3,2
counts,123622,123623,123623,123623,123623
uniques,75,123623,5,3,2


In [32]:
lala_feature_matrix.head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3,DAYS_SINCE(first_active_month),feature_1 + feature_3,feature_1 + feature_2,feature_2 + feature_3,feature_3 % feature_1,feature_3 % feature_2,feature_1 % feature_3,...,feature_1 % feature_2 + feature_3 * feature_2 + feature_3 % feature_1 + feature_3,feature_2 + feature_3 % feature_1 + feature_3 * feature_3 % feature_1 + feature_3,DAYS_SINCE(first_active_month) % feature_1 * feature_3 % DAYS_SINCE(first_active_month),DAYS_SINCE(first_active_month) % feature_1 + feature_3 * feature_2 + feature_3 % feature_1 + feature_3,feature_1 % DAYS_SINCE(first_active_month) * feature_1 % feature_2,feature_2 + feature_3 % feature_1 + feature_3 * feature_3,feature_2 % feature_2 + feature_3 * feature_3 % feature_2 + feature_3,feature_2 % feature_1 + feature_2 * feature_3 % DAYS_SINCE(first_active_month),feature_1 % feature_1 + feature_3 * feature_2 + feature_3 % feature_1 + feature_3,DAYS_SINCE(first_active_month) % feature_1 + feature_2 * feature_3 % feature_1 + feature_3
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_0001238066,4.0,1.0,1.0,522.0,5.0,5.0,2.0,1.0,0.0,0.0,...,0.0,2.0,2.0,4.0,0.0,2.0,1.0,1.0,8.0,2.0
C_ID_0001793786,5.0,2.0,2.0,765.0,7.0,7.0,4.0,2.0,0.0,1.0,...,4.0,8.0,0.0,8.0,5.0,8.0,4.0,4.0,20.0,4.0
C_ID_00024e244b,2.0,2.0,1.0,918.0,3.0,4.0,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0
C_ID_0002709b5a,5.0,1.0,2.0,1284.0,7.0,6.0,3.0,2.0,0.0,1.0,...,6.0,6.0,8.0,9.0,0.0,6.0,2.0,2.0,15.0,0.0
C_ID_000298032a,2.0,2.0,1.0,796.0,3.0,4.0,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0


In [10]:
default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "median",
                           "percent_true", "num_unique", "mode", "avg_time_between", "time_since_last",
                           "percent_true", "trend"]
default_trans_primitives =  ["days_since", "add", "mod", "multiply"]

In [86]:
ft.list_primitives()

Unnamed: 0,name,type,description
0,all,aggregation,Test if all values are 'True'.
1,any,aggregation,Test if any value is 'True'.
2,mode,aggregation,Finds the most common element in a categorical...
3,last,aggregation,Returns the last value.
4,mean,aggregation,Computes the average value of a numeric feature.
5,std,aggregation,Finds the standard deviation of a numeric feat...
6,num_true,aggregation,Finds the number of 'True' values in a boolean.
7,sum,aggregation,Sums elements of a numeric or boolean feature.
8,skew,aggregation,Computes the skewness of a data set.
9,avg_time_between,aggregation,Computes the average time between consecutive ...


In [33]:
from featuretools import selection

# Remove features with only one unique value
feature_matrix2 = selection.remove_low_information_features(lala_feature_matrix)

print('Removed %d features' % (lala_feature_matrix.shape[1]- feature_matrix2.shape[1]))


Removed 0 features


In [34]:
feature_matrix2.shape

(123623, 1426)

In [35]:
df_train_columns = lala_feature_matrix.columns.values

In [37]:
from scipy.stats import ks_2samp
list_p_value =[]

for i in tqdm(df_train_columns):
    list_p_value.append(ks_2samp(lala_feature_matrix[i] , lala_feature_matrix[i])[1])

Se = pd.Series(list_p_value, index = df_train_columns).sort_values() 
list_discarded = list(Se[Se < .1].index)

100%|██████████| 1426/1426 [00:23<00:00, 59.95it/s]


In [38]:
list_discarded

[]

In [12]:
correlation = lala_feature_matrix.corr()

In [17]:
correlation.to_feather('correlation')

ValueError: feather does not support serializing <class 'pandas.core.indexes.base.Index'> for the index; you can .reset_index()to make the index into column(s)

In [14]:
correlation.head()

Unnamed: 0,feature_1,feature_2,feature_3,DAYS_SINCE(first_active_month),feature_2 + feature_3,feature_1 + feature_2,feature_1 + feature_3,feature_1 % feature_3,feature_1 % feature_2,feature_3 % feature_2,...,feature_1 + feature_2 % feature_1 + feature_3 * feature_2 % feature_2 + feature_3,feature_1 + feature_2 % feature_3 * feature_2 + feature_3 % feature_1 + feature_2,feature_2 + feature_3 % feature_1 + feature_2 * feature_3,feature_1 + feature_2 % feature_1 * feature_3 % feature_2 + feature_3,DAYS_SINCE(first_active_month) % feature_3 * feature_2 + feature_3 % feature_1 + feature_2,feature_1 % feature_1 + feature_2 * feature_1 + feature_2 % feature_1,feature_1 + feature_2 % feature_2 + feature_3 * feature_1 + feature_3 % feature_1,feature_1 + feature_2 % feature_2 + feature_3 * feature_2 % feature_3,feature_2 + feature_3 % feature_1 + feature_3 * feature_2 + feature_3 % feature_2,feature_2 % feature_1 + feature_3 * feature_2 + feature_3
feature_1,1.0,-0.132061,0.58279,0.113843,0.20543,0.826459,0.964794,0.58279,0.038619,-0.250381,...,0.339106,0.291089,0.487026,0.565775,0.262945,0.731561,0.341292,-0.051689,0.097805,0.137869
feature_2,-0.132061,1.0,0.065344,0.155899,0.844932,0.448923,-0.081353,0.065344,0.448912,0.816185,...,0.018942,0.173283,0.447172,-0.136113,0.162768,-0.088647,0.289003,0.196103,0.282098,0.759679
feature_3,0.58279,0.065344,1.0,0.195187,0.588942,0.562479,0.775998,1.0,0.094626,-0.078709,...,0.288448,0.441731,0.88361,0.52947,0.519518,0.330609,0.413546,0.494299,-0.328349,0.423931
DAYS_SINCE(first_active_month),0.113843,0.155899,0.195187,1.0,0.230888,0.191172,0.151535,0.195187,-0.194043,0.279974,...,0.188276,-0.10514,0.243634,-0.140469,0.141321,-0.123425,-0.035912,0.219171,-0.053573,0.270002
feature_2 + feature_3,0.20543,0.844932,0.588942,0.230888,1.0,0.665085,0.350062,0.588942,0.414298,0.618844,...,0.169955,0.377119,0.8358,0.173567,0.410299,0.105418,0.455734,0.423779,0.052471,0.842504


In [15]:
correlation.index.name = 'Variable'

In [16]:
correlation.head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3,DAYS_SINCE(first_active_month),feature_2 + feature_3,feature_1 + feature_2,feature_1 + feature_3,feature_1 % feature_3,feature_1 % feature_2,feature_3 % feature_2,...,feature_1 + feature_2 % feature_1 + feature_3 * feature_2 % feature_2 + feature_3,feature_1 + feature_2 % feature_3 * feature_2 + feature_3 % feature_1 + feature_2,feature_2 + feature_3 % feature_1 + feature_2 * feature_3,feature_1 + feature_2 % feature_1 * feature_3 % feature_2 + feature_3,DAYS_SINCE(first_active_month) % feature_3 * feature_2 + feature_3 % feature_1 + feature_2,feature_1 % feature_1 + feature_2 * feature_1 + feature_2 % feature_1,feature_1 + feature_2 % feature_2 + feature_3 * feature_1 + feature_3 % feature_1,feature_1 + feature_2 % feature_2 + feature_3 * feature_2 % feature_3,feature_2 + feature_3 % feature_1 + feature_3 * feature_2 + feature_3 % feature_2,feature_2 % feature_1 + feature_3 * feature_2 + feature_3
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
feature_1,1.0,-0.132061,0.58279,0.113843,0.20543,0.826459,0.964794,0.58279,0.038619,-0.250381,...,0.339106,0.291089,0.487026,0.565775,0.262945,0.731561,0.341292,-0.051689,0.097805,0.137869
feature_2,-0.132061,1.0,0.065344,0.155899,0.844932,0.448923,-0.081353,0.065344,0.448912,0.816185,...,0.018942,0.173283,0.447172,-0.136113,0.162768,-0.088647,0.289003,0.196103,0.282098,0.759679
feature_3,0.58279,0.065344,1.0,0.195187,0.588942,0.562479,0.775998,1.0,0.094626,-0.078709,...,0.288448,0.441731,0.88361,0.52947,0.519518,0.330609,0.413546,0.494299,-0.328349,0.423931
DAYS_SINCE(first_active_month),0.113843,0.155899,0.195187,1.0,0.230888,0.191172,0.151535,0.195187,-0.194043,0.279974,...,0.188276,-0.10514,0.243634,-0.140469,0.141321,-0.123425,-0.035912,0.219171,-0.053573,0.270002
feature_2 + feature_3,0.20543,0.844932,0.588942,0.230888,1.0,0.665085,0.350062,0.588942,0.414298,0.618844,...,0.169955,0.377119,0.8358,0.173567,0.410299,0.105418,0.455734,0.423779,0.052471,0.842504


In [19]:
correlation.reset_index().to_feather('correlation')

In [20]:
threshold = 0.9

correlated_pairs = {}

# Iterate through the columns
for col in correlation:
    # Find correlations above the threshold
    above_threshold_vars = [x for x in list(correlation.index[correlation[col] > threshold]) if x != col]
    correlated_pairs[col] = above_threshold_vars

In [31]:
correlation['feature_1'].sort_values(ascending=True)[:100]

Variable
DAYS_SINCE(first_active_month) + feature_1 % DAYS_SINCE(first_active_month) + feature_2         -0.322750
DAYS_SINCE(first_active_month) + feature_3 % DAYS_SINCE(first_active_month) + feature_2         -0.280467
feature_3 % feature_2                                                                           -0.250381
feature_2 + feature_3 % feature_2                                                               -0.250381
feature_1 % feature_2 * feature_3 % feature_2                                                   -0.236594
feature_1 + feature_2 % feature_2 * feature_3 % feature_2                                       -0.236594
feature_1 + feature_2 % feature_2 * feature_2 + feature_3 % feature_2                           -0.236594
feature_1 % feature_2 * feature_2 + feature_3 % feature_2                                       -0.236594
feature_1 + feature_2 % feature_1 + feature_3 * feature_3 % feature_2                           -0.220574
feature_1 + feature_2 % feature_1 + f

In [33]:
corr_matrix = correlation.abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [35]:
len(to_drop)

1189

In [36]:
to_drop

['feature_1 + feature_3',
 'feature_1 % feature_3',
 'feature_1 * feature_3',
 'feature_2 * feature_3',
 'DAYS_SINCE(first_active_month) + feature_1',
 'DAYS_SINCE(first_active_month) + feature_3',
 'DAYS_SINCE(first_active_month) + feature_2',
 'feature_2 + feature_3 % feature_2',
 'feature_1 + feature_2 % feature_2',
 'feature_3 % feature_2 + feature_3',
 'feature_1 % feature_1 + feature_2',
 'feature_1 % feature_1 + feature_3',
 'feature_3 % feature_1 + feature_2',
 'feature_1 + feature_2 % DAYS_SINCE(first_active_month)',
 'feature_2 % feature_1 + feature_2',
 'feature_1 + feature_3 % feature_3',
 'feature_2 % feature_2 + feature_3',
 'feature_1 + feature_2 % feature_1',
 'feature_1 % DAYS_SINCE(first_active_month)',
 'feature_2 + feature_3 % feature_3',
 'feature_3 % feature_1 + feature_3',
 'feature_3 % DAYS_SINCE(first_active_month)',
 'feature_1 + feature_3 % feature_1',
 'feature_1 + feature_3 % DAYS_SINCE(first_active_month)',
 'feature_2 + feature_3 % DAYS_SINCE(first_active

In [37]:
pruned = lala_feature_matrix.drop(to_drop, axis=1)

In [38]:
pruned.shape

(123623, 237)

In [39]:
pruned.head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3,DAYS_SINCE(first_active_month),feature_2 + feature_3,feature_1 + feature_2,feature_1 % feature_2,feature_3 % feature_2,feature_3 % feature_1,feature_2 % feature_1,...,DAYS_SINCE(first_active_month) % feature_1 * feature_1 + feature_2 % feature_2 + feature_3,feature_1 + feature_3 % feature_1 + feature_2 * feature_2 + feature_3 % feature_1,DAYS_SINCE(first_active_month) % feature_1 * feature_2 + feature_3 % feature_1,DAYS_SINCE(first_active_month) % feature_2 + feature_3 * feature_1 % feature_2 + feature_3,feature_1 + feature_2 % feature_3 * feature_1 + feature_3 % feature_1 + feature_2,DAYS_SINCE(first_active_month) % feature_2 * DAYS_SINCE(first_active_month) % feature_3,feature_1 + feature_2 % feature_1 + feature_3 * feature_1 + feature_3 % feature_1 + feature_2,DAYS_SINCE(first_active_month) % feature_2 + feature_3 * feature_2 + feature_3 % feature_1,DAYS_SINCE(first_active_month) % feature_1 * feature_1 + feature_2 % feature_3,feature_1 + feature_2 % feature_1 + feature_3 * feature_2 % feature_1 + feature_3
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_0001238066,4.0,1.0,1.0,522.0,2.0,5.0,0.0,0.0,1.0,1.0,...,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_ID_0001793786,5.0,2.0,2.0,765.0,4.0,7.0,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,0.0
C_ID_00024e244b,2.0,2.0,1.0,918.0,3.0,4.0,0.0,1.0,1.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0
C_ID_0002709b5a,5.0,1.0,2.0,1284.0,3.0,6.0,0.0,0.0,2.0,1.0,...,0.0,3.0,12.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0
C_ID_000298032a,2.0,2.0,1.0,796.0,3.0,4.0,0.0,1.0,1.0,0.0,...,0.0,3.0,0.0,2.0,0.0,0.0,3.0,1.0,0.0,2.0


### Merge automated & manual aggs

In [50]:
add_datepart(temp_train, 'first_active_month', drop=False)
add_datepart(temp_test, 'first_active_month', drop=False)

In [51]:
new_hist_trans_agg.columns

Index(['card_id', 'transactions_count', 'Christmas_Day_2017_mean',
       'Mothers_Day_2017_mean', 'fathers_day_2017_mean',
       'Children_day_2017_mean', 'Valentine_Day_2017_mean',
       'Black_Friday_2017_mean', 'Mothers_Day_2018_mean', 'price_sum',
       'price_mean', 'price_max', 'price_min', 'price_var', 'price_skew',
       'purchase_date_max', 'purchase_date_min', 'card_id_size',
       'category_1_sum', 'category_1_mean', 'category_2_mean',
       'category_3_mean', 'purchase_date_diff', 'purchase_date_average',
       'purchase_date_uptonow', 'purchase_date_uptomin',
       'inverse_avg_transactions_per_day', 'days_since_last_transaction'],
      dtype='object')

In [52]:
def add_additional_aggs3(df):
    df['purchase_amount_ptp_new'] = df['MAX(temp_new_trans.purchase_amount)'] - df['MIN(temp_new_trans.purchase_amount)']
    df['purchase_amount_ptp_hist'] = df['MAX(temp_hist_trans.purchase_amount)'] - df['MIN(temp_hist_trans.purchase_amount)']
    df['card_id_total'] = df['COUNT(temp_hist_trans)'] + df['COUNT(temp_new_trans)']
    df['card_id_count_ratio'] = df['COUNT(temp_hist_trans)'] / df['COUNT(temp_new_trans)']
    df['purchase_amount_total'] = df['SUM(temp_new_trans.purchase_amount)']+df['SUM(temp_hist_trans.purchase_amount)']
    df['purchase_amount_ratio'] = df['SUM(temp_hist_trans.purchase_amount)'] / df['SUM(temp_new_trans.purchase_amount)']
    df['purchase_amount_mean_total'] = df['MEAN(temp_new_trans.purchase_amount)']+df['MEAN(temp_hist_trans.purchase_amount)']
    # redo with actual min & max
    df['purchase_amount_max_total'] = df['MAX(temp_new_trans.purchase_amount)']+df['MAX(temp_hist_trans.purchase_amount)']
    df['purchase_amount_min_total'] = df['MIN(temp_new_trans.purchase_amount)']+df['MIN(temp_hist_trans.purchase_amount)']
    df['avg_spend_per_transaction'] = df['purchase_amount_total']/df['card_id_total']
    df['month_diff_mean_total'] = df['MEAN(temp_new_trans.month_diff)']+df['MEAN(temp_hist_trans.month_diff)']
    df['month_diff_mean_ratio'] = df['MEAN(temp_hist_trans.month_diff)'] / df['MEAN(temp_new_trans.month_diff)']

    df['month_lag_ratio'] = df['SUM(temp_hist_trans.month_lag)'] / df['SUM(temp_new_trans.month_lag)']
    df['month_lag_sum'] = df['SUM(temp_hist_trans.month_lag)'] + df['SUM(temp_new_trans.month_lag)']
    df['month_lag_mean_total'] = df['MEAN(temp_new_trans.month_lag)']+df['MEAN(temp_hist_trans.month_lag)']
    # redo with actual min & max
    df['month_lag_max_total'] = df['MAX(temp_new_trans.month_lag)']+df['MAX(temp_hist_trans.month_lag)']
    df['month_lag_min_total'] = df['MIN(temp_new_trans.month_lag)']+df['MIN(temp_hist_trans.month_lag)']
    
    df['installments_ratio'] = df['SUM(temp_hist_trans.installments)'] / df['SUM(temp_new_trans.installments)']
    df['installments_sum'] = df['SUM(temp_hist_trans.installments)'] + df['SUM(temp_new_trans.installments)']
    df['installments_mean_total'] = df['MEAN(temp_new_trans.installments)']+df['MEAN(temp_hist_trans.installments)']
    # redo with actual min & max
    df['installments_max_total'] = df['MAX(temp_new_trans.installments)']+df['MAX(temp_hist_trans.installments)']
    df['installments_min_total'] = df['MIN(temp_new_trans.installments)']+df['MIN(temp_hist_trans.installments)']
    
    df['duration_sum_ratio'] = df['SUM(temp_hist_trans.duration)'] / df['SUM(temp_new_trans.duration)']
    df['duration_mean_total'] = df['MEAN(temp_new_trans.duration)']+df['MEAN(temp_hist_trans.duration)']
    # redo with actual min & max
    df['duration_max_total'] = df['MAX(temp_new_trans.duration)']+df['MAX(temp_hist_trans.duration)']
    df['duration_min_total'] = df['MIN(temp_new_trans.duration)']+df['MIN(temp_hist_trans.duration)']
    
    df['amount_month_ratio_sum_ratio'] = df['SUM(temp_hist_trans.amount_month_ratio)'] / df['SUM(temp_new_trans.amount_month_ratio)']
    df['amount_month_ratio_mean_total'] = df['MEAN(temp_new_trans.amount_month_ratio)']+df['MEAN(temp_hist_trans.amount_month_ratio)']
    # redo with actual min & max
    df['amount_month_ratio_max_total'] = df['MAX(temp_new_trans.amount_month_ratio)']+df['MAX(temp_hist_trans.amount_month_ratio)']
    df['amount_month_ratio_min_total'] = df['MIN(temp_new_trans.amount_month_ratio)']+df['MIN(temp_hist_trans.amount_month_ratio)']
    
    df['category_1_mean'] = df['category_1_mean']+df['category_1_mean_old']
    df['price_total'] = df['purchase_amount_total'] / df['installments_sum']
    df['price_mean'] = df['purchase_amount_mean_total'] / df['installments_mean_total']
    df['price_max'] = df['purchase_amount_max_total'] / df['installments_max_total']

    df['CLV_new'] = df['COUNT(temp_new_trans)'] * df['SUM(temp_new_trans.purchase_amount)'] / df['MEAN(temp_new_trans.month_diff)']
    df['CLV_hist'] = df['COUNT(temp_hist_trans)'] * df['SUM(temp_hist_trans.purchase_amount)'] / df['MEAN(temp_hist_trans.month_diff)']
    
    df['CLV_ratio'] = df['CLV_hist'] / df['CLV_new']
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days

    df['hist_first_buy'] = (df['purchase_date_min_old'] - df['first_active_month']).dt.days
    df['new_first_buy'] = (df['purchase_date_min'] - df['first_active_month']).dt.days
    df['hist_last_buy'] = (df['purchase_date_max_old'] - df['first_active_month']).dt.days
    df['new_last_buy'] = (df['purchase_date_max'] - df['first_active_month']).dt.days
    df['purchased_before_issue'] = df['hist_first_buy'] < 0
    return df

In [53]:
def join_dfs(left, right, left_on, right_on=None, suffix='_old'):
    if right_on is None: right_on = left_on
    return left.merge(right, how='left', left_on=left_on, right_on=right_on, suffixes=("", suffix))

In [60]:
train_df = join_dfs(temp_train, new_hist_trans_agg, left_on='card_id')

In [61]:
train_df = join_dfs(train_df, hist_trans_agg, left_on='card_id', suffix='_old')

In [62]:
train_df = join_dfs(train_df, feature_matrix, left_on='card_id')

In [63]:
train_df.shape

(2, 684)

In [64]:
def add_days_feature_interaction(df):
    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # datetime features
    df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days

    df['days_feature1'] = df['elapsed_time'] * df['feature_1']
    df['days_feature2'] = df['elapsed_time'] * df['feature_2']
    df['days_feature3'] = df['elapsed_time'] * df['feature_3']

    df['days_feature1_ratio'] = df['feature_1'] / df['elapsed_time']
    df['days_feature2_ratio'] = df['feature_2'] / df['elapsed_time']
    df['days_feature3_ratio'] = df['feature_3'] / df['elapsed_time']
    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum']/3
    df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    df['feature_var'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)
    return df

In [65]:
train_df = add_additional_aggs3(train_df)

In [66]:
train_df.shape

(2, 724)

In [67]:
train_df = add_days_feature_interaction(train_df)

In [68]:
train_df.shape

(2, 736)

In [69]:
test_df = join_dfs(temp_test, new_hist_trans_agg, left_on='card_id')
test_df = join_dfs(test_df, hist_trans_agg, left_on='card_id', suffix='_old')
test_df = join_dfs(test_df, feature_matrix, left_on='card_id')
test_df = add_additional_aggs3(test_df)
test_df = add_days_feature_interaction(test_df)

In [70]:
test_df.shape

(1, 735)

In [72]:
train_df['outliers'] = 0
train_df.loc[train_df['target'] < -30, 'outliers'] = 1
train_df['outliers'].value_counts()

0    2
Name: outliers, dtype: int64

In [73]:
for f in ['feature_1','feature_2','feature_3']:
    order_label = train_df.groupby([f])['outliers'].mean()
    train_df[f] = train_df[f].map(order_label)
    test_df[f] = test_df[f].map(order_label)