In [20]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, metrics
# Notebook-wide presentation settings: seaborn's default plot theme, and no
# truncation of DataFrame columns or rows when a frame is displayed.
sns.set()
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [21]:
# Location of the baseline parquet file. Set the BASELINE_DATA_PATH environment
# variable to point at your own copy; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get('BASELINE_DATA_PATH', '/Users/aryamonani/Downloads/baseline_data.pqt')
clicks = pd.read_parquet(DATA_PATH)

In [22]:
# Preview the first three rows. Per the original author's note, the file
# already has some kind of encoding applied; the cells below split the data
# and test further encodings on top of it.
clicks.head(n = 3)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,6,15,13,23
1,110007,35,1,13,10,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7
2,1047,6,1,13,157,2017-11-06 15:42:32,,0,6,15,42,32


BELOW IS A FUNCTION THAT SPLITS THE DATAFRAME INTO TRAIN, VALID, AND TEST SETS

In [27]:
def get_data_split(dataframe, valid_fraction=0.1):
    """Split a frame chronologically into train, validation and test parts.

    The rows are sorted by ``click_time``. The last
    ``int(len(dataframe) * valid_fraction)`` rows become the test set, the
    same number of rows immediately before them become the validation set,
    and everything earlier is the training set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``click_time`` column.
    valid_fraction : float, default 0.1
        Fraction of rows used for validation and, separately, for test.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, in chronological order.
    """
    dataframe = dataframe.sort_values('click_time')
    total_rows = len(dataframe)
    valid_rows = int(total_rows * valid_fraction)

    # Use explicit non-negative boundaries instead of negative slices: when
    # valid_rows is 0, ``frame[:-0]`` is ``frame[:0]`` (empty train) and
    # ``frame[-0:]`` is the whole frame (everything lands in test).
    # Clamping at 0 keeps the behaviour for oversized fractions unchanged.
    valid_start = max(total_rows - 2 * valid_rows, 0)
    test_start = max(total_rows - valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]
    return train, valid, test

BELOW IS A TRAINING FUNCTION THAT ALSO REPORTS THE VALIDATION (AND OPTIONALLY TEST) AUC SCORE

In [28]:
def train_model(train, valid, test = None, feature_cols = None):
    """Train a LightGBM binary classifier and print its AUC scores.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation frames. Both must contain the label column
        ``is_attributed``; the validation frame drives early stopping.
    test : pandas.DataFrame, optional
        When given, the model is also scored on this frame.
    feature_cols : sequence of str, optional
        Columns used as features. Defaults to every column of ``train``
        except ``click_time``, ``attributed_time`` and ``is_attributed``.

    Returns
    -------
    tuple
        ``(bst, valid_score)`` when ``test`` is None, otherwise
        ``(bst, valid_score, test_score)``.
    """
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time', 'is_attributed'])
    dtrain = lgb.Dataset(train[feature_cols], label = train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label = valid['is_attributed'])

    param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'seed': 7}
    num_round = 1000
    print('Training Model')
    # Early stopping and log silencing go through callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train in LightGBM 4.0 and raise TypeError there.
    callbacks = [
        lgb.early_stopping(stopping_rounds = 20, verbose = False),
        lgb.log_evaluation(period = 0),
    ]
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], callbacks = callbacks)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f'Validation AUC Score: {valid_score}')
    if test is not None:
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        print(f'Test AUC Score: {test_score}')
        return bst, valid_score, test_score
    else:
        return bst, valid_score

In [29]:
# Baseline: train on the data as loaded, with no extra encoded features.
print('Baseline Model')
train, valid, test = get_data_split(dataframe = clicks)
# Assigning to `_` keeps the returned tuple out of the cell output.
_ = train_model(train = train, valid = valid)

Baseline Model
Training Model
Validation AUC Score: 0.9622743228943659


COUNT ENCODING

In [33]:
import category_encoders as ce

# Count encoding of the categorical columns. The encoder is fitted on the
# training split only and then applied to both the training and validation
# splits; the encoded columns are joined on with a '_count' suffix.
cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_split(clicks)

count_enc = ce.CountEncoder(cols = cat_features)
count_enc.fit(train[cat_features])

train_counts = count_enc.transform(train[cat_features]).add_suffix('_count')
valid_counts = count_enc.transform(valid[cat_features]).add_suffix('_count')
train_encoded = train.join(train_counts)
valid_encoded = valid.join(valid_counts)

In [34]:
# Score the model with the count-encoded columns added.
_ = train_model(train = train_encoded, valid = valid_encoded)

Training Model
Validation AUC Score: 0.9653051135205329


TARGET ENCODING

In [38]:
# Target encoding of the categorical columns. Note that 'ip' is not in the
# encoded list for this experiment.
cat_features = ['app', 'device', 'os', 'channel']
train, valid, test = get_data_split(clicks)
target_enc = ce.TargetEncoder(cols = cat_features)
# Fit on the training split only, using the training labels.
target_enc.fit(train[cat_features], train['is_attributed'])
# The suffix is '_target': the previous '_count' suffix was carried over from
# the count-encoding cell and mislabelled these columns.
train_t_encoded = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid_t_encoded = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

In [39]:
# Score the model with the target-encoded columns added.
_ = train_model(train = train_t_encoded, valid = valid_t_encoded)

Training Model
Validation AUC Score: 0.9627457957514338


CATBOOST ENCODING

In [42]:
# CatBoost encoding of the categorical columns. The column list is defined
# here explicitly so this cell does not depend on whichever earlier cell last
# assigned `cat_features`.
cat_features = ['app', 'device', 'os', 'channel']
train, valid, test = get_data_split(clicks)
cb_enc = ce.CatBoostEncoder(cols = cat_features, random_state = 7)
# Fit on the training split only, using the training labels.
cb_enc.fit(train[cat_features], train['is_attributed'])
# NOTE(review): the training rows are transformed without passing the target,
# so they are presumably encoded the same way as unseen data rather than with
# the encoder's ordered scheme - confirm this is intended.
train_cb = train.join(cb_enc.transform(train[cat_features]).add_suffix('_cb'))
valid_cb = valid.join(cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

In [None]:
# Score the model with the CatBoost-encoded columns added.
_ = train_model(train = train_cb, valid = valid_cb)

Training Model
