# **Big Data Intelligence Project - TMALL Repeat Buyers**
### **Armando Fortes, David Pissarra, Gabriele Oliaro**

In [1]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, DMatrix, train
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd

### Constants and Hyperparameters

In [2]:
# Root folders of the two dataset layouts (trailing slash included so that
# file names can be appended directly).
DATA_DIR_1 = '../data_format1/'
DATA_DIR_2 = '../data_format2/'

# Input files, all taken from the "format1" layout.
TRAIN_PATH = f'{DATA_DIR_1}train_format1.csv'
TEST_PATH = f'{DATA_DIR_1}test_format1.csv'
USER_INFO_PATH = f'{DATA_DIR_1}user_info_format1.csv'
USER_LOG_PATH = f'{DATA_DIR_1}user_log_format1.csv'

# Value of the (re-based) time_stamp column that identifies the Double 11 day.
DOUBLE11_DAY = 184

In [3]:
# Reproducibility: seed shared by every split, solver and model below.
RANDOM_SEED = 42

# Evaluation protocol: hold-out fraction for feature selection and the
# number of folds used for cross-validation.
VALID_SET_SIZE = 0.2
SPLITS = 10

# Feature engineering: number of PCA components appended as features and
# the small constant added to denominators to avoid division by zero.
PCA_COMPONENTS = 5
EPSILON = 1e-10

## **Data Pre-Processing**

### Read CSVs

In [4]:
# Load the labelled pairs, the user profiles, the interaction log and the
# unlabelled test pairs.
df_train = pd.read_csv(TRAIN_PATH)
df_user_info = pd.read_csv(USER_INFO_PATH)
df_user_log = pd.read_csv(USER_LOG_PATH)
# The test file's 'prob' column is dropped here; it is re-created from the
# model predictions when the submission is written.
df_test = pd.read_csv(TEST_PATH).drop(columns='prob')

# Tag each row with its origin so the combined frame can be split again
# once feature engineering is done.
df_train['kind'] = 'train'
df_test['kind'] = 'test'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and stacks the rows in the same order (train first, then test).
df = pd.concat([df_train, df_test])

### Optimize memory usage and Data Cleaning

In [5]:
# Memory footprint of the raw interaction log, expressed in units of 2**30 bytes.
log_bytes = df_user_log.memory_usage().sum()
print(f'{round(log_bytes / 2**30, 2)} GB')

2.86 GB


In [6]:
# Shrink the interaction log by renaming the seller column to match the
# train/test key, re-basing the timestamp and downcasting every column.
df_user_log.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

# Missing brands become 0. The filled column is assigned back explicitly:
# calling fillna(inplace=True) on the df_user_log['brand_id'] selection is a
# chained in-place update, which pandas warns about and which does not modify
# the frame under Copy-on-Write (the int16 cast below would then fail on NaN).
df_user_log['brand_id'] = df_user_log['brand_id'].fillna(0)

# Re-express time_stamp (parsed with the '%m%d' format) as the number of days
# elapsed since the earliest time_stamp value in the log.
first_day = pd.to_datetime(df_user_log['time_stamp'].min(), format='%m%d')
df_user_log['time_stamp'] = (pd.to_datetime(df_user_log['time_stamp'], format='%m%d') - first_day).dt.days

# Narrowest integer types used for each column.
# NOTE(review): assumes every id fits the chosen width -- confirm against the data.
compact_dtypes = {
    'user_id': np.int32,
    'item_id': np.int32,
    'cat_id': np.int16,
    'merchant_id': np.int16,
    'brand_id': np.int16,
    'time_stamp': np.int16,
    'action_type': np.int8,
}
for column, dtype in compact_dtypes.items():
    df_user_log[column] = df_user_log[column].astype(dtype)

In [7]:
# Memory footprint of the interaction log after downcasting, in units of 2**30 bytes.
log_bytes = df_user_log.memory_usage().sum()
print(f'{round(log_bytes / 2**30, 2)} GB')

0.87 GB


In [8]:
# Fill missing demographics and downcast to int8.
# The filled columns are assigned back instead of using fillna(inplace=True)
# on a column selection, which is a chained in-place update that does not
# reach the parent frame under pandas Copy-on-Write.
# NOTE(review): 0 (age_range) and 2 (gender) are presumably the dataset's
# "unknown" codes -- confirm against the data description.
df_user_info['age_range'] = df_user_info['age_range'].fillna(0).astype(np.int8)
df_user_info['gender'] = df_user_info['gender'].fillna(2).astype(np.int8)

## **Feature Engineering**

In [9]:
# Interactions that happened on the Double 11 day, with their own groupings.
# Built before 'time_period' is added so this frame keeps the raw log columns.
double11 = (df_user_log[df_user_log['time_stamp'] == DOUBLE11_DAY]).reset_index(drop=True)
double11_users = double11.groupby('user_id')
double11_merchants = double11.groupby('merchant_id')
double11_users_merchants = double11.groupby(['user_id', 'merchant_id'])

# Bucket the day offset into consecutive 31-day periods.
df_user_log['time_period'] = df_user_log['time_stamp'] // 31

# The GroupBy objects are created only after 'time_period' exists: the
# nunique() features computed from them rename a 'time_period' column, so the
# column must be part of the grouped frame rather than being added to the
# underlying DataFrame after the groupings were already built.
users = df_user_log.groupby('user_id')
merchants = df_user_log.groupby('merchant_id')
users_merchants = df_user_log.groupby(['user_id', 'merchant_id'])

### General counting and ratio features

In [10]:
# One-hot encode the age range into 'age_<code>' indicator columns and attach
# the user profile (indicators plus the remaining df_user_info columns).
# dtype is pinned to uint8: since pandas 2.0 get_dummies defaults to bool, and
# a frame mixing bool with numeric columns converts to an object array in
# DataFrame.to_numpy(), which is how the feature matrix is later handed to the
# models. uint8 is the pre-2.0 default, so older environments are unaffected.
to_merge = pd.get_dummies(df_user_info, prefix='age', columns=['age_range'], dtype=np.uint8)
df = df.merge(to_merge, on='user_id', how='left')

# Number of distinct values of every log column, computed per user, per
# merchant and per (user, merchant) pair.
# Column stems shared by the three groupings; the grouping suffix is appended.
unique_stems = {
    'item_id': 'items',
    'cat_id': 'categories',
    'brand_id': 'brands',
    'time_stamp': 'dates',
    'time_period': 'periods',
    'action_type': 'action_types',
}

# (grouped log, merge key(s), column suffix, grouping-specific extra renames)
unique_specs = [
    (users, 'user_id', 'user', {'merchant_id': 'merchants_user'}),
    (merchants, 'merchant_id', 'merchant', {'user_id': 'userss_merchant'}),
    (users_merchants, ['user_id', 'merchant_id'], 'user_merchant', {}),
]

for grouped, keys, suffix, extra_names in unique_specs:
    new_names = {column: f'{stem}_{suffix}' for column, stem in unique_stems.items()}
    new_names.update(extra_names)
    unique_counts = grouped.nunique().reset_index().rename(columns=new_names)
    df = df.merge(unique_counts, on=keys, how='left')

# Number of actions of each type, computed per user, per merchant and per
# (user, merchant) pair. action_type codes map to: 0 click, 1 cart,
# 2 purchase, 3 favourite.
action_stems = {0: 'clicks', 1: 'carts', 2: 'purchases', 3: 'favourites'}

# (grouped log, merge key(s), column suffix)
count_specs = [
    (users, 'user_id', 'user'),
    (merchants, 'merchant_id', 'merchant'),
    (users_merchants, ['user_id', 'merchant_id'], 'user_merchant'),
]

for grouped, keys, suffix in count_specs:
    # One row per group, one column per action code; absent combinations are 0.
    action_counts = grouped['action_type'].value_counts().unstack(fill_value=0)
    action_counts = action_counts.rename(
        columns={code: f'{stem}_{suffix}' for code, stem in action_stems.items()}
    )
    df = df.merge(action_counts, on=keys, how='left')

action_kinds = ('clicks', 'carts', 'purchases', 'favourites')

# Share of the pair's actions relative to all actions of the user
# ("in_merchant") and relative to all actions received by the merchant
# ("by_user"). EPSILON keeps the denominators non-zero.
for label, denominator_scope in (('in_merchant', 'user'), ('by_user', 'merchant')):
    for kind in action_kinds:
        df[f'{kind}_{label}_ratio_perspective'] = (
            df[f'{kind}_user_merchant'] / (df[f'{kind}_{denominator_scope}'] + EPSILON)
        )

# Share of each action type among all actions of a user, of a merchant and of
# a (user, merchant) pair.
for scope in ('user', 'merchant', 'user_merchant'):
    all_actions = (
        df[f'clicks_{scope}'] + df[f'carts_{scope}']
        + df[f'purchases_{scope}'] + df[f'favourites_{scope}'] + EPSILON
    )
    for kind in action_kinds:
        df[f'{kind}_{scope}_ratio'] = df[f'{kind}_{scope}'] / all_actions

# interval features: span between a user's first and last logged time_stamp
user_time_stamps = users['time_stamp']
activity_span = user_time_stamps.max().sub(user_time_stamps.min()).rename('interval')
df = df.merge(activity_span, how='left', on='user_id')

### Monthly Features

Users

In [11]:
# Per-period action counts of each user, summarised across periods and also
# spread out as one column per (action, period).
user_actions = ['clicks_user', 'carts_user', 'purchases_user', 'favourites_user']

users_time = df_user_log.groupby(['user_id', 'time_period'])

# One row per (user, period) with the number of actions of each type.
period_counts = users_time['action_type'].value_counts().unstack(fill_value=0)
period_counts = period_counts.rename(columns=dict(enumerate(user_actions))).reset_index()

# Summary statistics over the periods in which the user appears in the log.
for stat in ('max', 'mean', 'std', 'median'):
    summary = period_counts.groupby('user_id').agg(stat).drop('time_period', axis=1)
    summary = summary.rename(columns={name: f'{name}_period_{stat}' for name in user_actions})
    if stat == 'std':
        # std is undefined for a single period; treat it as no variation
        summary = summary.fillna(0)
    df = df.merge(summary, on='user_id', how='left')

# Dense (user x period) table: periods without activity are filled with 0.
dense_counts = period_counts.groupby(['user_id', 'time_period']).sum()
dense_counts = dense_counts.unstack().fillna(0).stack().reset_index()
wide_counts = dense_counts.pivot_table(values=user_actions, index='user_id', columns='time_period')
wide_counts.columns = ['_period_'.join(str(part) for part in pair) for pair in wide_counts.columns.values]
df = df.merge(wide_counts, on='user_id', how='left')

Merchants

In [12]:
# Per-period action counts of each merchant, summarised across periods and
# also spread out as one column per (action, period).
merchant_actions = ['clicks_merchant', 'carts_merchant', 'purchases_merchant', 'favourites_merchant']

merchants_time = df_user_log.groupby(['merchant_id', 'time_period'])

# One row per (merchant, period) with the number of actions of each type.
period_counts = merchants_time['action_type'].value_counts().unstack(fill_value=0)
period_counts = period_counts.rename(columns=dict(enumerate(merchant_actions))).reset_index()

# Summary statistics over the periods in which the merchant appears in the log.
for stat in ('max', 'mean', 'std', 'median'):
    summary = period_counts.groupby('merchant_id').agg(stat).drop('time_period', axis=1)
    summary = summary.rename(columns={name: f'{name}_period_{stat}' for name in merchant_actions})
    if stat == 'std':
        # std is undefined for a single period; treat it as no variation
        summary = summary.fillna(0)
    df = df.merge(summary, on='merchant_id', how='left')

# Dense (merchant x period) table: periods without activity are filled with 0.
dense_counts = period_counts.groupby(['merchant_id', 'time_period']).sum()
dense_counts = dense_counts.unstack().fillna(0).stack().reset_index()
wide_counts = dense_counts.pivot_table(values=merchant_actions, index='merchant_id', columns='time_period')
wide_counts.columns = ['_period_'.join(str(part) for part in pair) for pair in wide_counts.columns.values]
df = df.merge(wide_counts, on='merchant_id', how='left')

Users-Merchants

In [13]:
# Per-period action counts of each (user, merchant) pair, spread out as one
# column per (action, period).
pair_keys = ['user_id', 'merchant_id']
pair_actions = [
    'clicks_user_merchant',
    'carts_user_merchant',
    'purchases_user_merchant',
    'favourites_user_merchant',
]

users_merchants_time = df_user_log.groupby(['user_id', 'merchant_id', 'time_period'])

# One row per (user, merchant, period) with the number of actions of each type.
period_counts = users_merchants_time['action_type'].value_counts().unstack(fill_value=0)
period_counts = period_counts.rename(columns=dict(enumerate(pair_actions))).reset_index()

# Dense (pair x period) table: periods without activity are filled with 0.
dense_counts = period_counts.groupby(pair_keys + ['time_period']).sum()
dense_counts = dense_counts.unstack().fillna(0).stack().reset_index()
wide_counts = dense_counts.pivot_table(values=pair_actions, index=pair_keys, columns='time_period')
wide_counts.columns = ['_period_'.join(str(part) for part in pair) for pair in wide_counts.columns.values]
df = df.merge(wide_counts, on=pair_keys, how='left')

Unnamed: 0_level_0,Unnamed: 1_level_0,carts_user_merchant_period_0,carts_user_merchant_period_1,carts_user_merchant_period_2,carts_user_merchant_period_3,carts_user_merchant_period_4,carts_user_merchant_period_5,clicks_user_merchant_period_0,clicks_user_merchant_period_1,clicks_user_merchant_period_2,clicks_user_merchant_period_3,...,favourites_user_merchant_period_2,favourites_user_merchant_period_3,favourites_user_merchant_period_4,favourites_user_merchant_period_5,purchases_user_merchant_period_0,purchases_user_merchant_period_1,purchases_user_merchant_period_2,purchases_user_merchant_period_3,purchases_user_merchant_period_4,purchases_user_merchant_period_5
user_id,merchant_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,1156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424170,1082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
424170,3469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
424170,3736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
424170,4268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Double11 Features

In [14]:
# Purchases (action_type == 2) made by each user on the Double 11 day.
double11_log = df_user_log[df_user_log['time_stamp'] == DOUBLE11_DAY]
to_merge = (
    double11_log[double11_log['action_type'] == 2]
    .groupby('user_id')
    .size()
    .rename('double11_purchases')
    .reset_index()
)
df = df.merge(to_merge, on='user_id', how='left')

# Users without any Double 11 purchase are absent from the aggregate and would
# get NaN from the left merge; count them as 0 so that no missing values reach
# the PCA step, which rejects NaN input.
df['double11_purchases'] = df['double11_purchases'].fillna(0)

# Share of the user's purchases made on Double 11. EPSILON guards the
# denominator, consistent with every other ratio feature in this notebook.
df['double11_ratio'] = df['double11_purchases'] / (df['purchases_user'] + EPSILON)

### Max, Mean, Standard deviation and Median on user-merchant actions (grouping by users)

In [15]:
# Pair-level action counts that are summarised below (and in the next cell).
clmns = ['clicks_user_merchant', 'carts_user_merchant', 'purchases_user_merchant', 'favourites_user_merchant']

# For every user, summarise the pair-level counts over the rows of df that
# belong to that user; the result is named '<column>_<stat>'.
for stat in ('max', 'mean', 'std', 'median'):
    per_user = df.groupby('user_id')[clmns].agg(stat)
    per_user = per_user.rename(columns={name: f'{name}_{stat}' for name in clmns})
    if stat == 'std':
        # a user with a single row has no defined std; use 0 instead
        per_user = per_user.fillna(0)
    df = df.merge(per_user, on='user_id', how='left')

### Max, Mean, Standard deviation and Median on user-merchant actions (grouping by merchants)

In [16]:
# For every merchant, summarise the pair-level counts over the rows of df
# that belong to that merchant. Output names swap the suffix order, e.g.
# 'clicks_user_merchant' -> 'clicks_merchant_user_<stat>'.
for stat in ('max', 'mean', 'std', 'median'):
    per_merchant = df.groupby('merchant_id')[clmns].agg(stat)
    per_merchant = per_merchant.rename(
        columns={name: name.replace('_user_merchant', f'_merchant_user_{stat}') for name in clmns}
    )
    if stat == 'std':
        # a merchant with a single row has no defined std; use 0 instead
        per_merchant = per_merchant.fillna(0)
    df = df.merge(per_merchant, on='merchant_id', how='left')

### PCA Features

In [22]:
# Names of the columns that still contain at least one missing value.
has_nan = df.isna().any()
has_nan[has_nan].index.tolist()

['label',
 'clicks_user_period_std',
 'carts_user_period_std',
 'purchases_user_period_std',
 'favourites_user_period_std',
 'clicks_merchant_period_std',
 'carts_merchant_period_std',
 'purchases_merchant_period_std',
 'favourites_merchant_period_std']

In [17]:
# PCA features
pca_df = df.drop(['kind', 'label'], axis=1)
pca = PCA(n_components=PCA_COMPONENTS)
pca.fit(pca_df)
df = df.join(pd.DataFrame(pca.transform(pca_df), index=pca_df.index).add_prefix('pca_'))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Finishing Touches

In [None]:
# Split the combined feature table back into its labelled and unlabelled
# parts using the 'kind' tag, which is no longer needed afterwards.
is_train = df['kind'] == 'train'
is_test = df['kind'] == 'test'
df_train = df[is_train].drop(columns='kind')
df_test = df[is_test].drop(columns=['kind', 'label'])

# Feature matrix and target for model training.
y = df_train['label']
X = df_train.drop(columns='label')

In [19]:
# Display the training feature matrix built from df_train (all columns except 'label').
X

NameError: name 'X' is not defined

## **Models**

### Fetch best features
An XGBoost model is first trained on all features. The booster returned by training then reports the information gain contributed by each feature, and only the features that appear in that report are kept for the models below.

In [None]:
# Hold out part of the training data to drive early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=VALID_SET_SIZE, random_state=RANDOM_SEED
)
dtrain = DMatrix(X_train, label=y_train)
dvalid = DMatrix(X_valid, label=y_valid)
watchlist = [(dvalid, 'valid')]

params = dict(
    max_depth=7,
    min_child_weight=200,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.04,
    seed=RANDOM_SEED,
    eval_metric='auc',
)
booster = train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=False,
)

# Keep only the features for which the booster reports a gain score.
gain_by_feature = booster.get_score(importance_type='gain')
best_features = np.array(list(gain_by_feature), dtype=object)
X = X[best_features]

In [None]:
# Display the feature matrix after it was restricted to the best_features columns.
X

### Models Setup

In [None]:
# TODO tuning
# Hyperparameters of each candidate model. A RandomForestClassifier entry
# (oob_score=True, n_estimators=1000, max_depth=10, max_features='sqrt',
# n_jobs=-1) was tried previously and is currently disabled.
catboost_params = dict(
    depth=6,
    learning_rate=0.05,
    iterations=1200,
    eval_metric='AUC',
    random_state=RANDOM_SEED,
    thread_count=8,
    silent=True,
)

lgbm_params = dict(
    n_estimators=2000,
    max_depth=8,
    num_leaves=50,
    learning_rate=0.03,
    reg_lambda=1,
    objective='binary',
    metric=['auc'],
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

xgb_params = dict(
    max_depth=7,
    n_estimators=1000,
    min_child_weight=200,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.04,
    objective='binary:logistic',
    use_label_encoder=False,
    seed=RANDOM_SEED,
)

# model name -> [estimator class, constructor keyword arguments]
models = {
    'CatBoostClassifier': [CatBoostClassifier, catboost_params],
    'LGBMClassifier': [LGBMClassifier, lgbm_params],
    'XGBClassifier': [XGBClassifier, xgb_params],
}

### Training + Cross Validation (10Fold)

In [None]:
# Cross-validate every model in `models` and keep, per model, the list of
# fold AUC scores, the best fold score and the estimator fitted on that fold.
records = {}

# np.asarray converts the DataFrame/Series on the first run and is a no-op on
# arrays, so this cell can be re-executed; the previous X.to_numpy() rebinding
# raised AttributeError on a second run because X was already an ndarray.
X, y = np.asarray(X), np.asarray(y)

for name, (_class, params) in models.items():

    model_records = {'best_score': 0, 'scores': []}

    # Same seed for every model, so all models are scored on identical folds.
    kf = KFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM_SEED)

    for train_index, valid_index in kf.split(X):

        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # Only XGBoost receives extra fit arguments (AUC-based early stopping).
        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so XGBoost's fold scores may be optimistic -- confirm intent.
        fit_params = {}
        if name == 'XGBClassifier':
            fit_params = {
                'eval_metric': 'auc',
                'eval_set': [(X_train, y_train), (X_valid, y_valid)],
                'early_stopping_rounds': 50,
                'verbose': False
            }

        model = _class(**params)
        model.fit(X_train, y_train, **fit_params)

        # AUC of the positive-class probability on the held-out fold.
        fold_score = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])
        model_records['scores'].append(fold_score)
        if fold_score > model_records['best_score']:
            model_records['best_score'] = fold_score
            model_records['best_instance'] = model

    records[name] = model_records

    print(f'% {name} %')
    print('mean score: {0:.4f}'.format(np.mean(model_records['scores'])))
    print('best score: {0:.4f}'.format(model_records['best_score']))

## **Submission**

### Ensemble Predictions from Best Model Instances

#### Weights calculated from the normalized model scores

In [None]:
# Models taking part in the ensemble, in weight order.
ensemble_members = ['CatBoostClassifier', 'LGBMClassifier', 'XGBClassifier']
best_instances = [records[member]['best_instance'] for member in ensemble_members]
best_scores = [records[member]['best_score'] for member in ensemble_members]

# Append an artificial floor slightly below the worst score so that min-max
# scaling does not give the weakest model a weight of exactly zero.
best_scores.append(np.min(best_scores)*0.99)

lowest, highest = np.min(best_scores), np.max(best_scores)
weights = (best_scores-lowest)/(highest-lowest)

# Weighted average of the positive-class probabilities on the test set; the
# floor entry has no matching model, so zip stops after the real members.
test_matrix = df_test[best_features].to_numpy()
prob_submission = np.zeros(df_test.shape[0])
for instance, weight in zip(best_instances, weights):
    prob_submission += instance.predict_proba(test_matrix)[:, 1]*weight
prob_submission = prob_submission/np.sum(weights)

#### Weights calculated manually

In [None]:
# Hand-picked blending weight of each model.
weights = dict(
    CatBoostClassifier=0.3,
    LGBMClassifier=0.1,
    XGBClassifier=0.6,
)

# Weighted sum of the positive-class probabilities predicted on the test set
# by the best instance of every model.
test_matrix = df_test[best_features].to_numpy()
prob_submission = np.zeros(df_test.shape[0])
for name, weight in weights.items():
    member = records[name]['best_instance']
    prob_submission += member.predict_proba(test_matrix)[:, 1]*weight

### Print predictions to CSV

In [None]:
# Pair the first two columns of the test table with the predicted
# probabilities (aligned on the test index) and write them without the index.
prob_column = pd.Series(prob_submission, index=df_test.index, name='prob')
submission = df_test.iloc[:, :2].join(prob_column)
submission.to_csv('submission.csv', index=False)