# Module

In [27]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation

In [5]:
data_path = "./data/"

# The train, test and sample-submission CSVs are all read the same way,
# with the "id" column used as the index.
train_df, test_df, sub_df = (
    pd.read_csv(data_path + file_name, index_col="id")
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [6]:
# Show (rows, columns) for the train, test and submission frames on one line.
print(*(frame.shape for frame in (train_df, test_df, sub_df)))

(595212, 58) (892816, 57) (892816, 1)


In [7]:
# Preview the first rows of the training frame (head() with its default row count).
train_df.head()

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


# Feature Engineering

In [9]:
# Stack the train rows on top of the test rows with a fresh positional index,
# then remove the 'target' column so that only feature columns remain.
all_data = pd.concat([train_df, test_df], ignore_index=True).drop(columns='target')
all_features = all_data.columns
all_features

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

## Encoding

In [None]:
# Columns to one-hot encode: those whose name carries the "_cat" marker.
# The marker is matched with its underscore (the same test the remaining-feature
# filter uses) so that the two selections cannot disagree on a column whose name
# merely happens to contain the letters "cat".
cat_features = [col for col in all_features if '_cat' in col]

In [11]:
# Fit the encoder on the cat_features columns of the combined frame, then
# encode those same columns with the fitted encoder.
onehot_encoder = OneHotEncoder().fit(all_data[cat_features])
cat_encoded = onehot_encoder.transform(all_data[cat_features])

## Selecting Features

In [18]:
drop_features = [
    'ps_ind_14',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_car_14',
]

# Keep a column only when its name contains neither "_cat" nor "_calc" and it
# is not listed in drop_features.
remaining_features = [
    name for name in all_features
    if not ('_cat' in name or '_calc' in name or name in drop_features)
]

print(f"Number of remaining features: {len(remaining_features)}")

Number of remaining features: 17


In [19]:
# Display the (rows, columns) of the one-hot encoded matrix.
cat_encoded.shape

(1488028, 184)

In [21]:
# Place the remaining_features columns (converted to CSR) to the left of the
# one-hot encoded columns and keep the combined matrix in CSR format.
all_data_sprs = sparse.hstack(
    [sparse.csr_matrix(all_data[remaining_features]), cat_encoded],
    format='csr',
)
all_data_sprs

<1488028x201 sparse matrix of type '<class 'numpy.float64'>'
	with 37628974 stored elements in Compressed Sparse Row format>

## Split

In [None]:
# all_data was built as train rows followed by test rows, so the first
# len(train_df) rows of the stacked matrix are the training rows.
num_train = len(train_df)

x_train, x_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

y = train_df['target'].values

x_train.shape, x_test.shape, y.shape

((595212, 201), (892816, 201), (595212,))

# Model

## Eval Func

In [23]:
def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The labels are ordered by ascending prediction and their cumulative share
    is compared with an evenly spaced reference line. The same quantity,
    computed with the labels ordered by their own values, is the normalizer,
    so a prediction ordering that sorts the labels ascending scores 1.

    Args:
        y_true: NumPy array of ground-truth labels.
        y_pred: NumPy array of predicted scores with the same shape.

    Returns:
        Ratio of the prediction-ordered Gini sum to the label-ordered one.

    Raises:
        AssertionError: If the two arrays differ in shape.
    """
    assert y_true.shape == y_pred.shape

    count = y_true.shape[0]
    # Reference line: 1/n, 2/n, ..., 1.
    reference = np.linspace(1 / count, 1, count)

    def _gini_sum(ranking_key):
        # Labels sorted ascending by ranking_key, turned into cumulative shares.
        ordered = y_true[ranking_key.argsort()]
        cumulative_share = np.cumsum(ordered) / np.sum(ordered)
        return np.sum(reference - cumulative_share)

    model_sum = _gini_sum(y_pred)
    ideal_sum = _gini_sum(y_true)
    return model_sum / ideal_sum


def gini(preds, dtrain):
    """Adapt ``eval_gini`` to the custom-metric callback passed as ``feval``.

    Args:
        preds: Predictions supplied by the caller.
        dtrain: Object exposing ``get_label()``, used to fetch the true labels.

    Returns:
        The tuple ``('gini', score, True)`` where ``score`` is
        ``eval_gini(labels, preds)``.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score, True

## OOF

In [25]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2025)

params = dict(
    objective='binary',
    learning_rate=0.01,
    force_row_wise=True,
    random_state=2025,
)

# Zero-filled prediction buffers: one slot per training row and per test row.
oof_val_preds = np.zeros(x_train.shape[0])
oof_test_preds = np.zeros(x_test.shape[0])

## LGBM

In [30]:
%%time

for idx, (train_idx, val_idx) in enumerate(folds.split(x_train, y)):
    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = x_train[train_idx], y[train_idx]
    X_val, y_val = x_train[val_idx], y[val_idx]
    
    dtrain = lgb.Dataset(X_train, y_train)
    dval = lgb.Dataset(X_val, y_val)
    
    
    
    lgb_model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=dval,
        feval=gini,
        callbacks=[
            early_stopping(stopping_rounds=100, verbose=True),
            log_evaluation(period=100)
        ]
    )
    
    oof_val_preds[val_idx] += lgb_model.predict(X_val)
    oof_test_preds += lgb_model.predict(x_test) / folds.n_splits
    
    gini_score = eval_gini(y_val, oof_val_preds[val_idx])
    print(f'Fold {idx+1} / {folds.n_splits} - Gini: {gini_score:.4f}')

######################################## Fold 1 / 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.15353	valid_0's gini: 0.247832
[200]	valid_0's binary_logloss: 0.15274	valid_0's gini: 0.260092
[300]	valid_0's binary_logloss: 0.152435	valid_0's gini: 0.265903
[400]	valid_0's binary_logloss: 0.152271	valid_0's gini: 0.269453
[500]	valid_0's binary_logloss: 0.152179	valid_0's gini: 0.271327
[600]	valid_0's binary_logloss: 0.152154	valid_0's gini: 0.271721
Early stopping, best iteration is:
[538]	valid_0's binary_logloss: 0.152158	valid_0's gini: 0.271808
Fold 1 / 5 - Gi

In [32]:
# Score the out-of-fold predictions against all training labels at once.
print(f'OOF Gini: {eval_gini(y, oof_val_preds):.4f}')

OOF Gini: 0.2813
