In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
df = pd.read_csv('train.csv')

# Display basic info
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (200000, 202)


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [3]:
# Separate target
target = 'target'
X = df.drop(columns=[target, 'ID_code'])
y = df[target]

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [4]:
def manual_feature_engineering(data):
    data_fe = data.copy()
    
    # Basic stats
    data_fe['sum'] = data.sum(axis=1)
    data_fe['min'] = data.min(axis=1)
    data_fe['max'] = data.max(axis=1)
    data_fe['mean'] = data.mean(axis=1)
    data_fe['std'] = data.std(axis=1)
    data_fe['kurt'] = data.kurtosis(axis=1)
    data_fe['skew'] = data.skew(axis=1)
    
    return data_fe

X_train_manual = manual_feature_engineering(X_train)
X_val_manual = manual_feature_engineering(X_val)

In [6]:
from lightgbm import early_stopping, log_evaluation

lgb_train = lgb.Dataset(X_train_manual, y_train)
lgb_val = lgb.Dataset(X_val_manual, y_val, reference=lgb_train)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

model_lgb = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

preds_lgb = model_lgb.predict(X_val_manual)
auc_lgb = roc_auc_score(y_val, preds_lgb)
print("LightGBM ROC AUC (manual features):", auc_lgb)


Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.925185	valid_1's auc: 0.863834
[200]	training's auc: 0.955917	valid_1's auc: 0.881014
[300]	training's auc: 0.971856	valid_1's auc: 0.885992
[400]	training's auc: 0.982688	valid_1's auc: 0.887175
Early stopping, best iteration is:
[365]	training's auc: 0.979324	valid_1's auc: 0.887241
LightGBM ROC AUC (manual features): 0.8872413294837651


In [7]:
model_cb = CatBoostClassifier(verbose=100, iterations=1000, early_stopping_rounds=50, random_state=42)

model_cb.fit(X_train_manual, y_train, eval_set=(X_val_manual, y_val), use_best_model=True)

preds_cb = model_cb.predict_proba(X_val_manual)[:, 1]
auc_cb = roc_auc_score(y_val, preds_cb)
print("CatBoost ROC AUC (manual features):", auc_cb)

Learning rate set to 0.111001
0:	learn: 0.6024134	test: 0.6024766	best: 0.6024766 (0)	total: 255ms	remaining: 4m 14s
100:	learn: 0.2427616	test: 0.2502064	best: 0.2502064 (100)	total: 6.47s	remaining: 57.6s
200:	learn: 0.2148108	test: 0.2295421	best: 0.2295421 (200)	total: 12.8s	remaining: 51.1s
300:	learn: 0.1978579	test: 0.2199851	best: 0.2199851 (300)	total: 19.4s	remaining: 45.1s
400:	learn: 0.1853332	test: 0.2148243	best: 0.2148243 (400)	total: 26.2s	remaining: 39.1s
500:	learn: 0.1754215	test: 0.2119368	best: 0.2119368 (500)	total: 32.8s	remaining: 32.7s
600:	learn: 0.1669484	test: 0.2100560	best: 0.2100514 (599)	total: 39.4s	remaining: 26.1s
700:	learn: 0.1595212	test: 0.2090169	best: 0.2090169 (700)	total: 45.9s	remaining: 19.6s
800:	learn: 0.1529218	test: 0.2085543	best: 0.2085063 (791)	total: 52.3s	remaining: 13s
900:	learn: 0.1468501	test: 0.2084312	best: 0.2083781 (853)	total: 58.9s	remaining: 6.47s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.2083780

In [9]:
from lightgbm import early_stopping, log_evaluation

lgb_train_raw = lgb.Dataset(X_train, y_train)
lgb_val_raw = lgb.Dataset(X_val, y_val, reference=lgb_train_raw)

model_lgb_raw = lgb.train(
    params,
    lgb_train_raw,
    valid_sets=[lgb_train_raw, lgb_val_raw],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

preds_lgb_raw = model_lgb_raw.predict(X_val)
auc_lgb_raw = roc_auc_score(y_val, preds_lgb_raw)
print("LightGBM ROC AUC (raw features):", auc_lgb_raw)

Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.925107	valid_1's auc: 0.865232
[200]	training's auc: 0.955715	valid_1's auc: 0.881325
[300]	training's auc: 0.971674	valid_1's auc: 0.886506
[400]	training's auc: 0.982614	valid_1's auc: 0.887856
[500]	training's auc: 0.989646	valid_1's auc: 0.887808
Early stopping, best iteration is:
[487]	training's auc: 0.98886	valid_1's auc: 0.88804
LightGBM ROC AUC (raw features): 0.8880400180863333


In [10]:
model_cb_raw = CatBoostClassifier(verbose=100, iterations=1000, early_stopping_rounds=50, random_state=42)

model_cb_raw.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

preds_cb_raw = model_cb_raw.predict_proba(X_val)[:, 1]
auc_cb_raw = roc_auc_score(y_val, preds_cb_raw)
print("CatBoost ROC AUC (raw features):", auc_cb_raw)

Learning rate set to 0.111001
0:	learn: 0.6023568	test: 0.6024884	best: 0.6024884 (0)	total: 111ms	remaining: 1m 50s
100:	learn: 0.2427979	test: 0.2504760	best: 0.2504760 (100)	total: 6.35s	remaining: 56.5s
200:	learn: 0.2148352	test: 0.2298895	best: 0.2298895 (200)	total: 12.8s	remaining: 51s
300:	learn: 0.1979356	test: 0.2203881	best: 0.2203881 (300)	total: 19.3s	remaining: 44.9s
400:	learn: 0.1855955	test: 0.2149860	best: 0.2149860 (400)	total: 25.8s	remaining: 38.5s
500:	learn: 0.1756495	test: 0.2118155	best: 0.2118155 (500)	total: 32.2s	remaining: 32.1s
600:	learn: 0.1672135	test: 0.2098404	best: 0.2098404 (600)	total: 39s	remaining: 25.9s
700:	learn: 0.1596620	test: 0.2087979	best: 0.2087979 (700)	total: 45.8s	remaining: 19.6s
800:	learn: 0.1530722	test: 0.2081835	best: 0.2081719 (797)	total: 52.4s	remaining: 13s
900:	learn: 0.1468125	test: 0.2079369	best: 0.2078842 (894)	total: 59s	remaining: 6.48s
999:	learn: 0.1413233	test: 0.2078769	best: 0.2078026 (960)	total: 1m 5s	remainin

In [11]:
results = pd.DataFrame({
    'Model': ['LightGBM', 'CatBoost', 'LightGBM (Raw)', 'CatBoost (Raw)'],
    'ROC AUC': [auc_lgb, auc_cb, auc_lgb_raw, auc_cb_raw]
})

print(results.sort_values(by='ROC AUC', ascending=False))

            Model   ROC AUC
3  CatBoost (Raw)  0.894776
1        CatBoost  0.894034
2  LightGBM (Raw)  0.888040
0        LightGBM  0.887241
