In [20]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost.callback import EarlyStopping
from lightgbm import early_stopping


# Load Data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Haversine function
def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = (np.sin(dlat / 2.0)**2
         + np.cos(lat1)*np.cos(lat2)*np.sin(dlon / 2.0)**2)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def preprocess_data(data, is_train=True, fraud_counts=None):
    if is_train:
        fraud_counts = data.groupby(['cc_num', 'is_fraud']).size().unstack(fill_value=0).reset_index()
        fraud_counts.columns = ['cc_num', 'is_fraud_0_count', 'is_fraud_1_count']
        fraud_counts['fraud_score'] = (fraud_counts['is_fraud_0_count'] * 10) - (fraud_counts['is_fraud_1_count'] * 50)

    data = data.merge(fraud_counts, on='cc_num', how='left')
    data['trans_datetime'] = pd.to_datetime(data['trans_date'] + ' ' + data['trans_time'])
    data['dob'] = pd.to_datetime(data['dob'], errors='coerce')
    data['age'] = (data['trans_datetime'] - data['dob']).dt.days / 365.25
    data['hour'] = data['trans_datetime'].dt.hour
    data['day'] = data['trans_datetime'].dt.day
    data['month'] = data['trans_datetime'].dt.month
    data['weekday'] = data['trans_datetime'].dt.weekday
    data['haversine_distance'] = haversine(data['lat'], data['long'], data['merch_lat'], data['merch_long'])

    features = [
        'amt', 'gender', 'category', 'job', 'state', 'city_pop',
        'hour', 'day', 'month', 'weekday',
        'age', 'haversine_distance'
    ]

    if is_train:
        features += ['is_fraud']

    data = data[features]

    # Gender map
    gender_map = {'F': 0, 'M': 1}
    data['gender'] = data['gender'].map(gender_map)

    # Target Encoding or Label Encoding for categorical columns
    categorical_cols = ['category', 'job', 'state']
    for col in categorical_cols:
        data[col] = data[col].astype(str)
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

    # Impute missing values
    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=features)

    if is_train:
        return data, fraud_counts
    else:
        return data


# Preprocess train
train_data, fraud_counts = preprocess_data(train_data, is_train=True)
X = train_data.drop('is_fraud', axis=1)
y = train_data['is_fraud']

# Preprocess test
test_data = preprocess_data(test_data, is_train=False, fraud_counts=fraud_counts)

# Ensure test has all columns in X
missing_cols = set(X.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[X.columns]

# Categorical columns for target encoding
# After imputation and mapping, we consider original categorical features
cat_cols = ['category', 'job', 'state']

# Apply Target Encoding on cat cols
te = TargetEncoder(cols=cat_cols)
te.fit(X[cat_cols], y)
X[cat_cols] = te.transform(X[cat_cols])
test_data[cat_cols] = te.transform(test_data[cat_cols])

# StratifiedKFold for out-of-fold predictions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base models
xgb_params = {
    'random_state': 42,
    'n_estimators': 300,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}
lgb_params = {
    'random_state': 42,
    'n_estimators': 300
}
cat_params = {
    'random_state': 42,
    'iterations': 300,
    'verbose': False
}

xgb_model = xgb.XGBClassifier(**xgb_params)
lgb_model = lgb.LGBMClassifier(**lgb_params)
cat_model = CatBoostClassifier(**cat_params)

# Prepare arrays for out-of-fold predictions
oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

callbacks = [EarlyStopping(rounds=50, save_best=True, maximize=False)]

# Train base models with OOF predictions
for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val_ = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val_ = y.iloc[train_idx], y.iloc[val_idx]

    # XGB
    xgb_model.set_params(early_stopping_rounds=50)
    xgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val_, y_val_)],
        verbose=False
    )
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val_)[:, 1]

    # LGB
    callbacks = [early_stopping(stopping_rounds=50, verbose=False)]
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_val_, y_val_)], callbacks=callbacks)
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val_)[:, 1]

    # Cat
    cat_model.fit(X_tr, y_tr, eval_set=(X_val_, y_val_),
                  early_stopping_rounds=50, use_best_model=True)
    oof_cat[val_idx] = cat_model.predict_proba(X_val_)[:, 1]


# Stack model input: OOF predictions as features
stack_train = np.vstack([oof_xgb, oof_lgb, oof_cat]).T

# Fit a meta-classifier (Logistic Regression) on the OOF predictions
meta_model = LogisticRegression(random_state=42)
meta_model.fit(stack_train, y)
# Evaluate on a separate hold-out
# Let's do a final hold-out just for evaluation
X_train_, X_val_, y_train_, y_val_ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Re-train base models on full X_train_ and predict on X_val_ for final evaluation
xgb_model.fit(X_train_, y_train_, eval_set=[(X_val_, y_val_)], verbose=False) # Added eval_set
lgb_model.fit(X_train_, y_train_)
cat_model.fit(X_train_, y_train_)

val_xgb = xgb_model.predict_proba(X_val_)[:,1]
val_lgb = lgb_model.predict_proba(X_val_)[:,1]
val_cat = cat_model.predict_proba(X_val_)[:,1]

stack_val = np.vstack([val_xgb, val_lgb, val_cat]).T
val_pred = (meta_model.predict_proba(stack_val)[:,1] > 0.5).astype(int)

print("Classification Report (Validation):")
print(classification_report(y_val_, val_pred))
print("Confusion Matrix (Validation):")
print(confusion_matrix(y_val_, val_pred))
print("F1 Score (Validation):", f1_score(y_val_, val_pred))

# Disable early stopping for final training
xgb_model.set_params(early_stopping_rounds=None)
xgb_model.fit(X, y)

lgb_model.fit(X, y)
cat_model.fit(X, y)

# Generate predictions for the test set
test_xgb = xgb_model.predict_proba(test_data)[:, 1]
test_lgb = lgb_model.predict_proba(test_data)[:, 1]
test_cat = cat_model.predict_proba(test_data)[:, 1]

# Stack predictions for meta-classifier
test_stack = np.vstack([test_xgb, test_lgb, test_cat]).T
test_pred = (meta_model.predict_proba(test_stack)[:, 1] > 0.5).astype(int)

# Create submission file
submission = pd.DataFrame({'id': sample_submission['id'], 'is_fraud': test_pred})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049486
[LightGBM] [Info] Start training from score -2.049486


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1402
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049486
[LightGBM] [Info] Start training from score -2.049486


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049486
[LightGBM] [Info] Start training from score -2.049486


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33840, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 296563, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114107 -> initscore=-2.049457
[LightGBM] [Info] Start training from score -2.049457


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33839, number of negative: 262724
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1402
[LightGBM] [Info] Number of data points in the train set: 296563, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049490
[LightGBM] [Info] Start training from score -2.049490


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049486
[LightGBM] [Info] Start training from score -2.049486
Classification Report (Validation):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     65681
         1.0       0.99      0.97      0.98      8460

    accuracy                           1.00     74141
   macro avg       0.99      0.98      0.99     74141
weighted avg       1.00      1.00      1.00     74141

Confusion Matrix (Validation):
[[65592    89]
 [  243  8217]]
F1 Score (Valid

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 42299, number of negative: 328404
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 370703, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114105 -> initscore=-2.049481
[LightGBM] [Info] Start training from score -2.049481
Submission file created: submission.csv
