# Downloading Data

In [None]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

In [None]:
from google.colab import userdata
import json
import os

username = userdata.get('KAGGLE_USER')
key = userdata.get('KAGGLE_KEY')
if not username or not key:
    raise ValueError("Colab secrets KAGGLE_USER and KAGGLE_KEY must both be set")

# Write kaggle.json from Python rather than echoing through the shell: the
# secret is never interpolated into a command line, quoting inside the values
# cannot corrupt the JSON, and the same path is used for creation and chmod.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, "kaggle.json")
# Create the file with owner-only permissions from the start.
fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as fh:
    json.dump({"username": username, "key": key}, fh)
# Also tighten permissions if the file already existed with a looser mode.
os.chmod(kaggle_json, 0o600)

In [None]:
# Download dataset
!kaggle competitions download -c home-credit-credit-risk-model-stability
!unzip /content/home-credit-credit-risk-model-stability.zip && rm -rf /content/home-credit-credit-risk-model-stability.zip

In [None]:
!kaggle competitions download -c home-credit-credit-risk-modeling

In [None]:
# !mkdir hacka
!unzip /content/home-credit-credit-risk-modeling.zip -d /content/hacka
!rm -rf /content/home-credit-credit-risk-modeling.zip

# Import Library

In [None]:
!pip install catboost

In [None]:
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import joblib
import warnings
from sklearn.base import BaseEstimator, RegressorMixin
import lightgbm as lgb
# from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')
# ROOT = '/content/'
# ROOT

# Data Preparation

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

class Pipeline:
    """Stateless polars preprocessing steps, used as ``df.pipe(Pipeline.<step>)``."""

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        ``case_id``/``WEEK_NUM``/``num_group1``/``num_group2`` become Int64,
        ``date_decision`` and names ending in ``D`` become Date, names ending
        in ``P``/``A`` become Float64 and names ending in ``M`` become Utf8.
        All other columns are left untouched.
        """
        casts = []
        for col in df.columns:
            # The id/week check must come first: "WEEK_NUM" also ends in "M".
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision" or col[-1] == "D":
                casts.append(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col[-1] == "M":
                casts.append(pl.col(col).cast(pl.Utf8))
        # One with_columns call instead of one per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its day offset from ``date_decision``.

        Afterwards ``date_decision`` and ``MONTH`` are dropped when present
        (a frame without ``MONTH`` no longer raises).
        """
        date_cols = [col for col in df.columns if col[-1] == "D"]
        if date_cols:
            df = df.with_columns(
                [
                    (pl.col(col) - pl.col("date_decision")).dt.total_days()
                    for col in date_cols
                ]
            )
        droppable = [col for col in ("date_decision", "MONTH") if col in df.columns]
        return df.drop(droppable) if droppable else df

    @staticmethod
    def filter_cols(df):
        """Drop Utf8 columns that are constant or have more than 200 distinct values.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped.  The
        decision is taken from the frame passed in, so two frames filtered
        separately can end up with different columns.
        """
        protected = ("target", "case_id", "WEEK_NUM")
        to_drop = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.Utf8:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > 200:
                to_drop.append(col)
        return df.drop(to_drop) if to_drop else df

In [None]:
class Aggregator:
    """Builds the polars aggregation expressions applied per ``case_id`` group.

    Columns are selected by naming convention (last character of the name, or
    the ``num_group`` substring).  Every family gets ``max_<col>`` and
    ``last_<col>``; numeric (``P``/``A``) and date (``D``) columns also get
    ``mean_<col>``.
    """
    # Please add or subtract features yourself, be aware that too many
    # features will take up too much space.

    @staticmethod
    def _max_last(cols, with_mean=False):
        """Return max, then last, then (optionally) mean expressions for ``cols``."""
        exprs = [pl.max(col).alias(f"max_{col}") for col in cols]
        exprs += [pl.last(col).alias(f"last_{col}") for col in cols]
        if with_mean:
            exprs += [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return exprs

    @staticmethod
    def num_expr(df):
        """max/last/mean expressions for columns whose name ends in ``P`` or ``A``."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        return Aggregator._max_last(cols, with_mean=True)

    @staticmethod
    def date_expr(df):
        """max/last/mean expressions for columns whose name ends in ``D``."""
        cols = [col for col in df.columns if col[-1] in ("D",)]
        return Aggregator._max_last(cols, with_mean=True)

    @staticmethod
    def str_expr(df):
        """max/last expressions for columns whose name ends in ``M``."""
        cols = [col for col in df.columns if col[-1] in ("M",)]
        return Aggregator._max_last(cols)

    @staticmethod
    def other_expr(df):
        """max/last expressions for columns whose name ends in ``T`` or ``L``."""
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        return Aggregator._max_last(cols)

    @staticmethod
    def count_expr(df):
        """max/last expressions for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._max_last(cols)

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression families: numeric, date, string, other, count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [None]:
def read_file(path, depth=None):
    """Load one parquet file and normalise its dtypes.

    When ``depth`` is 1 or 2 the table is additionally collapsed to one row
    per ``case_id`` using the expressions from ``Aggregator.get_exprs``;
    for any other ``depth`` the typed table is returned as is.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    aggregations = Aggregator.get_exprs(table)
    return table.group_by("case_id").agg(aggregations)

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern and stack them.

    Each file goes through ``read_file`` (dtype normalisation plus per-case
    aggregation for depth 1/2).  The chunks are concatenated with
    ``vertical_relaxed`` and de-duplicated on ``case_id``.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # Sort so the concatenation order does not depend on filesystem order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")
    chunks = [read_file(path, depth) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling table from the base table and its side tables.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every table of ``depth_0 + depth_1 +
    depth_2`` on ``case_id`` (clashing names get the suffix ``_<position>``),
    then applies ``Pipeline.handle_dates``.
    """
    calendar_features = {
        "month_decision": pl.col("date_decision").dt.month(),
        "weekday_decision": pl.col("date_decision").dt.weekday(),
    }
    merged = df_base.with_columns(**calendar_features)
    side_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(side_tables):
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{position}")
    return Pipeline.handle_dates(merged)

def to_pandas(df_data, cat_cols=None):
    """Convert ``df_data`` via its ``to_pandas()`` method and mark categoricals.

    If ``cat_cols`` is not given, every ``object`` column of the converted
    frame is used.  Returns ``(frame, cat_cols)`` so the same column list can
    be passed back in for another frame.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Args:
        df: pandas DataFrame; it is modified in place and also returned.
        use_float16: when True (default, the historical behaviour) float
            columns whose range fits are stored as float16, which keeps only
            about three significant decimal digits.  Pass False to stop at
            float32.

    Returns:
        The same DataFrame object with integer columns stored as the
        narrowest signed integer type that holds their min/max, and float
        columns as float16/float32/float64.

    Only plain NumPy integer and float columns are touched.  Category,
    object, bool, datetime and pandas extension dtypes are skipped, so they
    are neither mis-converted to floats nor compared against numeric bounds.
    """
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            # An empty column has no min/max to size against.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Plain Python ints avoid fixed-width comparison pitfalls.
            c_min, c_max = int(c_min), int(c_max)
            for target in int_targets:
                bounds = np.iinfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
            # Values beyond int64 (large uint64) are left unchanged.
        else:
            for target in float_targets:
                bounds = np.finfo(target)
                if bounds.min < c_min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # NaN-only or out-of-range columns keep full precision.
                df[col] = df[col].astype(np.float64)
    return df

## Read Datasets

### Train

In [None]:
# Location of the unzipped competition parquet files for the training split.
ROOT = Path("/content")
TRAIN_DIR = ROOT / "parquet_files" / "train"

# Keys match the parameter names of feature_eng so the dict can be splatted
# into it.  read_files is used where a table is split across several numbered
# parquet files; the trailing 1/2 argument triggers per-case_id aggregation.
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    # depth 0: read without aggregation
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    # depth 1: aggregated per case_id
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    # depth 2: aggregated per case_id
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ]
}

In [None]:
# Join all tables into one polars frame, then free the per-table frames.
df_train = feature_eng(**data_store)
del data_store
gc.collect()
# Drop constant / very-high-cardinality string columns (decided on train data).
df_train = df_train.pipe(Pipeline.filter_cols)
# cat_cols is inferred here from the object columns and reused for the test set.
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
# Every non-category column, including target, case_id and WEEK_NUM.
nums = df_train.select_dtypes(exclude='category').columns
df_train

In [None]:
# Ratio of total debt to main income.  The operands are widened to float32
# first (reduce_mem_usage may have stored them as float16, whose range the
# quotient can exceed) and a zero income is treated as missing, so the feature
# contains NaN rather than +/-inf, which several of the models below reject.
df_train["debt-to-income"] = (
    df_train["totaldebt_9A"].astype(np.float32)
    / df_train["maininc_215A"].astype(np.float32).replace(0, np.nan)
)
df_train

In [None]:
len(df_train[df_train["debt-to-income"] != 0])

In [None]:
from itertools import combinations, permutations

# Group the non-category columns by their number of missing values.
nans_df = df_train[nums].isna()
nans_groups = {}

for col in nums:
    cur_group = nans_df[col].sum()
    nans_groups.setdefault(cur_group, []).append(col)

# Fit the categorical encoder on the raw training categories.  Unknown
# categories are mapped to -1 so this same fitted encoder can later transform
# data containing values never seen in training instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train

In [None]:
len(df_train[df_train['target'] == 1]), len(df_train[df_train['target'] == 0])

#### **downsampling**

In [None]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]
# Number of majority rows to keep: the minority count plus 300000, capped at
# the number available so sample() cannot fail on a smaller dataset.
n_minority = min(len(df_minority) + 300000, len(df_majority))
df_majority_undersampled = df_majority.sample(n=n_minority, random_state=123)
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])
# Shuffle so the two classes are interleaved, and rebuild a clean index.
df_train_balanced = df_train_balanced.sample(frac=1, random_state=888).reset_index(drop=True)
df_train_balanced

In [None]:
len(df_train_balanced[df_train_balanced['target'] == 1]), len(df_train_balanced[df_train_balanced['target'] == 0])

In [None]:
df_train_balanced['debt-to-income'].value_counts()

In [None]:
len(df_train_balanced[df_train_balanced["debt-to-income"] != 0])

#### impute

In [None]:
!pip install annoy

In [None]:
#optimize KNN imputation by annoy
import annoy
# from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

tqdm.pandas(desc="Imputing")


columns_to_impute = ["max_last180dayaveragebalance_704A", "birthdate_574D", "contractssum_5085716L",
                     "max_last30dayturnover_651A", "max_pmts_pmtsoverdue_635A", "max_pmts_date_1107D",
                     "max_num_group1_10", "max_num_group2", "max_num_group1_11", "assignmentdate_238D",
                     "assignmentdate_4527235D", "assignmentdate_4955616D",
                     "weekday_decision", "month_decision", 'lastrejectreason_759M',
            'amtinstpaidbefduel24m_4187115A',
            'price_1097A',
            'pmtssum_45A',
            'totalsettled_863A',
            'maxannuity_159A',
            'pmtaverage_3A',
            'lastcancelreason_561M',
            'lastrejectcredamount_222A',
            'disbursedcredamount_1113A',
            'avgpmtlast12m_4525200A',
            'education_1103M',
            'pmtaverage_4955615A',
            'description_5085714M',
            'currdebt_22A',
            'lastrejectreasonclient_4145040M',
            'credamount_770A',
            'sumoutstandtotal_3546847A',
            'maininc_215A',
            'avginstallast24m_3658937A',
            'pmtaverage_4527227A',
            'maxoutstandbalancel12m_4187113A',
            'annuity_780A',
            'totaldebt_9A',
            'sumoutstandtotalest_4493215A',
            'lastapprcredamount_781A',
            'maxinstallast24m_3658928A',
            'lastrejectcommoditycat_161M',
            'maritalst_385M',
            'downpmt_116A',
            #"classificationofcontr_1114M",
            'maxdebt4_972A',
            'inittransactionamount_650A',
            'lastapprcommoditycat_1041M',
            #'maxinamtstart6m_4525199A',
            'annuitynextmonth_57A',
            'avgoutstandbalancel6m_4187114A',
            #'avgin_121A',
            #'ginamtstart24m_4525187A',
            #'totinstallast12m_4525188A',
            'maxannuity_4075009A',
            'currdebtcredtyperange_828A',
            'maxpmtlast3m_4525190A',
            'maritalst_893M',
            #'previouscontdis_1083A',
            #'district_112M',
            'education_88M',
            #'lastapprcommoditytypec_5251766M',
            'lastotherinc_902A',
            'lastrejectcommodtypec_5251769M',
            #'lastotherincexpense_631A'
            ]

# Create DataFrame with columns to impute
X_impute = df_train[columns_to_impute].copy()


# Function for KNN imputation using Annoy
def impute_with_annoy(column, annoy_index, k=5):
    """Fill the NaN entries of one column from approximate nearest neighbours.

    Args:
        column: pandas Series; position ``i`` must correspond to item ``i``
            of ``annoy_index``.
        annoy_index: object exposing ``get_nns_by_item(item, n)`` that returns
            neighbour item ids; only queried for positions holding NaN.
        k: number of neighbours to average.

    Returns:
        A list with one value per row.  Non-missing values are unchanged;
        each missing value becomes the mean of the non-missing values of its
        neighbours, or stays NaN when none of them has a value.
    """
    values = column.values
    imputed = values.copy()
    for i in np.flatnonzero(np.isnan(values)):
        # Ask for one extra neighbour and drop the row itself by id: an
        # approximate search does not guarantee it is returned first.
        neighbours = [
            j for j in annoy_index.get_nns_by_item(int(i), k + 1) if j != i
        ][:k]
        candidates = values[neighbours]
        candidates = candidates[~np.isnan(candidates)]
        if candidates.size:
            imputed[i] = candidates.mean()
    return list(imputed)


# Build Annoy index
# Euclidean distances are undefined for vectors containing NaN, so the index
# is built on a float32 copy whose gaps are filled with the column mean (0.0
# for columns that are entirely missing).  X_impute keeps its NaNs; they are
# what impute_with_annoy fills.
index_matrix = X_impute.astype(np.float32)
index_matrix = index_matrix.fillna(index_matrix.mean()).fillna(0.0).to_numpy()

t = annoy.AnnoyIndex(X_impute.shape[1], 'euclidean')  # Assuming Euclidean distance
# Iterating the NumPy matrix avoids one DataFrame.iloc lookup per row.
for i, row in enumerate(tqdm(index_matrix, desc="Building Annoy Index")):
    t.add_item(i, row)
t.build(10)  # 10 trees

# KNN Impute with progress_apply and the defined function
X_imputed = X_impute.progress_apply(lambda x: impute_with_annoy(x, t, k=5))

# Replace NaN values
# NOTE(review): df_train_balanced was sampled from df_train in an earlier
# cell, so these imputed values do not reach it unless it is rebuilt - confirm.
df_train[columns_to_impute] = X_imputed

df_train

In [None]:
df_train.isnull().sum()

### Test

In [None]:
# The test split comes from the second archive, unzipped into /content/hacka.
ROOT_Test = Path("/content/hacka")
TEST_DIR = ROOT_Test / "test_dataset" / "transformed"
# TEST_DIR = ROOT_Test / "parquet_files" / "test"

# Same layout as the training data_store: keys match feature_eng's parameters
# and the trailing 1/2 argument triggers per-case_id aggregation.
data_store = {
    "df_base": read_file(ROOT_Test / "test.parquet"),  # /content/hacka/test.parquet
    "depth_0": [read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ]
}

In [None]:
len(cat_cols)

In [None]:
# misso = ['description_5085714M', 'education_88M', 'maritalst_893M', 'cardtype_51L', 'last_credacc_status_367L', 'max_classificationofcontr_1114M', 'max_contractst_516M', 'max_contracttype_653M', 'max_credor_3940957M', 'max_periodicityofpmts_997M', 'max_pmtmethod_731M', 'max_purposeofcred_722M', 'max_subjectrole_326M', 'max_subjectrole_43M', 'last_classificationofcontr_1114M', 'last_contractst_516M', 'last_contracttype_653M', 'last_credor_3940957M', 'last_periodicityofpmts_997M', 'last_pmtmethod_731M', 'last_purposeofcred_722M', 'last_subjectrole_326M', 'last_subjectrole_43M', 'last_maritalst_703L']
# df_train = df_train.drop(columns=list(misso))
# df_train.columns

In [None]:
# print(len(cat_cols))

# for i in misso:
#   cat_cols.remove(i)
# print(len(cat_cols))

In [None]:
# Same assembly steps as for the training frame.
df_test = feature_eng(**data_store)
del data_store
gc.collect()
# NOTE(review): filter_cols decides from the test data itself, so the columns
# it drops here may differ from those dropped for train - confirm.
df_test = df_test.pipe(Pipeline.filter_cols)
# Reuse the training cat_cols; every one of them must exist in df_test,
# otherwise the column selection inside to_pandas fails.
df_test, _ = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)
df_test

In [None]:
indexx = df_test['case_id']
indexx

In [None]:
# Stringify any column that is still plain object dtype (columns listed in
# cat_cols were already cast to category by to_pandas).
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('str').fillna('-1')

# Reuse the encoder that was fitted on the RAW training categories.  By this
# point df_train[cat_cols] holds the encoded numbers, so refitting on it would
# learn numeric "categories" that no raw test value can match.  Only the
# unknown-category policy is adjusted here: values unseen in training map to -1.
encoder.set_params(handle_unknown='use_encoded_value', unknown_value=-1)
df_test[cat_cols] = encoder.transform(df_test[cat_cols])
df_test

In [None]:
# Any column that is still object dtype is replaced by integer category codes
# computed from the test data alone.
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('category').cat.codes

# case_id was saved in `indexx` by an earlier cell for the submission file.
df_test = df_test.drop(columns=['case_id'])
df_test = reduce_mem_usage(df_test)
df_test

In [None]:
# Ratio of total debt to main income.  The operands are widened to float32
# first (reduce_mem_usage may have stored them as float16, whose range the
# quotient can exceed) and a zero income is treated as missing, so the feature
# contains NaN rather than +/-inf, which several of the models reject.
df_test["debt-to-income"] = (
    df_test["totaldebt_9A"].astype(np.float32)
    / df_test["maininc_215A"].astype(np.float32).replace(0, np.nan)
)
df_test

## Save to pkl

In [None]:
y = df_train_balanced["target"]
df_train_balanced = df_train_balanced.drop(columns=["target", "case_id", "WEEK_NUM"])
df_train_balanced = reduce_mem_usage(df_train_balanced)
df_train_balanced
# y = df_train["target"]
# df_train = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# df_train = reduce_mem_usage(df_train)
# df_train

In [None]:
joblib.dump((df_train_balanced, y, df_test), 'data.pkl')
# joblib.dump((df_train, y, df_test), 'data.pkl')

# Modeling

## Train, Validation and Test

In [None]:
df_train, y, df_test = joblib.load('data.pkl')
df_train.shape, df_test.shape

In [None]:
missing_test = set(df_test.columns) - set(df_train.columns)
missing_test

In [None]:
df_test = df_test.drop(columns=list(missing_test))
df_test.shape

In [None]:
missing_train = set(df_train.columns) - set(df_test.columns)
missing_train

In [None]:
# df_train = df_train.drop(columns=list(missing_train))
# df_train.shape

In [None]:
set(df_train.columns) - set(df_test.columns), set(df_test.columns) - set(df_train.columns)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(df_train, y, test_size=0.25, random_state=888, stratify=y)

print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("y_train shape:", y_train.shape)
print("y_validation shape:", y_validation.shape)

## LGB

In [None]:
# Fitted models are collected here for the ensemble.
fitted_models_lgb = []
device: str = "gpu"

params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

model_1 = lgb.LGBMClassifier(**params1)
model_1.fit(X_train, y_train)
fitted_models_lgb.append(model_1)

# ROC AUC ranks by score, so it needs the positive-class probability;
# predict() returns hard 0/1 labels and would collapse the curve to one point.
y_pred_proba = model_1.predict_proba(X_validation)[:, 1]
auc_score = roc_auc_score(y_validation, y_pred_proba)
print("AUC Score:", auc_score)

## HGB

In [None]:
# params2 = {
#     "boosting_type": "gbdt",
#     "colsample_bynode": 0.8,
#     "colsample_bytree": 0.8,
#     "extra_trees": True,
#     "learning_rate": 0.03,
#     "l1_regularization": 0.1,
#     "l2_regularization": 10,
#     "max_depth": 16,
#     "metric": "auc",
#     "n_estimators": 2000,
#     "num_leaves": 54,
#     "objective": "binary",
#     "random_state": 42,
#     "verbose": -1,
# }

# model_2 = lgb.LGBMClassifier(**params1)
# model_2.fit(X_train, y_train)
# fitted_models_lgb.append(model_2)

# y_pred_proba = model_2.predict(X_test)
# auc_score = roc_auc_score(y_test, y_pred_proba)
# print("AUC Score:", auc_score)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score

model_2 = HistGradientBoostingClassifier(max_iter=300, random_state=42)
# Forward selection of 386 features using model_2 as the scoring estimator.
sfs = SequentialFeatureSelector(model_2, n_features_to_select=386, direction='forward')
sfs.fit(X_train, y_train)

X_train_selected = sfs.transform(X_train)
X_validation_selected = sfs.transform(X_validation)

model_2.fit(X_train_selected, y_train)
# NOTE(review): this model expects only the selected features, while the other
# models in fitted_models_lgb are fitted on every column - confirm before
# mixing them in one ensemble.
fitted_models_lgb.append(model_2)
# ROC AUC needs the positive-class probability, not hard 0/1 labels.
y_pred_proba = model_2.predict_proba(X_validation_selected)[:, 1]
auc_score = roc_auc_score(y_validation, y_pred_proba)
print("AUC Score:", auc_score)

## CatBoost

In [None]:
# The file-level catboost import is commented out, so bring the names into
# scope here; otherwise Pool and CatBoostClassifier raise NameError.
from catboost import CatBoostClassifier, Pool

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_validation, y_validation)

model_3 = CatBoostClassifier(
    best_model_min_trees = 2000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 42,
    task_type = "GPU",
    use_best_model = True
)
# The validation pool is the eval set used for best-model selection.
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)

# ROC AUC needs the positive-class probability, not hard 0/1 labels.
y_pred_proba = model_3.predict_proba(X_validation)[:, 1]
auc_score = roc_auc_score(y_validation, y_pred_proba)
print("AUC Score:", auc_score)

## XGBoost

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
pd.DataFrame(compute_sample_weight("balanced", y_train)).value_counts()

In [None]:
(y_train == 0).sum(), (y_train == 1).sum()

In [None]:
scale_pos_weight = (y_train == 0).sum()/(y_train == 1).sum()
scale_pos_weight

In [None]:
import xgboost as xgb
# Rebinds the notebook-level `device` that the LightGBM cell set to "gpu".
device: str = "cuda"

params4 = {
    "learning_rate": 0.025,
    "max_depth": 64,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "reg_alpha": 0.15,
    "device": device,
    "reg_lambda": 10,
    "n_estimators": 2000,
    "random_state": 123,
    "eval_metric": "auc",
    # negative/positive ratio computed from y_train in an earlier cell
    "scale_pos_weight": scale_pos_weight
}
model_4 = xgb.XGBClassifier(**params4)
# Fitted on the full df_train / y, not on the X_train split, so the
# validation rows are part of the training data for this model.
model_4.fit(df_train, y)
# fitted_models_lgb.append(model_4)
print("Model_4 Success")

# y_pred_proba = model_4.predict(X_validation)
# auc_score = roc_auc_score(y_validation, y_pred_proba)
# print("AUC Score:", auc_score)

## Ensemble

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [None]:
# The file-level catboost import is commented out, so bring the names into
# scope here; otherwise Pool and CatBoostClassifier raise NameError.
from catboost import CatBoostClassifier, Pool

# Start a fresh model list for the final ensemble.
fitted_models_lgb = []
device: str = "gpu"

# --- Model 1: LightGBM, fitted on the full training frame ---
params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.6832922641240619,
    "colsample_bytree": 0.6086478326464297,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.010667555316097234,
    "l1_regularization": 0.9977271932001609,
    "l2_regularization": 5.035210676925114,
    "max_depth": 22,
    "metric": "auc",
    "n_estimators": 1851,
    "num_leaves": 124,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

model_1 = lgb.LGBMClassifier(**params1)
model_1.fit(df_train, y)
fitted_models_lgb.append(model_1)
print("Model_1 Success")

# --- Model 2: HistGradientBoosting, fitted on the full training frame ---
params2 = {
    'max_iter': 678,
    'max_leaf_nodes': 103,
    'min_samples_leaf': 13,
    'learning_rate': 0.02471078401038115,
    'random_state': 42
}

model_2 = HistGradientBoostingClassifier(**params2)
model_2.fit(df_train, y)
fitted_models_lgb.append(model_2)
print("Model_2 Success")

# --- Model 3: CatBoost ---
# Unlike the other three models this one is fitted on the X_train split, with
# the validation split as eval set for use_best_model.
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_validation, y_validation)
model_3 = CatBoostClassifier(
    best_model_min_trees = 2000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 42,
    task_type = "GPU",
    use_best_model = True
)
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)
print("Model_3 Success")

# --- Model 4: XGBoost, fitted on the full training frame ---
params4 = {
    "learning_rate": 0.05,
    "max_depth": 20,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "reg_alpha": 0.1,
    "device": device,
    "reg_lambda": 10,
    "n_estimators": 2000,
    "random_state": 42,
    "eval_metric": "auc",
}
model_4 = xgb.XGBClassifier(**params4)
model_4.fit(df_train, y)
fitted_models_lgb.append(model_4)
print("Model_4 Success")

In [None]:
import matplotlib.pyplot as plt

lgb.plot_importance(model_1, importance_type="gain", figsize=(7,6), title="LightGBM Feature Importance (Gain)" , max_num_features = 10)
plt.show()

In [None]:
lgb.plot_importance(model_1, importance_type="split", figsize=(7, 6), title="LightGBM Feature Importance (Split)" , max_num_features = 10)

In [None]:
from scipy.stats import mode

class VotingModel(BaseEstimator, RegressorMixin):
    """Combine already-fitted estimators by voting / probability averaging.

    Args:
        estimators: fitted models exposing ``predict`` and ``predict_proba``.
        weights: optional sequence with one weight per estimator.  ``None``
            means a majority vote in ``predict`` and a plain mean in
            ``predict_proba``.
    """

    def __init__(self, estimators, weights=None):
        super().__init__()
        self.estimators = estimators
        self.weights = weights

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def _weight_vector(self):
        """Return the weights as a 1-D float array, or None when unweighted."""
        if self.weights is None:
            return None
        # Accept lists/tuples as well as arrays.
        weights = np.asarray(self.weights, dtype=float).ravel()
        if weights.shape[0] != len(self.estimators):
            raise ValueError(
                f"Expected {len(self.estimators)} weights, got {weights.shape[0]}"
            )
        return weights

    def predict(self, X):
        """Return one label per sample.

        Unweighted: the most frequent label across estimators.  Weighted: the
        weighted sum of the predicted labels rounded to the nearest integer
        (weights are used as given, not normalised).
        """
        # ravel() gives every estimator's output the same 1-D shape.
        y_preds = np.array([np.ravel(est.predict(X)) for est in self.estimators])
        weights = self._weight_vector()
        if weights is None:
            # ravel() keeps the result 1-D whichever shape mode() returns.
            return np.ravel(mode(y_preds, axis=0)[0])
        weighted_sum = np.sum(y_preds * weights.reshape(-1, 1), axis=0)
        return np.round(weighted_sum).astype(int)

    def predict_proba(self, X):
        """Return the (weighted) average of the estimators' class probabilities."""
        y_preds = [est.predict_proba(X) for est in self.estimators]
        # With weights=None np.average is a plain mean; otherwise the weights
        # are normalised so each row still sums to one.
        return np.average(y_preds, axis=0, weights=self._weight_vector())

model = VotingModel(fitted_models_lgb)
# model = VotingModel(fitted_models_lgb, weights=np.array([0.34, 0.33, 0.33]))
model

# Submission

In [None]:
pred = model_4.predict_proba(df_test)

In [None]:
y_pred = pd.Series(pred[: , 1])
y_pred

In [None]:
sub = pd.DataFrame({
    "case_id": indexx, "target": y_pred
})
sub

In [None]:
df_subm = pd.read_csv("/content/sample_submission.csv")
df_subm

In [None]:
# df_subm = df_subm.drop(columns=['target'])
df_subm = df_subm.drop(columns=['score'])
merged_df = df_subm.merge(sub, on="case_id", how="left")
merged_df

In [None]:
merged_df[merged_df['target'] == 1]

In [None]:
# NOTE(review): this overwrites the first five predictions with hard-coded
# values - confirm it is intentional before submitting.
# A single .loc assignment replaces the chained `df[col][:5] = ...` form,
# which may write to a temporary copy instead of merged_df.
merged_df.loc[merged_df.index[:5], 'target'] = [0, 0, 0, 0, 1]
merged_df.head()

In [None]:
merged_df.to_csv("xgb_homeCredit.csv", index=False)

In [None]:
merged_df.to_csv("xgb_spagg.csv", index=False)

In [None]:
merged_df.to_csv("Art_lgb_hgb_cb_xgb.csv", index=False)