# Set up the Kaggle API

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'home-credit-credit-risk-model-stability:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F50160%2F7921029%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240521%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240521T081657Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D194228c8056848eed92d0cde88d36347021113f50d8e6137369b3845b15399442b43805f970378239656c056352f616f80bf2a2f64fed490200a44684732db8336bf1d600598be7ff87748422d0b64c49ffe73a61d6fc51e7e7df0ed14028840b129b0194fcb79dd530253d7c8ecfaa18f217f00dd3b6fad9d5bdd5b250936ab874f6c08438b4da3de28708892f1da174d7d204f4aeba1d566fd4598d04d22139240d6414ac251e6a30085690089c5dac0c7e18ee34434ff8f9ee23423cb8229cd4ada320abec8754daccda1d8a329b7e68ddfe4eef7b4532dd3fe206a31b8c06d28779f59e330a141a0401d25ce2681eb438edf181603dc43f627d2be83f9a8,homecredit-models-public/other/lgb/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F27710%2F33095%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240521%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240521T081657Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D372798d7ba35bce2fbf06da9879b814f524d215bdf96d7b939d660d0214f4e8599c84aa04d8cc130514d94c49a40a952cffc3de942db9d827e197bfafdfeaa5b868e2aa575abb66ac2f8950ed444f8b524560f8d36012950fc157ece79b704a88b2d708a3d7752900c3a5f28e7ea9efaca7a3817ea361332608cbbc51ae6d833c104e8961db593083b671592a4a170f2299e3a1b449bcd0da67b4ff65c0bbff5658d26426ff878f0db24746bd7ff23212ec9439677819277b2bfe0a75f14cf1c9ed372164727076408542a35d08efb56ff0b3eea4f1b2f1f1aeb93c1fa79c3d5cdea0422fe3c044f3c1567a4a2577214084c06d8b05250bbbcb5b50561b04ab9,homecredit-models-public/other/cat/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F27
711%2F33096%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240521%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240521T081657Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D13ccd3ccc303d419097616bd99eac50013654df7a6e1f3fcd4e003cadbbb1f288a774bfcc96c7a88fae374a8f079fa57eb2442e195732cb66b1efafe972f6c8a50ec21e7f5b964d589c468e7e7fb2290a598f22833d67b163c0474b06084a063318986df235e7c28f82a12e700c489a6d27c5556e7714c9a99a50ea232c2d0cf3baf807c9137571a3c021746555a66a243fa92f4f5153e898ee5eda7d23c453a67a8678204c59c9dce3e4253d73a0eeb09834ff02df2c0b62beec5adf106d510614197480e11a2830a5cebd8221f76cc8d607db6846bb334483f4b592a7a8023ec9bc245d797916bb0334bd1b9b1db418740a266644b645fcb3bb613d69b5a58'

# Locations that mimic Kaggle's notebook filesystem layout.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount whatever is mounted there
# (errors are discarded), delete the tree, then recreate both directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the two directories as ../input and ../working relative to the current
# working directory; an already existing link is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH. A source that fails to download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<target directory>:<URL-encoded download URL>". Split on the
    # first colon only so a colon later in the entry cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; the progress bar then stays empty
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must be
            # on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a name other than `tarfile` so the module is not shadowed
                # for the following iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

In [None]:
import json
import os

from google.colab import userdata

# Read the Kaggle credentials from the Colab secrets named KAGGLE_USER / KAGGLE_KEY.
username = userdata.get('KAGGLE_USER')
key = userdata.get('KAGGLE_KEY')

# Write ~/.kaggle/kaggle.json from Python rather than through a shell `echo`:
# the key never appears on a command line, quoting cannot corrupt the JSON, and
# the same home-relative path is used for both writing and chmod.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json, 'w') as fh:
    json.dump({"username": username, "key": key}, fh)
# Restrict the credentials file to the owner.
os.chmod(kaggle_json, 0o600)

[Home Credit - Credit Risk Model Stability](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/overview)

In [None]:
# Download the "Home Credit - Credit Risk Model Stability" competition archive
# with the Kaggle CLI, extract it into the current directory, and delete the
# zip only if extraction succeeded (`&&`).
!kaggle competitions download -c home-credit-credit-risk-model-stability
!unzip /content/home-credit-credit-risk-model-stability.zip && rm -rf /content/home-credit-credit-risk-model-stability.zip

In [None]:
# Rename the sample submission of the first competition.
# NOTE(review): presumably so the next archive's sample_submission.csv does not clash — confirm.
!mv /content/sample_submission.csv /content/sample_sub_og.csv

[Home Credit - Credit Risk Modeling](https://www.kaggle.com/competitions/home-credit-credit-risk-modeling/overview)

In [None]:
# Download the "Home Credit - Credit Risk Modeling" competition archive with the
# Kaggle CLI, extract it into the current directory, and delete the zip only if
# extraction succeeded (`&&`).
!kaggle competitions download -c home-credit-credit-risk-modeling
!unzip /content/home-credit-credit-risk-modeling.zip && rm -rf /content/home-credit-credit-risk-modeling.zip

In [None]:
# !mv /content/sample_submission.csv /content/sample_sub_hack.csv

In [None]:
!mv /content/test.parquet /content/test_dataset/transformed

# 0.361

In [None]:
import sys  # System-specific parameters and functions
import subprocess  # Spawn new processes, connect to their input/output/error pipes, and obtain their return codes
import os  # Operating system dependent functionality
import gc  # Garbage Collector interface
from pathlib import Path  # Object-oriented filesystem paths
from glob import glob  # Unix style pathname pattern expansion

import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
import polars as pl  # Fast DataFrame library implemented in Rust

from datetime import datetime  # Basic date and time types
import seaborn as sns  # Statistical data visualization
import matplotlib.pyplot as plt  # MATLAB-like plotting framework

import joblib  # Save and load Python objects

import warnings  # Warning control

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


warnings.filterwarnings('ignore')  # Ignore warnings

In [None]:
# Load the training base table with pandas and display it.
df_train_base = pd.read_csv('/content/csv_files/train/train_base.csv')

display(df_train_base)

In [None]:
%%time
# Time reading one static-table CSV with pandas.
df_train_static= pd.read_csv('/content/csv_files/train/train_static_0_0.csv')
display(df_train_static)

In [None]:
%%time
# Time reading the same CSV with Polars, for comparison with the cell above.
pl_train_static = pl.read_csv('/content/csv_files/train/train_static_0_0.csv')
display(pl_train_static)

In [None]:
# Drop the references to the three exploratory frames.
del df_train_static, pl_train_static, df_train_base

In [None]:
import gc  # Garbage Collector interface
# Run a collection right after the `del` above.
gc.collect()

## 🛠️📊 **Pipeline for Data Preprocessing**
Let's define the helper functions used to preprocess the data: `set_table_dtypes` works on Polars DataFrames and `convert_strings` on pandas DataFrames.

#### **set_table_dtypes(df)**

- This method iterates through each column in the DataFrame (df) and converts the data types based on certain conditions.
- If the column name is one of ["case_id", "WEEK_NUM", "num_group1", "num_group2"], it converts the column to Int64.
- If the column name is "date_decision", it converts the column to Date.
- If the last character of the column name is "P" or "A", it converts the column to Float64.
- If the last character of the column name is "M", it converts the column to String.
- If the last character of the column name is "D", it converts the column to Date.

Finally, it returns the DataFrame with modified data types.


In [None]:
def set_table_dtypes(df):
    """Cast the columns of a Polars DataFrame to the dtype implied by their name.

    Rules, first match wins:
      * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
      * ``date_decision``                                         -> Date
      * name ends with ``P`` or ``A``                             -> Float64
      * name ends with ``M``                                      -> String (Utf8)
      * name ends with ``D``                                      -> Date
    Every other column is left as it is.

    Args:
        df: Polars DataFrame.

    Returns:
        A new DataFrame with the casts applied; ``df`` itself when nothing matches.
    """
    casts = []
    for col in df.columns:
        if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
            casts.append(pl.col(col).cast(pl.Int64))
        elif col == "date_decision":
            casts.append(pl.col(col).cast(pl.Date))
        elif col.endswith(("P", "A")):
            casts.append(pl.col(col).cast(pl.Float64))
        elif col.endswith("M"):
            # Same rule as Pipeline.set_table_dtypes further down in this notebook.
            casts.append(pl.col(col).cast(pl.Utf8))
        elif col.endswith("D"):
            casts.append(pl.col(col).cast(pl.Date))
    # Apply all casts in one call instead of rebuilding the frame per column.
    return df.with_columns(casts) if casts else df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every object/string column into an ordered categorical, in place.

    Each converted column gets the category ``"Unknown"`` in addition to the
    values it contains, so that unseen values can later be mapped to it.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        if df[col].dtype.name in ['object', 'string']:
            as_category = df[col].astype("string").astype('category')
            categories = as_category.cat.categories.to_list()
            # Categories must be unique: add "Unknown" only when the data does
            # not already contain that value.
            if "Unknown" not in categories:
                categories.append("Unknown")
            new_dtype = pd.CategoricalDtype(categories=categories, ordered=True)
            df[col] = as_category.astype(new_dtype)
    return df


In [None]:
dataPath = os.getcwd()+'/'
dataPath

In [None]:
# Load the training tables from CSV with Polars. All tables except the base
# table are passed through set_table_dtypes right after reading.
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
# The static table is split over two files; "vertical_relaxed" stacks them.
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes)
train_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes)

In [None]:
# Load the test tables the same way as the training tables; here the static
# table is split over three files.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes)
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes)

In [None]:
test_static.head()

In [None]:
len(test_static.columns)

## **Feature engineering**
In this part, we show a simple example of joining tables via `case_id`. The loading and joining are done with the Polars library, which is very fast and has a much smaller memory footprint than pandas.

In [None]:
# Tables with depth > 0 contain a num_group1 column (and possibly num_group2),
# i.e. several rows per case_id, so they are aggregated to one row per case_id.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    # True when any row of the case has income type SELFEMPLOYED.
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1 == 0 has a special meaning: it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# This table has both num_group1 and num_group2, so it is aggregated as well.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    # True when any row of the case has a value above 31.
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

# Only A-type and M-type columns (by last character of the name) are used in this example.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Left-join everything onto the base table by case_id.
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [None]:
# Same feature construction as for the training tables, applied to the test tables.
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Keep only the num_group1 == 0 row of each case.
test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

# The column selections computed from the training tables are reused here, so
# the test frame gets the same static columns as the training frame.
data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [None]:
# Split the unique case_ids 60% / 20% / 20% into train / valid / test:
# first 60% vs. the rest, then the rest in half. Seeds are fixed to 1.
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# Predictor columns: names whose last character is upper case while everything
# before it is lower case.
cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

In [None]:
def from_polars_to_pandas(case_ids) -> tuple:
    """Select the rows of the global ``data`` frame for ``case_ids`` as pandas objects.

    Args:
        case_ids: collection of case_id values accepted by ``Expr.is_in``
            (the callers in this notebook pass the outputs of train_test_split).

    Returns:
        A 3-tuple ``(base, X, y)``:
          * ``base`` - pandas DataFrame with ``case_id``, ``WEEK_NUM``, ``target``
          * ``X``    - pandas DataFrame with the ``cols_pred`` predictor columns
          * ``y``    - pandas Series with ``target``
    """
    # Filter once and slice the result three times; the original repeated the
    # identical filter for every element of the tuple.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

## **reduce_mem_usage(df)**

- Input:
  - df: Input DataFrame.
- Output: Returns the DataFrame with reduced memory usage.
- Process:
  - Calculates the initial memory usage of the DataFrame (start_mem) using df.memory_usage().
  - Iterates through each column of the DataFrame:
    - Checks if the column type is a category. If so, skips to the next column.
    - For non-category columns:
      - Determines the minimum and maximum values of the column (c_min and c_max).
      - If the column type is integer:
        - Checks if the data can be fit into int8, int16, int32, or int64 and converts the column type accordingly.
      - If the column type is float:
        - Checks if the data can be fit into float16, float32, or float64 and converts the column type accordingly.
      - If the column type is object (string), it skips the conversion.
  - Calculates the final memory usage of the DataFrame (end_mem) after the modifications.
- Returns the DataFrame with reduced memory usage.

#### Study Sources
- Optimizing memory usage in Pandas: [Optimizing Memory Usage in Pandas](https://www.dataquest.io/blog/pandas-big-data/)
- Understanding data types and memory in Pandas: [Pandas Data Types and Memory Usage](https://pbpython.com/pandas_dtypes.html)
- Data type conversion in NumPy: [NumPy Data Types](https://numpy.org/doc/stable/reference/arrays.scalars.html#arrays-scalars-built-in)

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of a pandas DataFrame in place to save memory.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    that holds their min and max. Float columns are cast to float16, float32 or
    float64 depending on their value range (float16/float32 reduce precision).
    All other columns (category, object, bool, unsigned, datetime, pandas
    extension dtypes) are left untouched.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same ``df`` object. The memory usage after the conversion and the
        relative saving are printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int ("i") and float ("f") columns are downcast.
        # Anything else previously fell into the float branch, turning bools
        # into float16 and raising on datetime columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == "i":
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column that touches a type's limits still fits it.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (e.g. an all-null column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Materialize the three splits as pandas objects: (base columns, predictors, target).
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# Both helpers modify the frame they receive in place and return it, so X_train,
# X_valid and X_test are updated even though only the loop variable is rebound.
for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)
    df = reduce_mem_usage(df)

In [None]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

In [None]:
# Wrap the train/valid splits as LightGBM datasets; the validation set
# references the training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

# Binary classifier evaluated with AUC.
# NOTE(review): max_depth=3 allows at most 2**3 = 8 leaves per tree, so
# num_leaves=31 is presumably never reached — confirm against the LightGBM docs.
# NOTE(review): no random seed is set in these parameters.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

# Train with evaluation logging every 50 rounds and early stopping
# (patience 10) on the validation set.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)

In [None]:
# Score every split with the best iteration of the model and store the
# prediction in the "score" column of the matching base frame.
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}')
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}')
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability score computed from the weekly Gini of the predictions.

    The Gini (``2 * AUC - 1``) is computed per ``WEEK_NUM``; a straight line is
    fitted through the weekly values. The score is the mean weekly Gini, plus
    ``w_fallingrate`` times the slope when the slope is negative, plus
    ``w_resstd`` times the standard deviation of the fit residuals.

    Args:
        base: pandas DataFrame with columns ``WEEK_NUM``, ``target``, ``score``.
        w_fallingrate: weight applied to a negative slope.
        w_resstd: weight applied to the residual standard deviation.

    Returns:
        The stability score as a float.
    """
    def _weekly_gini(week):
        return 2*roc_auc_score(week["target"], week["score"])-1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly_gini = ordered.groupby("WEEK_NUM")[["target", "score"]].apply(_weekly_gini).tolist()

    # Linear trend of the Gini over the week index (0, 1, 2, ...).
    week_index = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(week_index, weekly_gini, 1)
    trend = slope*week_index + intercept
    residual_std = np.std(weekly_gini - trend)
    mean_gini = np.mean(weekly_gini)
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std

# Stability score of each split, computed from the "score" column added to the
# base frames by the prediction cell.
stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}')
print(f'The stability score on the valid set is: {stability_score_valid}')
print(f'The stability score on the test set is: {stability_score_test}')

In [None]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

# Give every categorical column of the submission frame exactly the training
# dtype. The dtype is taken from X_train as it is, which keeps the category
# order; building it from a Python set made the order of an *ordered*
# categorical arbitrary. X_train already has this dtype, so it is not re-cast.
for col in categorical_cols:
    new_dtype = X_train[col].dtype
    if isinstance(X_submission[col].dtype, pd.CategoricalDtype):
        # Values never seen in training are mapped to "Unknown", which
        # convert_strings added as a category to both frames.
        train_categories = set(new_dtype.categories)
        submission_categories = set(X_submission[col].cat.categories)
        new_categories = submission_categories - train_categories
        X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    # A non-categorical submission column is cast directly; values outside the
    # training categories become missing.
    X_submission[col] = X_submission[col].astype(new_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [None]:
y_submission_pred

In [None]:
X_submission

In [None]:
# Build the submission table (case_id as index, score as the only column) and
# write it to ./submission.csv.
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [None]:
submission

# 0.653

In [None]:
import sys  # System-specific parameters and functions
import subprocess  # Spawn new processes, connect to their input/output/error pipes, and obtain their return codes
import os  # Operating system dependent functionality
import gc  # Garbage Collector interface
from pathlib import Path  # Object-oriented filesystem paths
from glob import glob  # Unix style pathname pattern expansion

import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
import polars as pl  # Fast DataFrame library implemented in Rust

from datetime import datetime  # Basic date and time types
import seaborn as sns  # Statistical data visualization
import matplotlib.pyplot as plt  # MATLAB-like plotting framework

import joblib  # Save and load Python objects

import warnings  # Warning control
warnings.filterwarnings('ignore')  # Ignore warnings

from sklearn.base import BaseEstimator, RegressorMixin  # Base classes for all estimators in scikit-learn
from sklearn.metrics import roc_auc_score  # ROC AUC score
import lightgbm as lgb  # LightGBM: Gradient boosting framework
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold  # Cross-validation strategies
from imblearn.over_sampling import SMOTE  # Oversampling technique for imbalanced datasets
from sklearn.preprocessing import OrdinalEncoder  # Encode categorical features as an integer array
from sklearn.impute import KNNImputer  # Imputation for completing missing values using k-Nearest Neighbors

In [None]:
!pip install catboost

## Pipeline for Data Preprocessing

In [None]:
class Pipeline:
    """Namespace of Polars preprocessing steps.

    Every method takes a Polars DataFrame and returns a DataFrame, so each one
    can be passed to ``DataFrame.pipe``. The methods are static: they use no
    instance or class state and work both as ``Pipeline.step`` and on an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to the dtype implied by their name.

        Rules, first match wins:
          * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
          * ``date_decision``                                         -> Date
          * name ends with ``P`` or ``A``                             -> Float64
          * name ends with ``M``                                      -> String (Utf8)
          * name ends with ``D``                                      -> Date
        Every other column is left as it is.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.Utf8))  # String
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # One with_columns call instead of rebuilding the frame per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` date column by its offset in days from ``date_decision``.

        Afterwards the ``date_decision`` and ``MONTH`` columns are dropped.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [
                    # date - decision date, expressed as a whole number of days
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop string columns that have a single distinct value or more than 200.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. A filter on
        the share of nulls (drop above 70%) used to sit here and is disabled.
        """
        protected = ("target", "case_id", "WEEK_NUM")
        to_drop = []
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.Utf8:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    to_drop.append(col)
        return df.drop(to_drop) if to_drop else df

In [None]:
class Aggregator:
    """Builds the Polars aggregation expressions used to collapse a table to one row per case_id.

    Every ``*_expr`` method selects columns by name and returns one ``max``
    expression per selected column, aliased ``max_<column>``. Add or remove
    aggregations as needed, keeping in mind that every extra feature costs memory.
    """

    @staticmethod
    def _max_exprs(cols):
        """Return ``pl.max(col)`` aliased to ``max_<col>`` for every column in ``cols``."""
        return [pl.max(col).alias(f"max_{col}") for col in cols]

    @staticmethod
    def num_expr(df):
        """Max of the numeric columns (names ending in ``P`` or ``A``)."""
        return Aggregator._max_exprs([col for col in df.columns if col.endswith(("P", "A"))])

    @staticmethod
    def date_expr(df):
        """Max of the date columns (names ending in ``D``)."""
        return Aggregator._max_exprs([col for col in df.columns if col.endswith("D")])

    @staticmethod
    def str_expr(df):
        """Max of the string columns (names ending in ``M``)."""
        return Aggregator._max_exprs([col for col in df.columns if col.endswith("M")])

    @staticmethod
    def other_expr(df):
        """Max of the remaining typed columns (names ending in ``T`` or ``L``)."""
        return Aggregator._max_exprs([col for col in df.columns if col.endswith(("T", "L"))])

    @staticmethod
    def count_expr(df):
        """Max of every column whose name contains ``num_group``."""
        return Aggregator._max_exprs([col for col in df.columns if "num_group" in col])

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression lists: numeric, date, string, other, num_group."""
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [None]:
def read_file(path, depth=None):
    """Read one parquet file, normalize its dtypes and optionally aggregate it.

    Args:
        path: location of the parquet file.
        depth: when 1 or 2 the table is aggregated to one row per ``case_id``
            with the expressions from ``Aggregator.get_exprs``; any other value
            returns the table without aggregation.

    Returns:
        The resulting Polars DataFrame.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Read all parquet files matching a glob pattern into one frame.

    Every file is loaded with ``read_file`` (dtype normalization plus the
    optional per-file aggregation for depth 1 or 2), the pieces are stacked and
    the result is reduced to one row per ``case_id``.

    Args:
        regex_path: glob pattern (despite the name, not a regular expression).
        depth: forwarded to ``read_file``.

    Returns:
        The combined Polars DataFrame.

    Raises:
        ValueError: when no file matches the pattern.
    """
    # Sorted so the files are always processed in the same order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise ValueError(f"No files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Join all feature tables onto the base table and convert the date columns.

    Args:
        df_base: base Polars DataFrame with ``case_id`` and ``date_decision``.
        depth_0, depth_1, depth_2: lists of Polars DataFrames keyed by ``case_id``.

    Returns:
        The base table with ``month_decision`` and ``weekday_decision`` added,
        every table left-joined on ``case_id`` (clashing column names get the
        suffix ``_<table position>``), then passed through ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    joined = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )
    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        joined = joined.join(table, how="left", on="case_id", suffix=f"_{position}")
    return Pipeline.handle_dates(joined)

def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and make the given columns categorical.

    Args:
        df_data: object with a ``to_pandas()`` method (a Polars DataFrame here).
        cat_cols: columns to convert to ``category``; when None, every column
            that has dtype ``object`` after the conversion is used.

    Returns:
        ``(frame, cat_cols)``: the pandas DataFrame and the list of columns
        that were made categorical.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        object_columns = frame.select_dtypes("object").columns
        cat_cols = list(object_columns)
    as_category = frame[cat_cols].astype("category")
    frame[cat_cols] = as_category
    return frame, cat_cols

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of a pandas DataFrame in place to save memory.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    that holds their min and max. Float columns are cast to float16, float32 or
    float64 depending on their value range (float16/float32 reduce precision).
    All other columns (category, object, bool, unsigned, datetime, pandas
    extension dtypes) are left untouched.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same ``df`` object. Memory usage before and after the conversion
        and the relative saving are printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int ("i") and float ("f") columns are downcast.
        # Anything else previously fell into the float branch, turning bools
        # into float16 and raising on datetime columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == "i":
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column that touches a type's limits still fits it.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (e.g. an all-null column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Data Preprocessing and Feature Engineering

In [None]:
# Root of the "model stability" competition data as laid out under /kaggle/input.
ROOT_og = Path("/kaggle/input/home-credit-credit-risk-model-stability")

# Directory holding the training parquet files.
TRAIN_DIR = ROOT_og / "parquet_files" / "train"
# TRAIN_DIR = "/content/parquet_files/train"

In [None]:
# Load all training tables. The keys match the parameter names of feature_eng,
# so the dict can be unpacked into it. Depth-1 and depth-2 tables are read with
# a depth argument, which aggregates them to one row per case_id.
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        # read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2),
    ]
}


In [None]:
# Join all tables into one training frame and report the column count.
df_train = feature_eng(**data_store)
print(len(df_train.columns))

# The individual tables are no longer needed once they are joined.
del data_store
gc.collect()

# Drop string columns with a single value or more than 200 distinct values.
df_train = df_train.pipe(Pipeline.filter_cols)
print(len(df_train.columns))

# Convert to pandas; cat_cols lists the columns that were made categorical.
df_train, cat_cols = to_pandas(df_train)
print(len(df_train.columns), len(cat_cols))

df_train = reduce_mem_usage(df_train)
# nums = df_train.select_dtypes(exclude='category').columns
df_train

In [None]:
# NOTE(review): the block below, which builds `nans_groups` (columns grouped by
# their number of missing values), is commented out, yet the "Reduce Group
# Columns" cell further down reads `nans_groups`. Either restore it or remove
# that cell.
# nans_df = df_train[nums].isna()
# print(len(nans_df.columns))

# nans_groups={}
# for col in nums:
#     cur_group = nans_df[col].sum()
#     try:
#         nans_groups[cur_group].append(col)
#     except:
#         nans_groups[cur_group]=[col]

# Replace the categorical columns by the codes produced by an OrdinalEncoder
# fitted on the training frame.
# NOTE(review): the encoder is created with default settings; check how values
# unseen during fit are meant to be handled when it is applied to other data.
encoder = OrdinalEncoder()
# del nans_df; x=gc.collect()
# print(nans_groups)
df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train

In [None]:
len(df_train[df_train['target'] == 1]), len(df_train[df_train['target'] == 0])

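- Downsampling: the majority class (`target == 0`) is randomly undersampled to the size of the minority class (`target == 1`).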

In [None]:
# Balance the classes: sample as many target == 0 rows as there are
# target == 1 rows (fixed seed), combine both, then shuffle the result and
# reset its index.
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]
n_minority = len(df_minority)
df_majority_undersampled = df_majority.sample(n=n_minority, random_state=42)
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df_train_balanced

In [None]:
len(df_train_balanced[df_train_balanced['target'] == 1]), len(df_train_balanced[df_train_balanced['target'] == 0])

### Drop columns that are not present in the test set (no need to run)

In [None]:
cols_drop = ['max_empl_employedtotal_800L', 'max_empl_industry_691L', 'max_relationshiptoclient_415T', 'max_relationshiptoclient_642T', 'max_remitter_829L']

In [None]:
# errors="ignore" makes the cell safe to re-run: columns that were already
# dropped no longer raise a KeyError.
df_train_balanced = df_train_balanced.drop(columns=cols_drop, errors="ignore")

In [None]:
len(df_train_balanced.columns)

In [None]:
# Remove the dropped columns from the list of categorical columns. The
# membership check keeps the cell safe to re-run: list.remove raises
# ValueError for a name that is no longer in the list.
print(len(cat_cols))
for ele in cols_drop:
  if ele in cat_cols:
    cat_cols.remove(ele)
print(len(cat_cols))

## Reduce Group Columns

In [None]:
def reduce_group(grps, df=None):
    """Pick one representative column from each group of column names.

    For every group the column with the largest number of distinct values is
    kept; on ties the earliest column in the group wins.

    Args:
        grps: iterable of non-empty lists of column names.
        df: DataFrame whose columns are inspected. Defaults to the notebook's
            global ``df_train`` so existing ``reduce_group(grps)`` calls keep
            working.

    Returns:
        list with one column name per group, in group order.
    """
    frame = df_train if df is None else df
    use = []
    for g in grps:
        best_count = 0
        best_col = g[0]  # fallback when no column has a distinct value
        for col in g:
            n_distinct = frame[col].nunique()
            if n_distinct > best_count:  # strict '>' keeps the first column on ties
                best_count = n_distinct
                best_col = col
        use.append(best_col)
    return use



In [None]:
def group_columns_by_correlation(matrix, threshold=0.8):
    """Partition the columns of ``matrix`` into groups of correlated columns.

    Columns are visited in order. The first unassigned column becomes the
    anchor of a new group and collects every later unassigned column whose
    correlation with the anchor is ``>= threshold`` (signed value, so strongly
    negative correlations do not join the group).

    Returns:
        list of groups, each a list of column names starting with its anchor.
    """
    corr = matrix.corr()
    pending = list(matrix.columns)
    groups = []
    while pending:
        anchor, candidates = pending[0], pending[1:]
        members = [anchor]
        leftover = []
        for name in candidates:
            if corr.loc[anchor, name] >= threshold:
                members.append(name)
            else:
                leftover.append(name)
        groups.append(members)
        pending = leftover

    return groups

In [None]:
# Rebuild the NaN-count groups here: the only other definition of `nans_groups`
# in this notebook is commented out, so this cell would otherwise fail with a
# NameError on a fresh "Restart & Run All".
# The categorical columns were ordinal-encoded earlier, so they can no longer be
# told apart by dtype; `cat_cols` is used to exclude them instead.
categorical = set(cat_cols)
numeric_cols = [col for col in df_train.columns if col not in categorical]
nan_counts = df_train[numeric_cols].isna().sum()

# Map each NaN count to the columns that share it, in column order.
nans_groups = {}
for col, n_missing in nan_counts.items():
    nans_groups.setdefault(n_missing, []).append(col)

# Within every NaN-count bucket, collapse correlated columns to one representative.
uses = []
for cols in nans_groups.values():
    if len(cols) > 1:
        grps = group_columns_by_correlation(df_train[cols], threshold=0.8)
        uses.extend(reduce_group(grps))
    else:
        uses.extend(cols)

# Subset the DataFrame to keep only the selected columns
df_train = df_train[uses]
df_train.columns

## Data Preparation for Test Set

In [None]:
# Earlier locations of the test parquet files, kept for reference:
# ROOT_hack = Path("/kaggle/input/home-credit-credit-risk-modeling")

# TEST_DIR = ROOT_hack / "parquet_files" / "test"
# TEST_DIR = "/content/parquet_files/test"
# Plain string (not a Path): the next cell builds file names with `TEST_DIR + "/..."`.
TEST_DIR = "/content/test_dataset/transformed"    # put test.parquet in this folder

In [None]:
# File names of the test tables, grouped by depth and listed in load order.
depth_0_files = [
    "/test_static_cb_0.parquet",
    "/test_static_0_0.parquet",
]
depth_1_files = [
    "/test_applprev_1_0.parquet",
    "/test_credit_bureau_a_1_0.parquet",
    "/test_credit_bureau_b_1.parquet",
    "/test_debitcard_1.parquet",
    "/test_deposit_1.parquet",
    "/test_other_1.parquet",
    "/test_person_1.parquet",
    "/test_tax_registry_a_1.parquet",
    "/test_tax_registry_b_1.parquet",
]
depth_2_files = [
    "/test_applprev_2.parquet",
    "/test_credit_bureau_a_2_0.parquet",
    "/test_credit_bureau_b_2.parquet",
    "/test_person_2.parquet",
]

# Same layout as the training `data_store`: base table plus one list per depth;
# depth 1 and depth 2 tables are read with their depth as second argument.
data_store = {
    "df_base": read_file(TEST_DIR + "/test.parquet"),
    "depth_0": [read_file(TEST_DIR + name) for name in depth_0_files],
    "depth_1": [read_file(TEST_DIR + name, 1) for name in depth_1_files],
    "depth_2": [read_file(TEST_DIR + name, 2) for name in depth_2_files],
}

In [None]:
# Same steps as for the training frame, applied to the test tables.
df_test = feature_eng(**data_store)
print(len(df_test.columns))
# Drop the raw tables and run the garbage collector before continuing.
del data_store
gc.collect()
# NOTE(review): filter_cols is re-applied to the test frame here, so the columns
# it keeps may differ from the ones kept for training - confirm this is intended.
df_test = df_test.pipe(Pipeline.filter_cols)
print(len(df_test.columns))
# The training `cat_cols` is passed in; the second return value is discarded.
df_test, _ = to_pandas(df_test, cat_cols)
print(len(df_test.columns))
df_test = reduce_mem_usage(df_test)
df_test

In [None]:
# Encode the test categoricals with the encoder that was fitted on the training
# data, so identical category values map to identical codes in both frames.
# Fitting a fresh encoder on the test set (as this cell did before) discarded
# the training mapping and never wrote any encoded values into df_test.
# NOTE(review): with OrdinalEncoder's default handle_unknown="error" this raises
# if df_test contains a category that was absent from training; in that case
# create the training encoder with handle_unknown="use_encoded_value" and an
# explicit unknown_value.
df_test[cat_cols] = encoder.transform(df_test[cat_cols])

In [None]:
# Column counts of the training and test frames.
df_train.shape[1], df_test.shape[1]

In [None]:
# Distribution of the target in the (unbalanced) training frame.
target_counts = df_train.get('target').value_counts()
target_counts

## Combining and Preparing Data for Modeling

In [None]:
# Labels for the balanced training set.
y = df_train_balanced["target"]

# Strip the label and the identifier/time columns, then shrink dtypes again.
non_feature_cols = ["target", "case_id", "WEEK_NUM"]
df_train_balanced = reduce_mem_usage(df_train_balanced.drop(columns=non_feature_cols))

# Persist features, labels and the test frame together.
joblib.dump((df_train_balanced, y, df_test), 'data.pkl')

In [None]:
# Number of feature columns in the balanced training frame.
df_train_balanced.shape[1]

In [None]:
# Number of column names shared by the balanced training frame and the test frame.
len(set(df_train_balanced.columns) & set(df_test.columns))

In [None]:
# Reload the artefacts written by the joblib.dump cell above. The dump uses the
# relative path 'data.pkl', so the load uses the same path instead of the
# absolute '/content/data.pkl'; the two only coincided when the working
# directory happened to be /content.
# NOTE: from here on `df_train` is the balanced training frame without the
# target/case_id/WEEK_NUM columns.
df_train, y, df_test = joblib.load('data.pkl')
df_train.shape, df_test.shape

In [None]:
# Keep the test case ids aside: `case_id` is not a training column, so it is
# removed from df_test a few cells below, but the submission frame needs it.
indexx = df_test['case_id']
indexx

In [None]:
# Sanity check: shape of the training label vector.
y.shape

In [None]:
# Columns that exist in the test frame but not in the training features.
miss_cols_test = list(set(df_test.columns).difference(set(df_train.columns)))
miss_cols_test

In [None]:
# Remove the test-only columns, then show how many columns remain.
df_test = df_test.drop(miss_cols_test, axis=1)
df_test.shape[1]

In [None]:
# Report whether both frames now have the same set of columns. The result is
# printed either way: previously a mismatch produced no output at all.
print(set(df_test.columns) == set(df_train.columns))

In [None]:
import pandas as pd

# Per-column dtypes of the training and test frames.
train_dtypes = df_train.dtypes
test_dtypes = df_test.dtypes

# Map every training column whose dtype differs in the test frame to the pair
# (train dtype, test dtype).
different_dtypes = {}
for col in df_train.columns:
    train_dtype, test_dtype = train_dtypes[col], test_dtypes[col]
    if train_dtype != test_dtype:
        different_dtypes[col] = (train_dtype, test_dtype)

In [None]:
# Names and number of the columns whose dtype differs between train and test.
list(different_dtypes), len(different_dtypes)

In [None]:
# Keep only the columns whose dtype matches in both frames; every column listed
# in `different_dtypes` is dropped from train and test alike.
mismatched_cols = list(different_dtypes)
train_sametype = df_train.drop(mismatched_cols, axis=1)
test_sametype = df_test.drop(mismatched_cols, axis=1)
train_sametype.shape[1], test_sametype.shape[1]

## Train Model

In [None]:
# Collects every fitted model that goes into the ensemble.
fitted_models_lgb = []

# LightGBM hyper-parameters.
params = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    max_depth=3,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=1000,
    verbose=-1,
)

# Fit on the dtype-aligned training features and register the model.
mo_1 = lgb.LGBMClassifier(**params)
mo_1.fit(train_sametype, y)
fitted_models_lgb.append(mo_1)

In [None]:
from catboost import CatBoostClassifier

# CatBoost hyper-parameters, chosen to mirror the LightGBM settings.
params = dict(
    iterations=1000,               # Equivalent to n_estimators
    depth=3,                       # Equivalent to max_depth
    learning_rate=0.05,
    l2_leaf_reg=3.0,               # Equivalent to lambda_l2 in LightGBM
    bootstrap_type="Bernoulli",    # Equivalent to bagging
    subsample=0.8,                 # Equivalent to bagging_fraction
    rsm=0.9,                       # Equivalent to feature_fraction
    verbose=0,
    eval_metric="AUC",             # Equivalent to metric
    random_seed=42,                # To ensure reproducibility
)

# Fit on the same features as the LightGBM model and add it to the ensemble list.
mo_2 = CatBoostClassifier(**params)
mo_2.fit(train_sametype, y)
fitted_models_lgb.append(mo_2)

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble wrapper that averages the outputs of the given estimators."""

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped estimators are not refitted."""
        return self

    def predict(self, X):
        """Return the element-wise mean of each estimator's ``predict`` output."""
        outputs = []
        for est in self.estimators:
            outputs.append(est.predict(X))
        return np.mean(outputs, axis=0)

    def predict_proba(self, X):
        """Return the element-wise mean of each estimator's ``predict_proba`` output."""
        probas = []
        for est in self.estimators:
            probas.append(est.predict_proba(X))
        return np.mean(probas, axis=0)


# Wrap the fitted models collected above into one averaging model.
model = VotingModel(fitted_models_lgb)
model

## Model Prediction

In [None]:
# Row-wise mean over the class-probability columns.
# NOTE(review): if each row of predict_proba sums to 1 this is 0.5 everywhere - confirm the cell is still needed.
model.predict_proba(test_sametype).mean(axis=1)

In [None]:
# Second column of the averaged class probabilities.
ensemble_proba = model.predict_proba(test_sametype)
ensemble_proba[:, 1]

In [None]:
# Submission score = predicted probability of the positive class (target == 1).
# The labels are 0/1, so predict_proba's columns are ordered (class 0, class 1)
# and the positive class is column 1. Column 0, used previously, is
# P(target == 0) and inverts the ranking of the cases.
y_pred = pd.Series(model.predict_proba(test_sametype)[:, 1], index=test_sametype.index)
y_pred

In [None]:
# Pair every test case id with its predicted score (aligned on the row index).
sub = pd.DataFrame(dict(case_id=indexx, target=y_pred))
sub

In [None]:
# Load the sample submission; the next cell replaces its `target` column with
# the model's predictions, matched on `case_id`.
df_subm = pd.read_csv("/content/sample_submission.csv")
df_subm

In [None]:
# Replace the placeholder target with the predictions. The left join keeps every
# row of the sample submission; ids without a prediction end up with NaN.
df_subm = df_subm.drop('target', axis=1)
merged_df = pd.merge(df_subm, sub, on="case_id", how="left")
merged_df

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
merged_df.to_csv("lgb_cb_0.csv", index=False)

In [None]:
# Rows whose predicted score is exactly 1.
merged_df.loc[merged_df['target'] == 1]