In [2]:
import pandas as pd
import polars as pl
pl.Config.set_tbl_cols(None)
pl.Config.set_tbl_rows(None)
pl.enable_string_cache()
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from sklearn.base import BaseEstimator, RegressorMixin
from collections import Counter
import gc


dataPath = "C:/Users/Marcel/Documents/Python/kaggle/CreditRiskModel/data/csv_files/"

In [100]:
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the text columns of ``df`` into ordered categoricals with an "Unknown" level.

    Every column whose dtype name is ``object`` or ``string`` is cast to the
    pandas ``string`` dtype and then to an ordered categorical. Its categories
    are the observed values plus the extra label ``"Unknown"``. All other
    columns are left untouched.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    for col in df.columns:
        if df[col].dtype.name not in ("object", "string"):
            continue
        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        # The data may already contain the literal value "Unknown"; appending it
        # again would create duplicate categories, which CategoricalDtype rejects.
        if "Unknown" not in categories:
            categories.append("Unknown")
        df[col] = as_category.astype(pd.CategoricalDtype(categories=categories, ordered=True))
    return df



class Preprocess:
    """Stateless helpers that clean one raw table held in a polars DataFrame.

    All helpers are static and are called through the class, e.g.
    ``Preprocess.set_table_dtypes(df)``.
    """

    # Bookkeeping columns: always cast to Int64 and never dropped by the
    # null-ratio / cardinality filters.
    _KEY_COLUMNS = ("case_id", "WEEK_NUM", "num_group1", "num_group2")

    @staticmethod
    def _numeric_readed_as_string(df: pl.DataFrame, col: str) -> pl.DataFrame:
        """Rewrite a text column so that a later numeric cast can succeed.

        Handles the two shapes named at the call site: quoted numbers such as
        ``'"2.0"'`` and boolean words such as ``"False"``. Double quotes are
        removed, and the words true/false (any letter case) become ``"1"`` and
        ``"0"``. Other values and nulls pass through unchanged. The column
        stays a string column; the caller performs the numeric cast.

        Args:
            df: Frame containing ``col`` as a string column.
            col: Name of the column to clean.

        Returns:
            A new frame with ``col`` replaced by the cleaned strings.
        """
        unquoted = pl.col(col).str.replace_all('"', "", literal=True)
        lowered = unquoted.str.to_lowercase()
        return df.with_columns(
            pl.when(lowered == "true")
            .then(pl.lit("1"))
            .when(lowered == "false")
            .then(pl.lit("0"))
            .otherwise(unquoted)
            .alias(col)
        )

    @staticmethod
    def set_table_dtypes(df: pl.DataFrame) -> tuple:
        """Cast columns according to their name and report which group each fell into.

        Rules, checked in this order:

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * names ending in ``P`` or ``A`` -> Float64 (numeric)
        * names ending in ``T`` -> Int64 (time-due); Utf8 (string) if the cast fails
        * names ending in ``L`` -> Float64 (numeric); Utf8 (string) if the cast fails
        * names ending in ``D`` and ``date_decision`` -> Date (date)
        * names ending in ``M`` -> Utf8 (string)

        Columns matching none of the rules are left as they are.

        Args:
            df: Raw table.

        Returns:
            ``(df, [numeric_col, timedue_col, string_col, date_col])`` where the
            four lists hold the names of the columns assigned to each group.
        """
        numeric_col = []
        timedue_col = []
        string_col = []
        date_col = []
        for col in df.columns:
            suffix = col[-1:]
            # Key columns must be tested first: "WEEK_NUM" ends in "M" and would
            # otherwise be cast to a string by the suffix rules below.
            if col in Preprocess._KEY_COLUMNS:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif suffix in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
                numeric_col.append(col)
            elif suffix == "T":
                try:
                    df = df.with_columns(pl.col(col).cast(pl.Int64))
                    timedue_col.append(col)
                except Exception:
                    # Not integer-like: keep the column as text instead.
                    df = df.with_columns(pl.col(col).cast(pl.Utf8))
                    string_col.append(col)
            elif suffix == "L":
                try:
                    candidate = df
                    if df.get_column(col).dtype == pl.Utf8:
                        # for instance "False", or '"2.0"'
                        candidate = Preprocess._numeric_readed_as_string(df, col)
                    # Only commit the cleaned column when the numeric cast works,
                    # so the text fallback below sees the original values.
                    df = candidate.with_columns(pl.col(col).cast(pl.Float64))
                    numeric_col.append(col)
                except Exception:
                    df = df.with_columns(pl.col(col).cast(pl.Utf8))
                    string_col.append(col)
            elif suffix == "D" or col == "date_decision":
                df = df.with_columns(pl.col(col).cast(pl.Date))
                date_col.append(col)
            elif suffix == "M":
                df = df.with_columns(pl.col(col).cast(pl.Utf8))
                string_col.append(col)
        return df, [numeric_col, timedue_col, string_col, date_col]

    @staticmethod
    def offset_dates_by_decision_date(df: pl.DataFrame) -> pl.DataFrame:
        """Replace every ``*D`` column by its distance in days from ``date_decision``.

        Each column whose name ends in ``D`` becomes the Int64 number of days
        between that date and ``date_decision``. Afterwards ``date_decision``
        and ``MONTH`` are dropped when present.

        Args:
            df: Frame holding ``date_decision`` and the ``*D`` columns as dates.

        Returns:
            A new frame with day offsets instead of dates.
        """
        day_offsets = []
        for name in df.columns:
            if name[-1:] != "D":
                continue
            delta = pl.col(name) - pl.col("date_decision")
            # Newer polars releases name this accessor total_days(); fall back to
            # days() where only the older name exists.
            if hasattr(delta.dt, "total_days"):
                days = delta.dt.total_days()
            else:
                days = delta.dt.days()
            day_offsets.append(days.cast(pl.Int64).alias(name))
        df = df.with_columns(day_offsets)
        return df.drop([name for name in ("date_decision", "MONTH") if name in df.columns])

    @staticmethod
    def remove_columns_over_null_limit(df: pl.DataFrame, null_limit: float = 0.7) -> tuple:
        """Drop columns whose share of nulls is strictly greater than ``null_limit``.

        The key columns (``case_id``, ``WEEK_NUM``, ``num_group1``,
        ``num_group2``) are never dropped.

        Args:
            df: Frame to filter.
            null_limit: Maximum tolerated null fraction (default 0.7).

        Returns:
            ``(filtered_df, drop_list)`` where ``drop_list`` names the removed columns.
        """
        if df.height == 0:
            # No rows: a null ratio is undefined, so nothing is dropped.
            return df, []
        drop_list = [
            col.name
            for col in df.get_columns()
            if col.name not in Preprocess._KEY_COLUMNS
            and col.null_count() / df.height > null_limit
        ]
        return df.drop(drop_list), drop_list

    @staticmethod
    def remove_columns_over_max_cardinality(df: pl.DataFrame, max_cardinality: int = 200) -> tuple:
        """Drop string columns with more than ``max_cardinality`` distinct values.

        Only Utf8 columns are considered, and the key columns are never dropped.

        Args:
            df: Frame to filter.
            max_cardinality: Largest number of distinct values kept (default 200).

        Returns:
            ``(filtered_df, drop_list)`` where ``drop_list`` names the removed columns.
        """
        drop_list = [
            col.name
            for col in df.get_columns()
            if col.name not in Preprocess._KEY_COLUMNS
            and col.dtype == pl.Utf8
            and col.n_unique() > max_cardinality
        ]
        return df.drop(drop_list), drop_list



In [72]:
# Sample column mixing a quoted number, plain numbers and a null.
df = pl.DataFrame({"binary": ['"2.0"', '1.0', None, "3.0", '4.0']})

In [None]:
# Show the frame, cast the "binary" column to Float64, then show it again.
# NOTE(review): the sample built in the cell above contains the quoted value '"2.0"';
# presumably this cell probes how the cast treats it — confirm.
print(df)
df = df.with_columns(pl.col("binary").cast(pl.Float64))
print(df)

In [None]:
# Experiment: turn boolean words stored as text into 0.0 / 1.0.
s = pl.Series(["False", None,"False","True", None])
# One flag per probe word: does "false" / "true" occur in the lower-cased series?
_missed_boolen = pl.Series(["false", 'true']).is_in(s.str.to_lowercase())
if _missed_boolen.any():
    # str.replace needs string replacement values, so the words are mapped to
    # "0" / "1" before the numeric cast.
    s = (
        s.str.to_lowercase()
        .str.replace("false", "0", literal=True)
        .str.replace("true", "1", literal=True)
        .cast(pl.Float64)
    )

    print(s)


In [78]:
class Agregator:
    """Groups a table by ``case_id`` and reduces every column group to one value per case."""

    def aggreget(df: pl.DataFrame, column_types) -> pl.DataFrame:
        """Aggregate ``df`` to one row per ``case_id``.

        Args:
            df: Table containing ``case_id`` and the columns named in ``column_types``.
            column_types: Four lists of column names in the order
                numeric, time-due, string, date.

        Returns:
            A frame with numeric columns summed, time-due and date columns
            averaged, and string columns reduced to the first entry of their
            per-group mode.
        """
        numeric_col, timedue_col, string_col, date_col = column_types

        per_case = df.group_by('case_id').agg(
            pl.col(numeric_col).sum(),
            pl.col(timedue_col).mean(),
            pl.col(string_col).mode(),
            pl.col(date_col).mean(),
        )
        # mode() yields a list per group; keep only its first element.
        for name in string_col:
            per_case = per_case.with_columns(pl.col(name).list.first().alias(name))

        return per_case

In [10]:
class Loader:
    """Reads the competition CSV files, merging the chunks that belong to one table.

    File names follow ``<split>_<table>_<depth>.csv`` or, for tables split into
    chunks, ``<split>_<table>_<depth>_<chunk>.csv``. Files are read from
    ``dataPath + <split> + "/"``.
    """

    @staticmethod
    def _group_key(filename: str) -> str:
        """Return the table name of ``filename`` without extension and chunk index."""
        stem = filename[:-4] if filename.endswith(".csv") else filename
        parts = stem.rsplit("_", 2)
        # "<table>_<depth>_<chunk>": both trailing parts are numeric. Comparing
        # whole parts (not a fixed number of characters) keeps chunk 10, 11, ...
        # in the same group as chunks 0-9.
        if len(parts) == 3 and parts[1].isdigit() and parts[2].isdigit():
            return parts[0] + "_" + parts[1]
        return stem

    @staticmethod
    def _count_files_for_concat(all_files: list) -> dict:
        """Count how many files of ``all_files`` belong to each table.

        Returns:
            A ``Counter`` mapping table name (see ``_group_key``) to its file count.
        """
        return Counter(Loader._group_key(filename) for filename in all_files)

    @staticmethod
    def load_data_by_depth(paths_list: list, depth: int) -> list:
        """Load, type, aggregate and prune every table named in ``paths_list``.

        Chunks of the same table are concatenated. For ``depth > 0`` each chunk
        is first aggregated to one row per ``case_id``; depth-0 chunks are
        concatenated as they are.

        Args:
            paths_list: CSV file names of one depth level.
            depth: Depth level of these tables.

        Returns:
            One frame per table, in order of first appearance in ``paths_list``.
        """
        # Group by table name rather than by position, so chunks of one table
        # do not have to be adjacent in the list.
        grouped = {}
        for file_name in paths_list:
            grouped.setdefault(Loader._group_key(file_name), []).append(file_name)

        files_list = []
        for chunk_names in grouped.values():
            chunks = []
            for file_name in chunk_names:
                # The leading token of the file name ("train"/"test") is its folder.
                file_path = dataPath + file_name.split('_', 2)[0] + '/' + file_name
                table, column_types = Preprocess.set_table_dtypes(pl.read_csv(file_path))
                if depth > 0:
                    table = Agregator.aggreget(table, column_types)
                chunks.append(table)
            table = pl.concat(chunks, how="vertical_relaxed")
            table, _ = Preprocess.remove_columns_over_null_limit(table)
            table, _ = Preprocess.remove_columns_over_max_cardinality(table)
            files_list.append(table)
            # Release the per-chunk frames before reading the next table.
            gc.collect()
        return files_list

    @staticmethod
    def load_files(paths_dictionary: dict) -> list:
        """Load every depth level of ``paths_dictionary``.

        Args:
            paths_dictionary: Maps depth to the list of CSV file names at that depth.

        Returns:
            One list of frames per depth, in the dictionary's iteration order.
        """
        return [
            Loader.load_data_by_depth(paths, depth)
            for depth, paths in paths_dictionary.items()
        ]

    @staticmethod
    def join_tables():
        """Placeholder for joining the loaded tables (not implemented yet)."""
        # TODO: implement the join of the per-depth tables.
        raise NotImplementedError("Loader.join_tables is not implemented yet")

In [105]:
# CSV file names of the training split, keyed by table depth (0, 1, 2).
data_path_train = {
    0: [
        "train_static_0_0.csv",
        "train_static_0_1.csv",
        "train_static_cb_0.csv",
    ],
    1: [
        "train_applprev_1_0.csv",
        "train_applprev_1_1.csv",
        "train_other_1.csv",
        "train_tax_registry_a_1.csv",
        "train_tax_registry_b_1.csv",
        "train_tax_registry_c_1.csv",
        "train_credit_bureau_a_1_0.csv",
        "train_credit_bureau_a_1_1.csv",
        "train_credit_bureau_a_1_2.csv",
        "train_credit_bureau_a_1_3.csv",
        "train_credit_bureau_b_1.csv",
        "train_deposit_1.csv",
        "train_person_1.csv",
        "train_debitcard_1.csv",
    ],
    2: [
        "train_applprev_2.csv",
        "train_person_2.csv",
        "train_credit_bureau_a_2_0.csv",
        "train_credit_bureau_a_2_1.csv",
        "train_credit_bureau_a_2_2.csv",
        "train_credit_bureau_a_2_3.csv",
        "train_credit_bureau_a_2_4.csv",
        "train_credit_bureau_a_2_5.csv",
        "train_credit_bureau_a_2_6.csv",
        "train_credit_bureau_a_2_7.csv",
        "train_credit_bureau_a_2_8.csv",
        "train_credit_bureau_a_2_9.csv",
        "train_credit_bureau_a_2_10.csv",
        "train_credit_bureau_b_2.csv",
    ],
}

# CSV file names of the test split, keyed by table depth (0, 1, 2).
data_path_test = {
    0: [
        "test_static_0_0.csv",
        "test_static_0_1.csv",
        "test_static_0_2.csv",
        "test_static_cb_0.csv",
    ],
    1: [
        "test_applprev_1_0.csv",
        "test_applprev_1_1.csv",
        "test_applprev_1_2.csv",
        "test_other_1.csv",
        "test_tax_registry_a_1.csv",
        "test_tax_registry_b_1.csv",
        "test_tax_registry_c_1.csv",
        "test_credit_bureau_a_1_0.csv",
        "test_credit_bureau_a_1_1.csv",
        "test_credit_bureau_a_1_2.csv",
        "test_credit_bureau_a_1_3.csv",
        "test_credit_bureau_a_1_4.csv",
        "test_credit_bureau_b_1.csv",
        "test_deposit_1.csv",
        "test_person_1.csv",
        "test_debitcard_1.csv",
    ],
    2: [
        "test_applprev_2.csv",
        "test_person_2.csv",
        "test_credit_bureau_a_2_0.csv",
        "test_credit_bureau_a_2_1.csv",
        "test_credit_bureau_a_2_2.csv",
        "test_credit_bureau_a_2_3.csv",
        "test_credit_bureau_a_2_4.csv",
        "test_credit_bureau_a_2_5.csv",
        "test_credit_bureau_a_2_6.csv",
        "test_credit_bureau_a_2_7.csv",
        "test_credit_bureau_a_2_8.csv",
        "test_credit_bureau_a_2_9.csv",
        "test_credit_bureau_a_2_10.csv",
        "test_credit_bureau_a_2_11.csv",
        "test_credit_bureau_b_2.csv",
    ],
}


In [106]:
data_path_test[0]

['test_static_0_0.csv',
 'test_static_0_1.csv',
 'test_static_0_2.csv',
 'test_static_cb_0.csv']

In [None]:
# file = pl.read_csv(dataPath + 'train/' +  "train_applprev_1_0.csv")
        # print(file.estimated_size() /1024**2)
        # file, column_types = Preprocess.set_table_dtypes(file)
        # file = Agregator.aggreget(file, column_types)
        # file,_ = Preprocess.remove_columns_over_null_limit(file)
        # file,_ = Preprocess.remove_columns_over_max_cardinality(file)
        # print(file.estimated_size() /1024**2)
    
        

In [None]:
# from pympler import tracker
# from pympler.asizeof import asizeof

# a = pl.read_csv(dataPath + 'train/' +  "train_credit_bureau_a_2_2.csv")
# asizeof.asizeof(a)
# print(asizeof.asized(a, detail=1).format())

In [108]:
# Estimated size of `file` divided by 1024**2 (MiB, assuming estimated_size() reports bytes).
# NOTE(review): in the visible cells `file` is only assigned inside Loader.load_data_by_depth
# and in a commented-out cell, so this relies on leftover kernel state — confirm.
file.estimated_size() /1024**2

220.95586681365967

In [109]:

# NOTE(review): the only visible import of `asizeof` (from pympler) is commented out in an
# earlier cell, so this call depends on leftover kernel state — confirm before re-running.
asizeof(file)

384

In [80]:
# NOTE(review): the saved output is a tuple whose first element is a frame, so `file` was
# presumably a LazyFrame when this ran; no visible cell assigns it at notebook level — confirm.
file.profile()

(shape: (1_003_757, 168)
 ┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
 │ case_id ┆ actualdpdt ┆ amtinstpa ┆ annuity_7 ┆ … ┆ lastrepay ┆ maxdpdins ┆ payvacati ┆ validfrom │
 │ ---     ┆ olerance_3 ┆ idbefduel ┆ 80A       ┆   ┆ ingdate_6 ┆ tldate_35 ┆ onpostpon ┆ _1069D    │
 │ i64     ┆ 44P        ┆ 24m_41871 ┆ ---       ┆   ┆ 96D       ┆ 46855D    ┆ e_4187118 ┆ ---       │
 │         ┆ ---        ┆ 15A       ┆ f64       ┆   ┆ ---       ┆ ---       ┆ D         ┆ date      │
 │         ┆ f64        ┆ ---       ┆           ┆   ┆ date      ┆ date      ┆ ---       ┆           │
 │         ┆            ┆ f64       ┆           ┆   ┆           ┆           ┆ date      ┆           │
 ╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
 │ 1482120 ┆ 0.0        ┆ 223.892   ┆ 4818.4    ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
 │ 1510688 ┆ 0.0        ┆ 29358.375 ┆ 3164.4001 ┆ … ┆ nul

In [66]:
# Small lazy frame with one list-typed column, used to try out list.first().
df = pl.LazyFrame({"list": [[1,1],[2,2],[3,4]]})

In [76]:
# Add a column holding the first element of each list.
df = df.with_columns(first_element=pl.col("list").list.first())

In [77]:
df.collect()

list,first_element
list[i64],i64
"[1, 1]",1
"[2, 2]",2
"[3, 4]",3


In [None]:
# Load every training table listed in data_path_train: one list of frames per depth level.
files = Loader.load_files(data_path_train)


0

In [21]:
# Preprocess.set_table_dtypes returns (frame, column-type lists); only the typed
# frame is needed here, hence the [0].
# dataPath already ends in "csv_files/", so the split folder is appended directly.
train_basetable = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_base.csv"))[0]
train_static = pl.concat(
    [
        Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_static_0_0.csv"))[0],
        Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_static_0_1.csv"))[0],
    ],
    how="vertical_relaxed",
)
train_static_cb = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_static_cb_0.csv"))[0]
train_person_1 = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_person_1.csv"))[0]
train_credit_bureau_b_2 = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "train/train_credit_bureau_b_2.csv"))[0]

In [15]:
train_static.get_column('datefirstoffer_1144D').dtype

Date

In [22]:
# Preprocess.set_table_dtypes returns (frame, column-type lists); only the typed
# frame is needed here, hence the [0].
# dataPath already ends in "csv_files/", so the split folder is appended directly.
test_basetable = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_base.csv"))[0]
test_static = pl.concat(
    [
        Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_static_0_0.csv"))[0],
        Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_static_0_1.csv"))[0],
        Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_static_0_2.csv"))[0],
    ],
    how="vertical_relaxed",
)
test_static_cb = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_static_cb_0.csv"))[0]
test_person_1 = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_person_1.csv"))[0]
test_credit_bureau_b_2 = Preprocess.set_table_dtypes(pl.read_csv(dataPath + "test/test_credit_bureau_b_2.csv"))[0]

## Feature engineering

In this part, we show a simple example of joining tables via `case_id`. The loading and joining are done with the Polars library, which is very fast and has a much smaller memory footprint than pandas.

In [40]:
# # We need to use aggregation functions in tables with depth > 0, i.e. tables that contain the num_group1 column
# # (and possibly also the num_group2 column).
# train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
#     pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
#     (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
# )

# # Here num_group1=0 has special meaning, it is the person who applied for the loan.
# train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
#     pl.col("num_group1") == 0
# ).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# # Here we have num_group1 and num_group2, so we need to aggregate again.
# train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
#     pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
#     (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
# )

# # We will process in this examples only A-type and M-type columns, so we need to select them.
# selected_static_cols = []
# for col in train_static.columns:
#     if col[-1] in ("A", "M"):
#         selected_static_cols.append(col)
# print(selected_static_cols)

# selected_static_cb_cols = []
# for col in train_static_cb.columns:
#     if col[-1] in ("A", "M"):
#         selected_static_cb_cols.append(col)
# print(selected_static_cb_cols)

# Left-join the static table onto the base table by case_id.
data = train_basetable.join(train_static, on="case_id", how="left")
# ).join(
#     train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
# ).join(
#     train_person_1_feats_1, how="left", on="case_id"
# ).join(
#     train_person_1_feats_2, how="left", on="case_id"
# ).join(
#     train_credit_bureau_b_2_feats, how="left", on="case_id"
# )

In [41]:
# test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
#     pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
#     (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
# )

# test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
#     pl.col("num_group1") == 0
# ).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
#     pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
#     (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
# )

# Left-join the static table onto the test base table by case_id.
data_submission = test_basetable.join(test_static, on="case_id", how="left")
# ).join(
#     test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
# ).join(
#     test_person_1_feats_1, how="left", on="case_id"
# ).join(
#     test_person_1_feats_2, how="left", on="case_id"
# ).join(
#     test_credit_bureau_b_2_feats, how="left", on="case_id"
# )

In [44]:
# Replace every *D date column by its offset in days from date_decision
# (the helper also drops date_decision and MONTH).
# NOTE: this cell rebinds `data` and `data_submission`, so it is not idempotent;
# re-run the join cells before running it a second time.
data = Preprocess.offset_dates_by_decision_date(data)
# Drop sparse and high-cardinality columns, remembering which ones were removed.
data, drop_list_null = Preprocess.remove_columns_over_null_limit(data)
data, drop_list_cardinality = Preprocess.remove_columns_over_max_cardinality(data)
data_submission = Preprocess.offset_dates_by_decision_date(data_submission)
# Remove from the submission frame exactly the columns removed from the training frame.
data_submission = data_submission.drop(drop_list_null + drop_list_cardinality)

In [46]:
# Split the unique case ids 60 / 20 / 20 into train / valid / test.
case_ids = data["case_id"].unique().shuffle(seed=1).to_frame()
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# Predictor columns: lower-case name followed by a single upper-case type letter.
cols_pred = [col for col in data.columns if col[-1].isupper() and col[:-1].islower()]

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.Series) -> tuple:
    """Select the rows of ``data`` for ``case_ids`` and convert them to pandas.

    Args:
        case_ids: Series of case ids to keep.

    Returns:
        ``(base, X, y)``: the ``case_id`` / ``WEEK_NUM`` / ``target`` frame, the
        predictor frame (``cols_pred``) and the ``target`` series.
    """
    # Filter once and reuse the result for all three outputs.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train.get_column('case_id'))
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid.get_column('case_id'))
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test.get_column('case_id'))

# convert_strings modifies each frame in place, so the return value is not needed.
for features in (X_train, X_valid, X_test):
    convert_strings(features)

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastunpaid_3546854D', 'd

In [47]:
# Report the shape of each predictor frame.
for split_label, features in (("Train", X_train), ("Valid", X_valid), ("Test", X_test)):
    print(f"{split_label}: {features.shape}")

Train: (915995, 146)
Valid: (305332, 146)
Test: (305332, 146)


146

## Training LightGBM

A minimal example of LightGBM training is shown below.

In [60]:
# Wrap the train / valid frames for LightGBM; the validation set reuses the training set as reference.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    max_depth=3,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=1000,
    verbose=-1,
)

# Log the validation metric every 50 rounds; stop after 10 rounds without improvement.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)],
)



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.74842
[100]	valid_0's auc: 0.76209
[150]	valid_0's auc: 0.768235
[200]	valid_0's auc: 0.771855
[250]	valid_0's auc: 0.774768
[300]	valid_0's auc: 0.776959
[350]	valid_0's auc: 0.778861
[400]	valid_0's auc: 0.78061
[450]	valid_0's auc: 0.781888
[500]	valid_0's auc: 0.783015
[550]	valid_0's auc: 0.783849
[600]	valid_0's auc: 0.784704
[650]	valid_0's auc: 0.785314
[700]	valid_0's auc: 0.785955
Early stopping, best iteration is:
[707]	valid_0's auc: 0.78605


In [71]:
# class VotingClassifierLGB(RegressorMixin, BaseEstimator):
#     def __init__(self,number):
#         super().__init__()
#         self.estimators = []
#         self.number = number
#     def fit(self, X, y=None):
#         lgb_train = lgb.Dataset(X_train, label=y_train)
#         lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

#         params = {
#             "boosting_type": "gbdt",
#             "objective": "binary",
#             "metric": "auc",
#             "max_depth": 3,
#             "num_leaves": 31,
#             "learning_rate": 0.05,
#             "feature_fraction": 0.9,
#             "bagging_fraction": 0.8,
#             "bagging_freq": 5,
#             "n_estimators": 1000,
#             "verbose": -1,
#         }
#         for n in range(self.number):
#             self.estimators[n] = lgb.train(
#                 params,
#                 lgb_train,
#                 valid_sets=lgb_valid,
#                 callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
#             )
#         return self
    
#     def predict(self, X):
#         y_preds = [estimator.predict(X,num_iteration = estimator.best_iteration) for estimator in self.estimators]
#         return np.mean(y_preds, axis=0)

Evaluation with AUC and then comparison with the stability metric is shown below.

In [74]:
# Score each split with the best iteration and store the prediction next to the target.
for base, X in zip((base_train, base_valid, base_test), (X_train, X_valid, X_test)):
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

for split_name, base in (("train", base_train), ("valid", base_valid), ("test", base_test)):
    print(f'The AUC score on the {split_name} set is: {roc_auc_score(base["target"], base["score"])}')

The AUC score on the train set is: 0.7986145291780247
The AUC score on the valid set is: 0.7860497183581987
The AUC score on the test set is: 0.78244892986483


In [50]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute the gini stability score of ``base`` over ``WEEK_NUM``.

    A gini value (``2 * AUC - 1``) is computed per week, a straight line is
    fitted through the weekly values, and the score combines the mean gini, a
    penalty on a negative slope and a penalty on the residual spread.

    Args:
        base: pandas frame with ``WEEK_NUM``, ``target`` and ``score`` columns.
        w_fallingrate: Weight applied to the slope when the slope is negative.
        w_resstd: Weight applied to the standard deviation of the residuals.

    Returns:
        ``(stability_score, slope, mean_gini, residual_std)``.
    """
    weekly_groups = (
        base.loc[:, ["WEEK_NUM", "target", "score"]]
        .sort_values("WEEK_NUM")
        .groupby("WEEK_NUM")[["target", "score"]]
    )
    gini_in_time = weekly_groups.apply(
        lambda week: 2*roc_auc_score(week["target"], week["score"])-1
    ).tolist()

    week_index = np.arange(len(gini_in_time))
    # Linear trend of the weekly gini values.
    a, b = np.polyfit(week_index, gini_in_time, 1)
    fitted = a*week_index + b
    res_std = np.std(gini_in_time - fitted)
    avg_gini = np.mean(gini_in_time)
    # Only a falling trend (negative slope) is penalised.
    stability = avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std
    return stability, a, avg_gini, res_std

# Stability score (with its components) for each split, reported right after it is computed.
stability_score_train, a_train, avg_gini_train, res_std_train = gini_stability(base_train)
print(f'The stability score on the train set is: {stability_score_train} a = {a_train} avg_gini = {avg_gini_train} res_std = {res_std_train}')

stability_score_valid, a_valid, avg_gini_valid, res_std_valid = gini_stability(base_valid)
print(f'The stability score on the valid set is: {stability_score_valid} a = {a_valid} avg_gini = {avg_gini_valid} res_std = {res_std_valid}')

stability_score_test, a_test, avg_gini_test, res_std_test = gini_stability(base_test)
print(f'The stability score on the test set is: {stability_score_test} a = {a_test} avg_gini = {avg_gini_test} res_std = {res_std_test}')

The stability score on the train set is: 0.5799976116248803 a = 0.0010694011873831138 avg_gini = 0.6016184213253379 res_std = 0.0432416194009153
The stability score on the valid set is: 0.5457655831137467 a = 0.001002324457445275 avg_gini = 0.5766775590318239 res_std = 0.061823951836154516
The stability score on the test set is: 0.5375198056551915 a = 0.0011045164417052156 avg_gini = 0.5702647563547265 res_std = 0.06548990139907004


## Submission

The submission dataset is scored below. We need to take care of categories that did not appear in the training data; as a last step, we save the scores.

In [51]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    # Reuse the training dtype as-is: building an ordered CategoricalDtype from a
    # set would give an arbitrary category order, and X_train itself needs no recast.
    train_dtype = X_train[col].dtype
    unseen_categories = set(X_submission[col].cat.categories) - set(train_dtype.categories)
    # Values never seen in training are mapped to the "Unknown" level added by convert_strings.
    X_submission.loc[X_submission[col].isin(unseen_categories), col] = "Unknown"
    X_submission[col] = X_submission[col].astype(train_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [52]:
# One score per case_id, with case_id as the index, written to the submission file.
submission = pd.DataFrame(
    {"score": y_submission_pred},
    index=pd.Index(data_submission["case_id"].to_numpy(), name="case_id"),
)
submission.to_csv("./submission.csv")

Best of luck, and most importantly, enjoy the process of learning and discovery! 

<img src="https://i.imgur.com/obVWIBh.png" alt="Image" width="700"/>