In [2]:
%load_ext lab_black

In [3]:
import os

# Move the working directory two levels up; the relative "resources/..." paths
# read further down are resolved against it.
# NOTE(review): presumably this lands on the project root (also needed for the
# `utils` import?) — confirm. The path is relative to the *current* directory,
# so running this cell a second time moves up two more levels.
os.chdir("../..")

In [4]:
import pandas as pd
from utils import constants

In [48]:
# Load the training features (X) and the training targets (y) from parquet.
X, y = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "resources/data/splitted/X_train.parquet",
        "resources/data/splitted/y_train.parquet",
    )
)

In [49]:
# Share of missing values per column, shown only for columns that have any.
na_flags = X.isna()
missing_columns_mask = na_flags.any(axis=0)
na_flags.loc[:, missing_columns_mask].mean()

emp_title                      0.054343
emp_length                     0.038570
mths_since_last_delinq         0.557768
mths_since_last_record         0.875112
revol_util                     0.000847
last_pymnt_d                   0.002140
last_credit_pull_d             0.000090
collections_12_mths_ex_med     0.000232
mths_since_last_major_derog    0.812575
tot_coll_amt                   0.251515
tot_cur_bal                    0.251515
total_rev_hi_lim               0.251515
dtype: float64

In [51]:
# Distinct target rows among the observations that have no last payment date.
no_payment_rows = X["last_pymnt_d"].isna()
y.loc[no_payment_rows].drop_duplicates()

Unnamed: 0,loan_status
7,Charged Off


Missing values in `last_pymnt_d` are possibly due to the borrower not paying the first installment. Let's fill them with January of the earliest year present in the dataset and add an indicator column `first_inst` (True when a payment date was recorded).

In [8]:
column = "last_pymnt_d"

# Record which rows had a value *before* the gaps are filled in.
has_value = X[column].notna()
X["first_inst"] = has_value

# Split every known value on "-" and take the smallest second part as the
# earliest year (assumes "Mon-YYYY" strings — TODO confirm). The filler is
# January of that year.
date_parts = X.loc[has_value, column].str.split("-", expand=True)
min_mon, min_yr = "Jan", date_parts[1].min()
min_date = f"{min_mon}-{min_yr}"

X[column] = X[column].fillna(min_date)
# Number of values still missing after the fill.
X[column].isna().sum()

0

`last_credit_pull_d` is handled analogously to the previous column.

In [9]:
column = "last_credit_pull_d"

# Flag the rows that have NO recorded credit pull. The mask is taken before the
# gaps are filled and is True exactly where the original value is missing, so
# the flag's polarity matches the column name `no_credit_pull`.
is_missing = X[column].isna()
X["no_credit_pull"] = is_missing

# Split every known value on "-" and take the smallest second part as the
# earliest year (values look like "Mon-YYYY"); the filler is January of that year.
splitted_col = X.loc[~is_missing, column].str.split("-", expand=True)
min_yr = splitted_col[1].min()
min_mon = "Jan"
min_date = f"{min_mon}-{min_yr}"

X[column] = X[column].fillna(min_date)
# Number of values still missing after the fill.
X[column].isna().sum()

0

In [10]:
# Columns listed in the project constants get the literal label "missing"
# wherever a value is absent.
missing_category_columns = constants.COLUMNS_TO_IMPUTE_MISSING_CATEGORY
for name in missing_category_columns:
    X[name] = X[name].fillna("missing")

In [11]:
# Columns listed in the project constants get 0 wherever a value is absent.
zero_fill_columns = constants.COLUMNS_TO_IMPUTE_0
for name in zero_fill_columns:
    X[name] = X[name].fillna(0)

In [12]:
# Replace the numeric column by its quartile bin; rows without a value get
# their own "no_delinq" category instead of staying NaN.
X["mths_since_last_delinq"] = (
    pd.qcut(
        X["mths_since_last_delinq"],
        q=[0, 0.25, 0.5, 0.75, 1],
        labels=["1_quant", "2_quant", "3_quant", "4_quant"],
    )
    .cat.add_categories(["no_delinq"])
    .fillna("no_delinq")
)

In [13]:
# Replace the numeric column by its quartile bin; rows without a value get
# their own "no_record" category instead of staying NaN.
X["mths_since_last_record"] = (
    pd.qcut(
        X["mths_since_last_record"],
        q=[0, 0.25, 0.5, 0.75, 1],
        labels=["1_quant", "2_quant", "3_quant", "4_quant"],
    )
    .cat.add_categories(["no_record"])
    .fillna("no_record")
)

In [14]:
# Replace the numeric column by its quartile bin; rows without a value get
# their own "no_major_derog" category instead of staying NaN.
X["mths_since_last_major_derog"] = (
    pd.qcut(
        X["mths_since_last_major_derog"],
        q=[0, 0.25, 0.5, 0.75, 1],
        labels=["1_quant", "2_quant", "3_quant", "4_quant"],
    )
    .cat.add_categories(["no_major_derog"])
    .fillna("no_major_derog")
)

In [15]:
# Fill the missing `revol_util` values with the column mean.
revol_util_mean = X["revol_util"].mean()
X["revol_util"] = X["revol_util"].fillna(revol_util_mean)

In [53]:
# Display the mean of `revol_util` (double brackets keep it a one-column frame,
# so the result is a labelled Series).
X[["revol_util"]].mean()

revol_util    54.288903
dtype: float64

In [16]:
# Re-check after the fills above: share of missing values per column, shown
# only for columns that still have any.
na_flags = X.isna()
missing_columns_mask = na_flags.any(axis=0)
na_flags.loc[:, missing_columns_mask].mean()

total_rev_hi_lim    0.251515
dtype: float64

In [17]:
# Convert every object-dtype column to pandas' categorical dtype, one by one.
object_columns = X.select_dtypes(include="object").columns
for name in object_columns:
    X[name] = X[name].astype("category")

In [18]:
import miceforest as mf

# Build a miceforest imputation kernel on the whole frame: 5 imputed datasets,
# fixed seed for reproducibility. According to the log printed by this cell,
# `total_rev_hi_lim` is the only variable that gets imputed.
# NOTE(review): the run emits a warning (truncated in the saved output) —
# check whether it concerns a deprecated argument of the installed version.
kernel = mf.ImputationKernel(data=X, datasets=5, random_state=42)
# Run the MICE iterations with progress logging.
kernel.mice(verbose=True)

  warn(


Initialized logger with name mice 1-2
Dataset 0
1  | total_rev_hi_lim
2  | total_rev_hi_lim
Dataset 1
1  | total_rev_hi_lim
2  | total_rev_hi_lim
Dataset 2
1  | total_rev_hi_lim
2  | total_rev_hi_lim
Dataset 3
1  | total_rev_hi_lim
2  | total_rev_hi_lim
Dataset 4
1  | total_rev_hi_lim
2  | total_rev_hi_lim


In [19]:
# Apply the fitted kernel to X and confirm that no missing value is left in
# any column.
X_imputed = kernel.transform(X)
X_imputed.isna().any().any()

False

In [20]:
# Small slice for trying the kernel on a handful of rows. `.loc` slices by
# label and includes both endpoints, i.e. the rows labelled 0 through 2.
sample = X.loc[0:2]

In [21]:
# `kernel.transform` raises ValueError when the frame it is given has nothing
# to impute, which is what happens for this sample. Catch the error and print
# it so the finding stays visible without aborting a Restart & Run All.
try:
    sample_imputed = kernel.transform(sample)
except ValueError as err:
    sample_imputed = None
    print(f"{type(err).__name__}: {err}")
# Shows the imputed sample when the call succeeds; nothing when it failed.
sample_imputed

ValueError: No missing values to impute.

In [None]:
# NOTE(review): this cell has no execution count and repeats the quartile
# binning of `mths_since_last_major_derog` done in an earlier cell. It
# presumably only works on the raw numeric column (i.e. right after X has been
# re-loaded) — confirm, or remove the cell.
X["mths_since_last_major_derog"] = pd.qcut(
    X["mths_since_last_major_derog"],
    [0, 0.25, 0.5, 0.75, 1],
    labels=["1_quant", "2_quant", "3_quant", "4_quant"],
)

In [42]:
# Quartile edges (min, Q1, median, Q3, max) of the column, computed in a single
# vectorised call and kept as a plain list of NumPy floats.
# NOTE(review): needs the raw numeric column, i.e. X before the binning cells.
quantile_levels = [0, 0.25, 0.5, 0.75, 1]
quantiles = list(
    X["mths_since_last_major_derog"].quantile(quantile_levels).to_numpy()
)

In [43]:
# Display the computed bin edges.
quantiles

[0.0, 26.0, 43.0, 60.0, 159.0]

In [40]:
# Bin with explicit, precomputed edges. `include_lowest=True` closes the first
# interval on the left: without it a value equal to the lowest edge falls
# outside every bin, turns into NaN and would then be labelled "missing"
# together with the values that are really absent.
t = pd.cut(
    X["mths_since_last_major_derog"],
    quantiles,
    labels=["1_quant", "2_quant", "3_quant", "4_quant"],
    include_lowest=True,
)
# "missing" has to exist as a category before it can be used as a fill value.
t = t.cat.add_categories(["missing"])
t.fillna("missing")

0         missing
1         missing
2         missing
3         missing
4         missing
           ...   
177074    missing
177075    missing
177076    missing
177077    missing
177078    missing
Name: mths_since_last_major_derog, Length: 177079, dtype: category
Categories (5, object): ['1_quant' < '2_quant' < '3_quant' < '4_quant' < 'missing']

In [54]:
# Load the training and validation feature frames from parquet.
X_train, X_valid = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "resources/data/splitted/X_train.parquet",
        "resources/data/splitted/X_valid.parquet",
    )
)

In [58]:
# `last_credit_pull_d` values that occur in the training split but not in the
# validation split.
train_pull_dates = set(X_train["last_credit_pull_d"].drop_duplicates().values)
valid_pull_dates = set(X_valid["last_credit_pull_d"].drop_duplicates().values)
train_pull_dates.difference(valid_pull_dates)

{'Dec-2008',
 'Feb-2008',
 'Jan-2008',
 'Jun-2008',
 'Mar-2008',
 'May-2007',
 'May-2008',
 'Oct-2007',
 'Oct-2008',
 'Sep-2007',
 'Sep-2008'}

In [59]:
# `last_credit_pull_d` values that occur in the validation split but not in the
# training split.
valid_pull_dates = set(X_valid["last_credit_pull_d"].drop_duplicates().values)
train_pull_dates = set(X_train["last_credit_pull_d"].drop_duplicates().values)
valid_pull_dates.difference(train_pull_dates)

{'Jul-2007', 'Jul-2008'}