In [1]:
# Import Data handling/display libraries
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from typing import List, Union, Dict
# The warnings module is used to silence library warnings for tidier output.
# NOTE(review): the second, unqualified call below ignores every warning
# category, so the FutureWarning-specific filter looks redundant. The blanket
# filter will also hide warnings that may matter (e.g. deprecations) --
# confirm this is intended.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Import the Taiwan Credit Card Default training dataset.
# The raw CSV has a redundant heading row above the real column names, so
# header=1 makes pandas skip it and take the column names from the second row
# (no need to round-trip the frame through a temporary file).
train = (
    pd.read_csv('./input/Taiwan-Credit-Card-Default.csv', header=1)
    # The ID column is dropped from the working frame.
    .drop(columns=["ID"])
    # Give the target column a short, code-friendly name.
    .rename(columns={'default payment next month': 'DEFAULT'})
)
train.sample(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
26998,300000,1,2,1,45,-1,-1,-1,-1,-1,...,360,1440,360,360,360,360,1440,360,360,0
10042,30000,2,1,2,24,2,0,0,0,2,...,14530,14154,15320,1519,1300,1166,0,1400,0,1
15934,150000,2,2,2,25,-1,-1,-1,-1,0,...,4495,15145,525,1041,2021,4495,10650,525,0,0
8216,20000,1,2,2,30,0,0,0,2,2,...,19155,20385,19218,1307,2300,0,1525,0,1000,1
863,10000,2,2,1,31,0,0,0,0,0,...,9975,9736,8703,2330,2200,1000,333,311,322,0


In [3]:
class SelectCols(TransformerMixin):
    """Select a fixed set of columns from a DataFrame.

    Parameters
    ----------
    cols : list of str
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, cols: List[str]) -> None:
        self.cols = cols

    def fit(self, x: pd.DataFrame, y=None) -> "SelectCols":
        """Nothing to learn; return self unchanged.

        ``y`` is accepted (and ignored) so the transformer can also be fitted
        through ``fit_transform(X, y)`` or a Pipeline that is given a target.
        """
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return only the selected columns of ``x``.

        Raises
        ------
        KeyError
            If any label in ``self.cols`` is not a column of ``x``.
        """
        return x[self.cols]

In [4]:
class NumericEncoder(TransformerMixin):
    """Replace invalid values in numeric columns with the column median.

    A value is treated as invalid when it is NaN, +/-inf, or cannot be parsed
    as a number (e.g. an arbitrary string). ``fit`` learns one median per
    column from the valid values only; ``transform`` substitutes that median
    for every invalid value and adds a companion ``"<column>_invalid_flag"``
    column marking where a substitution happened.
    """

    @staticmethod
    def _to_valid_numeric(col: pd.Series) -> pd.Series:
        """Coerce ``col`` to numeric, turning every invalid value into NaN."""
        # Coerce first so that strings such as "inf" are also caught by the
        # infinity replacement that follows.
        numeric = pd.to_numeric(col, errors='coerce')
        return numeric.replace([-np.inf, np.inf], np.nan)

    def fit(self, x: pd.DataFrame, y=None) -> "NumericEncoder":
        """Learn the median of the valid values for every column in ``x``.

        ``y`` is accepted (and ignored) so the encoder can be fitted through
        ``fit_transform(X, y)`` or a Pipeline that is given a target.

        Where a column has no valid numeric value at all, its learned median
        is NaN.
        """
        # Uses the same validity rule as transform(), so +/-inf never skews
        # the learned median.
        self.encoders_ = {
            c: self._to_valid_numeric(x[c]).median(skipna=True) for c in x}

        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Encode invalid values in ``x`` as the learned medians.

        For each column ``c`` of ``x`` the result contains two columns, in
        this order:

        * ``c`` -- the values as float64, with invalid entries replaced by the
          median learned in ``fit``;
        * ``f"{c}_invalid_flag"`` -- int8, 1 where a replacement was made and
          0 elsewhere.

        Any NaN still left (possible only when the learned median itself is
        NaN) is replaced by 0.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        KeyError
            If ``x`` has a column that was not present during ``fit``.
        """
        output_cols = {}
        for c in x:
            numeric = self._to_valid_numeric(x[c])
            # Everything that did not survive coercion is invalid.
            invalid_idx = numeric.isnull()

            median = self.encoders_[c]
            # A NaN median means the column had no valid data when fitted;
            # those cells are handled by the final fillna(0) instead.
            if pd.notna(median):
                numeric = numeric.fillna(median)

            # Always emit floats so the output dtype does not depend on
            # whether a given column happened to need a replacement.
            output_cols[c] = numeric.astype(np.float64)
            # Mark the replacements in "[column_name]_invalid_flag".
            output_cols[f"{c}_invalid_flag"] = invalid_idx.astype(np.int8)

        # Assemble once, keeping the input index and the value/flag ordering.
        df = pd.DataFrame(output_cols, index=x.index)

        # Remove any remaining NaNs (median was NaN because the column held
        # no numeric data at all).
        return df.fillna(0)

In [5]:
## Building the preprocessing pipeline
# Numeric fork: pick out the numeric columns, then clean them with NumericEncoder.
numeric_cols = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'DEFAULT',
]
select_step = ('select', SelectCols(cols=numeric_cols))
process_step = ('process', NumericEncoder())
pp_numeric_cols = Pipeline([select_step, process_step])

In [6]:
# Fit the numeric pipeline on the credit-card frame and transform it in one pass
train_pp = pp_numeric_cols.fit_transform(train)

# Print a random handful of the processed rows as a quick sanity check
preview = train_pp.sample(5)
print(preview)

# Write the processed frame to CSV, leaving out the index
cleaned_path = './input/Taiwan-Credit-Card-New.csv'
train_pp.to_csv(cleaned_path, index=False)

       LIMIT_BAL  LIMIT_BAL_invalid_flag  SEX  SEX_invalid_flag  EDUCATION  \
24476    50000.0                       0  1.0                 0        1.0   
232      50000.0                       0  1.0                 0        2.0   
5309    130000.0                       0  1.0                 0        2.0   
22789   240000.0                       0  2.0                 0        3.0   
6112    500000.0                       0  1.0                 0        1.0   

       EDUCATION_invalid_flag  MARRIAGE  MARRIAGE_invalid_flag   AGE  \
24476                       0       2.0                      0  34.0   
232                         0       1.0                      0  26.0   
5309                        0       2.0                      0  38.0   
22789                       0       1.0                      0  35.0   
6112                        0       2.0                      0  28.0   

       AGE_invalid_flag          ...           PAY_AMT3  \
24476                 0          ...   