In [1]:
# Import data handling/display libraries
from typing import List, Union, Dict

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
# Warnings will be used to silence various model warnings for tidier output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Import the Taiwan Credit Card Default training dataset.
# The raw file has a redundant heading row above the real column names, so
# header=1 makes pandas skip that first line and use the second line as the
# header. This gives the same frame as writing the data back out without its
# header and re-reading it, but needs no intermediate file on disk.
train = (
    pd.read_csv('./input/Taiwan-Credit-Card-Default.csv', header=1)
    # Drop the ID column and give the target column a shorter name.
    # Chaining (instead of inplace=True) keeps this cell safe to re-run.
    .drop(columns=["ID"])
    .rename(columns={'default payment next month': 'DEFAULT'})
)
train.sample(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
25087,550000,1,1,1,53,0,0,0,0,0,...,214030,218096,210264,7110,7353,7534,7385,7253,15000,0
13771,100000,2,2,2,23,-2,-2,-2,-1,-1,...,4370,78453,80063,3800,7,971,79681,2861,2698,1
29529,50000,1,2,1,42,1,-1,-1,-1,0,...,4300,2800,3000,3095,5855,4300,2500,3000,500,0
7241,60000,2,1,2,24,0,0,0,0,0,...,34986,35770,36622,2000,1800,1500,1500,1600,1500,0
25031,90000,1,2,1,58,0,0,0,0,0,...,50780,50929,48735,3600,5100,2200,2200,1944,2200,0


In [3]:
class SelectCols(TransformerMixin, BaseEstimator):
    """Select a fixed set of columns from a DataFrame.

    Stateless transformer intended as the first step of a pipeline fork.

    Parameters
    ----------
    cols : list of str
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, cols: List[str]) -> None:
        # Stored as given (no copy/validation) so that BaseEstimator's
        # get_params/set_params and sklearn.base.clone work as expected.
        self.cols = cols

    def fit(self, x: pd.DataFrame, y=None) -> "SelectCols":
        """Nothing to learn; return self.

        ``y`` is accepted and ignored so the transformer also works when a
        target is passed through a pipeline (``fit(X, y)``).
        """
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return only the selected columns of ``x``.

        Raises
        ------
        KeyError
            If any label in ``cols`` is not a column of ``x`` (raised by
            pandas indexing).
        """
        return x[self.cols]

In [4]:
class NumericEncoder(TransformerMixin, BaseEstimator):
    """Replace invalid values in numeric columns with the column median.

    A value is invalid when it is NaN, +/-inf, or cannot be parsed as a
    number. For every input column ``c`` the output contains two columns:

    * ``c`` - the original values, with invalid entries replaced by the
      median learned in ``fit``;
    * ``c_invalid_flag`` - int8, 1 where a replacement was made, else 0.

    Valid values are passed through unchanged (a numeric string such as
    ``"5"`` stays a string).
    """

    @staticmethod
    def _valid_numeric(col: pd.Series) -> pd.Series:
        """Coerce ``col`` to numbers, turning every invalid value into NaN."""
        # Coerce first, then drop infinities, so that strings which parse to
        # an infinity (e.g. "inf") are treated as invalid as well.
        return (pd.to_numeric(col, errors='coerce')
                .replace([-np.inf, np.inf], np.nan))

    def fit(self, x: pd.DataFrame, y=None) -> "NumericEncoder":
        """Learn the median of the valid values of every column in ``x``.

        ``y`` is accepted and ignored so the encoder also works when a target
        is passed through a pipeline. +/-inf are excluded here exactly as in
        ``transform``; otherwise a column containing infinities could learn an
        infinite (or NaN) median and write it back into the output.

        Where a column holds no valid value at all, its median is NaN.
        """
        self.encoders_ = {
            c: self._valid_numeric(x[c]).median(skipna=True) for c in x
        }
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Replace invalid values with the learned medians and flag them.

        Returns
        -------
        pd.DataFrame
            Two columns per input column (value, then ``*_invalid_flag``),
            indexed like ``x``. Any NaN still present - which happens when a
            learned median is itself NaN - is filled with 0. An input without
            columns yields an empty frame with the same index.

        Raises
        ------
        KeyError
            If ``x`` has a column that was not present during ``fit``.
        """
        unseen = [c for c in x if c not in self.encoders_]
        if unseen:
            raise KeyError(f"columns not seen during fit: {unseen}")

        pieces = []
        for c in x:
            invalid = self._valid_numeric(x[c]).isnull()
            # mask() returns a new Series and upcasts where needed (e.g. an
            # integer column receiving a float median), so neither the input
            # frame nor the column dtype has to be prepared beforehand.
            pieces.append(x[c].mask(invalid, self.encoders_[c]))
            # Mark the replacements in "[column_name]_invalid_flag"
            pieces.append(invalid.astype(np.int8).rename(f"{c}_invalid_flag"))

        if not pieces:
            # pd.concat raises on an empty list
            return pd.DataFrame(index=x.index)

        # Remove remaining NaNs; these exist only if a median is NaN because
        # the column had no valid numeric data at all.
        return pd.concat(pieces, axis=1).fillna(0)

In [5]:
## Constructing the pipeline
# Columns routed through the numeric fork. The PAY_ status columns are
# numbered 0, 2, 3, 4, 5, 6 (there is no PAY_1), and DEFAULT is included too.
numeric_col_names = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'DEFAULT',
]

# Numeric fork: select the numeric columns -> numeric encode
pp_numeric_cols = Pipeline(steps=[
    ('select', SelectCols(cols=numeric_col_names)),
    ('process', NumericEncoder()),
])

In [6]:
# Fit the numeric pipeline on the credit-card training frame and apply it
train_pp = pp_numeric_cols.fit_transform(train)

# Print a random 5-row sample of the encoded frame as a quick check
sample_rows = train_pp.sample(5)
print(sample_rows)

# Save the encoded frame to CSV without the row index
encoded_csv_path = './input/Taiwan-Credit-Card-New.csv'
train_pp.to_csv(encoded_csv_path, index=False)

       LIMIT_BAL  LIMIT_BAL_invalid_flag  SEX  SEX_invalid_flag  EDUCATION  \
12074    70000.0                       0  1.0                 0        2.0   
880     200000.0                       0  1.0                 0        1.0   
552      50000.0                       0  1.0                 0        1.0   
714     400000.0                       0  2.0                 0        1.0   
23152   240000.0                       0  2.0                 0        1.0   

       EDUCATION_invalid_flag  MARRIAGE  MARRIAGE_invalid_flag   AGE  \
12074                       0       1.0                      0  39.0   
880                         0       2.0                      0  30.0   
552                         0       1.0                      0  56.0   
714                         0       2.0                      0  31.0   
23152                       0       2.0                      0  34.0   

       AGE_invalid_flag          ...           PAY_AMT3  \
12074                 0          ...   