In [1]:
# Import Data handling/display libraries
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from typing import List, Union, Dict
# Warnings are silenced so that model/library chatter does not clutter cell output.
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
# NOTE: this second, category-less filter suppresses every warning category.
warnings.filterwarnings("ignore")

# Load the Adult training dataset from the local input directory.
TRAIN_CSV_PATH = './input/adult.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [2]:
# Clean the data
# The raw file marks missing categorical entries with a literal '?'. For each
# affected column, replace the marker with the most frequent *real* value.
MISSING_MARKER = '?'
for col in ('workclass', 'occupation', 'native-country'):
    is_missing = train[col] == MISSING_MARKER
    # Nothing to replace, or nothing to learn a replacement from.
    if not is_missing.any() or is_missing.all():
        continue
    # Count only the non-missing values so that '?' can never be picked as
    # its own replacement. np.unique returns sorted values, so ties are
    # resolved in favour of the lexicographically smallest value.
    attrib, counts = np.unique(train.loc[~is_missing, col], return_counts=True)
    most_freq_attrib = attrib[np.argmax(counts)]
    # Single .loc assignment instead of chained indexing
    # (train[col][mask] = ...), which may write to a temporary copy and is a
    # no-op under pandas Copy-on-Write.
    train.loc[is_missing, col] = most_freq_attrib

# Encode the target as 0/1. Both label spellings (with and without the
# trailing '.') are accepted. The guard keeps the cell safe to re-run:
# mapping an already-encoded numeric column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(train['income']):
    train['income'] = train['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
train.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
45314,22,Private,171176,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,48,United-States,0
41137,45,Private,246392,HS-grad,9,Never-married,Priv-house-serv,Unmarried,Black,Female,0,0,30,United-States,0
39427,28,Private,36601,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,35,United-States,0
15232,75,Private,185603,10th,6,Widowed,Tech-support,Not-in-family,White,Female,0,0,32,United-States,0
11813,22,Self-emp-not-inc,47541,Some-college,10,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,0


In [3]:
class SelectCols(TransformerMixin):
    """Select a fixed set of columns from a DataFrame.

    Parameters
    ----------
    cols : list of str
        Column labels to keep, in the order they should appear in the output.
    """
    def __init__(self, cols: List[str]) -> None:
        self.cols = cols

    def fit(self, x: pd.DataFrame, y=None) -> "SelectCols":
        """Nothing to learn; return self.

        ``y`` is accepted (and ignored) so that the transformer also works
        when it is fitted with a target, e.g. ``fit_transform(X, y)``.
        """
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return just the selected columns.

        Raises
        ------
        KeyError
            If any of the configured columns is missing from ``x``.
        """
        return x[self.cols]

In [4]:
class LabelEncoder(TransformerMixin):
    """Convert non-numeric columns to numeric using label encoding.

    Each column gets its own value -> integer code mapping, learned in
    order of first appearance. Values that were not seen during ``fit``
    (including missing values) are encoded as ``-2`` on transform.
    """
    def fit(self, x: pd.DataFrame, y=None) -> "LabelEncoder":
        """Learn an encoder for each column of ``x``.

        ``y`` is accepted and ignored so the transformer can be fitted
        together with a target.
        """
        encoders = {}
        for c in x:
            # pd.factorize drops NaN from `uniques` (it is only represented
            # by a -1 in the codes), so codes and uniques cannot be zipped
            # position-by-position. Enumerating the uniques yields the same
            # 0..n-1 codes while staying aligned when NaNs are present.
            _, uniques = pd.factorize(x[c].unique())
            encoders[c] = {value: code for code, value in enumerate(uniques)}
        self.encoders_ = encoders
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Apply the learned encoding to every column of ``x``.

        Returns a new integer DataFrame; ``x`` itself is not modified.

        Raises
        ------
        KeyError
            If ``x`` contains a column that no encoder was learned for.
        """
        x = x.copy()
        for c in x:
            # Mapping through a dict turns every value without a learned
            # label into NaN, so unseen values need no separate masking.
            # Going via object dtype keeps categorical columns from
            # returning a categorical result that fillna(-2) would reject.
            x[c] = x[c].astype(object).map(self.encoders_[c])
        # Unseen / missing values become the sentinel -2.
        return x.fillna(-2).astype(int)

In [5]:
class NumericEncoder(TransformerMixin):
    """Replace invalid values in numerical columns with the column median.

    A value is invalid when it is NaN, +/-inf, or cannot be parsed as a
    number. The same definition is used when learning the median and when
    applying it.
    """

    @staticmethod
    def _coerce_numeric(col: pd.Series) -> pd.Series:
        """Return ``col`` as float64 with every invalid value set to NaN."""
        numeric = pd.to_numeric(col, errors='coerce').astype(float)
        return numeric.replace([-np.inf, np.inf], np.nan)

    def fit(self, x: pd.DataFrame, y=None) -> "NumericEncoder":
        """Learn the median of the valid values for every column in x.

        ``y`` is accepted and ignored so the transformer can be fitted
        together with a target.
        """
        # Where a column has no valid value at all, the learned median is NaN.
        self.encoders_ = {
            c: self._coerce_numeric(x[c]).median(skipna=True) for c in x}

        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Replace invalid values with the learned median and flag them.

        For each column ``c`` of ``x`` the output holds two columns, in
        this order: ``c`` (float64, invalid values replaced by the learned
        median) and ``"{c}_invalid_flag"`` (int8, 1 where a replacement was
        made, 0 elsewhere).

        Raises
        ------
        KeyError
            If ``x`` contains a column that no median was learned for.
        """
        # pd.concat refuses an empty list, so handle a column-less frame here.
        if len(x.columns) == 0:
            return pd.DataFrame(index=x.index)

        pieces = []
        for c in x:
            numeric = self._coerce_numeric(x[c])
            # Everything that did not survive coercion counts as invalid.
            invalid_idx = numeric.isnull()

            # Value column: invalid entries take the learned median.
            pieces.append(numeric.fillna(self.encoders_[c]))
            # Marker column called "[column_name]_invalid_flag".
            pieces.append(
                invalid_idx.astype(np.int8).rename(f"{c}_invalid_flag"))

        # Concat the per-column pieces into a single DataFrame.
        df = pd.concat(pieces, axis=1)

        # Return with any remaining NaNs removed. These exist when the learned
        # median is NaN because the column had no valid numeric data at all.
        return df.fillna(0)

In [6]:
## Constructing the pipeline

# Columns routed through each preprocessing fork.
OBJECT_COLS = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country',
]
# 'income' (the target) goes through the numeric fork as well, so it also
# receives an "income_invalid_flag" companion column.
NUMERIC_COLS = [
    'age', 'fnlwgt', 'educational-num', 'capital-loss', 'hours-per-week',
    'capital-gain', 'income',
]

# LabelEncoding fork: select object columns -> label encode
pp_object_cols = Pipeline([
    ('select', SelectCols(cols=OBJECT_COLS)),
    ('process', LabelEncoder()),
])

# NumericEncoding fork: select numeric columns -> numeric encode
pp_numeric_cols = Pipeline([
    ('select', SelectCols(cols=NUMERIC_COLS)),
    ('process', NumericEncoder()),
])

In [8]:
# Fit both preprocessing forks on the adult dataset and join their outputs
# column-wise: numeric columns (with their invalid flags) first, then the
# label-encoded object columns.
numeric_part = pp_numeric_cols.fit_transform(train)
object_part = pp_object_cols.fit_transform(train)
train_pp = pd.concat([numeric_part, object_part], axis=1)

# Show a random sample of the result and save the cleaned frame without
# its index.
print(train_pp.sample(5))
train_pp.to_csv('./input/adult-clean.csv', index=False)

        age  age_invalid_flag    fnlwgt  fnlwgt_invalid_flag  educational-num  \
15286  37.0                 0  145064.0                    0             12.0   
13446  37.0                 0  454024.0                    0              9.0   
43516  44.0                 0  192225.0                    0             13.0   
28192  42.0                 0  115178.0                    0             13.0   
36153  51.0                 0   70767.0                    0              9.0   

       educational-num_invalid_flag  capital-loss  capital-loss_invalid_flag  \
15286                             0           0.0                          0   
13446                             0           0.0                          0   
43516                             0           0.0                          0   
28192                             0           0.0                          0   
36153                             0           0.0                          0   

       hours-per-week  hours-per