In [1]:
## This data prep script uses Gareth's transforms to numeric

# Import Data handling/display libraries
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from typing import List, Union, Dict
# Warnings will be used to silence various model warnings for tidier output
import warnings
warnings.filterwarnings('ignore')
# Import the Adult training dataset
train = pd.read_csv('./input/adult.csv')
train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [2]:
#Clean the data
# Replace '?' in relevant columns with the most frequent values in the column.
attrib, counts = np.unique(train['Employment'], return_counts = True)
most_freq_attrib = attrib[np.argmax(counts, axis = 0)]
train['Employment'][train['Employment'] == '?'] = most_freq_attrib 

attrib, counts = np.unique(train['Occupation'], return_counts = True)
most_freq_attrib = attrib[np.argmax(counts, axis = 0)]
train['Occupation'][train['Occupation'] == '?'] = most_freq_attrib 

attrib, counts = np.unique(train['NativeCountry'], return_counts = True)
most_freq_attrib = attrib[np.argmax(counts, axis = 0)]
train['NativeCountry'][train['NativeCountry'] == '?'] = most_freq_attrib 

train['Income']=train['Income'].map({'<=50K': 1, '>50K': 0, '<=50K.': 1, '>50K.': 0})

KeyError: 'Employment'

In [None]:
class SelectCols(TransformerMixin):
    """Select columns from a DataFrame."""
    def __init__(self, cols: List[str]) -> None:
        self.cols = cols
    def fit(self, x: None) -> "SelectCols":
        """Nothing to do."""
        return self
    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return just selected columns."""
        return x[self.cols]

In [None]:
class LabelEncoder(TransformerMixin):
    """Convert non-numeric columns to numeric using label encoding. 
    Handles unseen data on transform."""
    def fit(self, x: pd.DataFrame) -> "LabelEncoder":
        """Learn encoder for each column."""
        encoders = {}
        for c in x:
            # Make encoder using pd.factorize on unique values, 
            # then convert to a dictionary
            v, k = zip(pd.factorize(x[c].unique()))
            encoders[c] = dict(zip(k[0], v[0]))
        self.encoders_ = encoders
        return self

    def transform(self, x) -> pd.DataFrame:
        """For columns in x that have learned encoders, apply encoding."""
        x = x.copy()
        for c in x:
            # Ignore new, unseen values
            x.loc[~x[c].isin(self.encoders_[c]), c] = np.nan
            # Map learned labels
            x.loc[:, c] = x[c].map(self.encoders_[c])
        # Return without nans
        return x.fillna(-2).astype(int)

In [None]:
class NumericEncoder(TransformerMixin):
    """Remove invalid values from numerical columns, replace with median."""
    def fit(self, x: pd.DataFrame) -> "NumericEncoder":
        """Learn median for every column in x."""
        # Find median for all columns, handling non-NaNs invalid values and NaNs
        # Where all values are NaNs (after coercion) the median value will be a NaN.
        self.encoders_ = {
            c: pd.to_numeric(x[c], errors='coerce').median(skipna=True) for c in x}
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """For each column in x, encode NaN values are learned 
        median and add a flag column indicating where these 
        replacements were made"""

        # Create a list of new DataFrames, each with 2 columns
        output_dfs = []
        for c in x:
            new_cols = pd.DataFrame()
            # Find invalid values that aren't nans (-inf, inf, string)
            invalid_idx = pd.to_numeric(x[c].replace([-np.inf, np.inf], np.nan),
                                        errors='coerce').isnull()
            # Copy to new df for this column
            new_cols.loc[:, c] = x[c].copy()
            # Replace the invalid values with learned median
            new_cols.loc[invalid_idx, c] = self.encoders_[c]
            # Mark these replacement in a new column called 
            # "[column_name]_invalid_flag"
            # new_cols.loc[:, f"{c}_invalid_flag"] = invalid_idx.astype(np.int8)

            output_dfs.append(new_cols)

        # Concat list of output_dfs to single df
        df = pd.concat(output_dfs,
                       axis=1)

        # Return wtih an remaining NaNs removed. These might exist if the median
        # is a NaN because there was no numeric data in the column at all.
        return df.fillna(0)

In [None]:
## Constructing the pipeline

# LabelEncoding fork: Select object columns -> label encode
pp_object_cols = Pipeline([('select', SelectCols(cols=['Employment', 'Education', 
                                                       'MaritalStatus', 'Occupation', 
                                                       'Relationship','Race','Gender',
                                                       'NativeCountry'])),
                           ('process', LabelEncoder())])

# NumericEncoding fork: Select numeric columns -> numeric encode
pp_numeric_cols = Pipeline([('select', SelectCols(cols=['Age','Fnlwgt','EducationNum','CapitalLoss','HoursPerWeek', 
                                                        'CapitalGain','Income'])),
                            ('process', NumericEncoder())])

In [None]:
# .fit_transform on the adult dataset
all_columns = ["Age","Employment","Fnlwgt","Education","EducationalNum","MaritalStatus","Occupation","Relationship",
           "Race","Gender","CapitalGain","CapitalLoss","HoursPerWeek","NativeCountry","Income"]

train_pp = pd.concat((pp_numeric_cols.fit_transform(train), 
                      pp_object_cols.fit_transform(train)),
                      axis=1)

In [None]:
print('this is employment', train_pp.Employment.unique())
print('this is education', train_pp.Education.unique())
print('this is occupation', train_pp.Occupation.unique())
print('this is relationship', train_pp.Relationship.unique())
print('this is race', train_pp.Race.unique())
print('this is gender', train_pp.Gender.unique())
print('this is income', train_pp.Income.unique())
#train_pp.head(10)

In [None]:
train_pp.Gender.value_counts()

In [None]:
gender = train_pp.groupby(['Gender', 'Income']).size().unstack(1)
gender

In [None]:
train_pp.head(5)

In [None]:
train_pp.to_csv('./input/adult-cleaned.csv', index=False)