In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Fetch the data and load it in pandas

In [3]:
import os
from urllib.request import urlretrieve

# Location of the raw Adult Census training file on the UCI repository.
url = ("https://archive.ics.uci.edu/ml/machine-learning-databases"
       "/adult/adult.data")
# The file is cached in the working directory under its remote base name.
local_filename = os.path.basename(url)
if not os.path.exists(local_filename):
    print("Downloading Adult Census datasets from UCI")
    # Download under a temporary name and rename only on success: an
    # interrupted transfer must not leave a truncated file behind that the
    # existence check above would mistake for a complete download.
    partial_filename = local_filename + ".part"
    try:
        urlretrieve(url, partial_filename)
        os.replace(partial_filename, local_filename)
    finally:
        # Only true when the download or the rename failed.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

In [4]:
# Column labels, in file order, passed to the CSV reader via `names=`.
names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income",
]
# Read the cached file and discard the 'fnlwgt' column right away.
data = pd.read_csv(local_filename, names=names).drop(
    labels=["fnlwgt"], axis="columns")

In [5]:
# Preview the first rows of the loaded table ('fnlwgt' has been dropped).
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [48]:
# Binary target: 1 where the income label is ' >50K' (the label is matched
# with its leading space), 0 otherwise.
target = data['income'].eq(' >50K').astype(np.int32).values
# Everything except the label column is used as input features.
features = data.drop(labels=['income'], axis='columns')

In [49]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

## One-hot Encoding

In [71]:
# Display the training features before any encoding is applied.
features_train

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
15282,36,Private,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,0,0,40,United-States
24870,35,Private,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,54,United-States
18822,38,Private,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,50,United-States
26404,50,Private,HS-grad,9,Married-civ-spouse,Sales,Wife,Black,Female,0,0,40,United-States
7842,68,Self-emp-not-inc,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,2149,40,United-States
4890,51,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,75,United-States
3243,25,Private,9th,5,Never-married,Machine-op-inspct,Other-relative,White,Male,0,0,48,Mexico
17470,19,?,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States
14211,34,Self-emp-not-inc,Assoc-voc,11,Never-married,Other-service,Other-relative,White,Female,0,0,40,United-States
22453,24,Private,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States


In [70]:
# Encode the training split on its own with pandas. In the output below the
# numeric columns are kept as they are, and each string column is replaced by
# one indicator column per value, named '<column>_<value>'.
dummies_train = pd.get_dummies(features_train)
dummies_train

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
15282,36,7,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
24870,35,9,0,0,54,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
18822,38,13,0,0,50,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
26404,50,9,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
7842,68,11,0,2149,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4890,51,10,0,0,75,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3243,25,5,0,0,48,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
17470,19,10,0,0,30,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14211,34,11,0,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
22453,24,10,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [66]:
# Encode the test split independently of the training split, to compare the
# resulting column layouts.
dummies_test = pd.get_dummies(features_test)

In [68]:
# Number of columns obtained when the training split is encoded on its own.
dummies_train.columns.size

107

In [69]:
# Number of columns obtained when the test split is encoded on its own. The
# recorded output differs from the training count, so the two independently
# encoded frames do not share the same column layout.
dummies_test.columns.size

106

## The scikit-learn transformer API

In [59]:
from sklearn.base import BaseEstimator, TransformerMixin


class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical variables of a pandas DataFrame.

    The encoding itself is delegated to ``pandas.get_dummies``. The
    columns produced at fit time are remembered in ``column_names_`` and
    the output of every later ``transform`` call is aligned to exactly
    those columns, so separately encoded frames (e.g. a train and a test
    split) share the same layout.

    Note that in older scikit-learn releases (before 0.20)
    sklearn.preprocessing.OneHotEncoder only works on numpy arrays with
    integer valued categorical features.

    Attributes
    ----------
    column_names_ : pandas.Index
        Column labels of the encoded frame seen at fit time.
    """

    def fit(self, X, y=None):
        """Learn the encoded column layout from X.

        ``y`` is accepted for API compatibility and is only forwarded to
        ``fit_transform``, which does not use it. Returns ``self``.
        """
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X, y=None):
        """Encode X and keep the encoded column names for later transform.

        Returns the encoded DataFrame; ``y`` is not used.
        """
        encoded = pd.get_dummies(X)
        self.column_names_ = encoded.columns
        return encoded

    def transform(self, X):
        """Encode X and align the result to the columns seen at fit time.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If neither ``fit`` nor ``fit_transform`` has been called yet.
            This exception type is also an ``AttributeError``, which is
            what an unfitted instance raised before.
        """
        # Local import keeps this class definition self-contained.
        from sklearn.utils.validation import check_is_fitted

        # Fail with an explicit "not fitted" error rather than a bare
        # AttributeError on `column_names_`.
        check_is_fitted(self, "column_names_")

        encoded = pd.get_dummies(X)
        # Add missing (unobserved) columns (with zeros) and remove
        # columns not observed at fit time:
        encoded = encoded.reindex(columns=self.column_names_, fill_value=0)
        return encoded

In [60]:
oh_encoder = OneHotEncoder()
# Fit on the training split only: this fixes the set of output columns.
oh_train = oh_encoder.fit_transform(features_train, target_train)
# The test split is only transformed, so its columns are aligned to the
# ones recorded from the training split.
oh_test = oh_encoder.transform(features_test)

In [61]:
# Preview the first rows of the encoded training split.
oh_train.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
15282,36,7,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
24870,35,9,0,0,54,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
18822,38,13,0,0,50,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
26404,50,9,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
7842,68,11,0,2149,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [62]:
# Summary statistics of the aligned test encoding. Every column shown reports
# the same count, since columns added during alignment are filled with zeros
# rather than left missing.
oh_test.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
count,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,...,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0,6513.0
mean,38.590204,10.133733,1056.367265,85.837556,40.484416,0.058038,0.029633,0.063872,0.000154,0.692001,...,0.000768,0.003685,0.000307,0.00215,0.001382,0.000461,0.000614,0.891294,0.002457,0.000461
std,13.690312,2.577202,7231.944668,395.524986,12.70167,0.233833,0.169586,0.244544,0.012391,0.461702,...,0.027699,0.060596,0.017522,0.046317,0.03715,0.021459,0.024776,0.311293,0.049507,0.021459
min,17.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,37.0,10.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,48.0,13.0,0.0,0.0,45.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,90.0,16.0,99999.0,2824.0,99.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [74]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Chain the encoder and the classifier so that cross-validation fits the
# encoder together with the model on the training part of each fold.
pipeline = make_pipeline(
    OneHotEncoder(),
    # Fixed seed, matching the seeded train/test split: tree fitting permutes
    # the candidate features at each split, so ties between equally good
    # splits could otherwise make the score vary from run to run.
    DecisionTreeClassifier(max_depth=8, random_state=0),
)

# 5-fold cross-validated ROC AUC on the training split only.
scores = cross_val_score(pipeline, features_train, target_train, cv=5,
                         scoring='roc_auc')
print("ROC AUC: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))

ROC AUC: 0.9007 +/-0.0018


**Exercises**

- Write an encoder that dummy-encodes low-cardinality categorical variables and uses target-sorted integer encoding for the high-cardinality variables
- Write a pipeline that combines these two encoders as preprocessing steps
- Use scikit-optimize to find the best feature encoding strategy while tuning some classifier parameters at the same time. See the following example for help:

https://scikit-optimize.github.io/notebooks/sklearn-gridsearchcv-replacement.html

- Advanced: check out other encoding schemes, for example in http://contrib.scikit-learn.org/categorical-encoding/