All of the code presented in this notebook is derived from or based on the O'Reilly® book Introduction to Machine Learning with Python © 2017 by Andreas C. Müller and Sarah Guido. Please make sure you give the correct attribution to this book if you reference or use any of the information in this notebook. And please buy the book, it's excellent!

This notebook focuses on categorical variables: converting a single feature with multiple values into multiple features, one for each value, each holding a binary value (one-hot encoding). It also has some debugging tips for pandas dataframes.

In [82]:
import os
import mglearn
import pandas as pd

In [83]:
# Build the path to "adult.data" inside mglearn's data directory.
employee_data_file = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
# NOTE: neither header=None nor names=[...] is passed, so pandas uses the first
# record of the file as the column labels (see the column list printed further
# down: '39', ' State-gov', ...). The labels and the string values also keep
# the space that follows each comma in the file; later cells strip it.
# index_col=False tells pandas not to use the first column as the index.
employee_data = pd.read_csv(employee_data_file, index_col=False)

In [84]:
# Preview the first five rows; the header row shown is really the file's first record.
employee_data.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [85]:
# Show the raw column labels as a plain list so leading spaces are visible.
employee_data.columns.tolist()

['39',
 ' State-gov',
 ' 77516',
 ' Bachelors',
 ' 13',
 ' Never-married',
 ' Adm-clerical',
 ' Not-in-family',
 ' White',
 ' Male',
 ' 2174',
 ' 0',
 ' 40',
 ' United-States',
 ' <=50K']

In [86]:
# Frequency of each category in the column labelled ' State-gov' (leading
# space included). Watch for the ' ?' entries in the output: they are ordinary
# strings, not NaN — presumably a placeholder for unknown values.
employee_data[' State-gov'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name:  State-gov, dtype: int64

In [87]:
# Frequency of each category in the column labelled ' Bachelors' (leading
# space included).
employee_data[' Bachelors'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5354
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name:  Bachelors, dtype: int64

In [154]:
# Keep only the two categorical columns of interest and drop rows with NaN.
# (dropna does not remove the '?' placeholder strings.)
data = employee_data[[' State-gov', ' Bachelors']].dropna()
# The column labels carry a leading space from the CSV; strip every label
# instead of hard-coding a rename map.
data = data.rename(columns=str.strip)
# The cell values carry the same leading space. Both selected columns hold
# strings, so strip them column-wise with the vectorized .str accessor;
# DataFrame.applymap is deprecated since pandas 2.1.
data = data.apply(lambda column: column.str.strip())

In [157]:
# Confirm the column labels no longer carry a leading space.
data.columns.tolist()

['State-gov', 'Bachelors']

In [158]:
# Drop the rows whose 'State-gov' value is the "?" placeholder.
is_known = data['State-gov'].ne("?")
data = data.loc[is_known]

In [159]:
# Re-check the category frequencies: "?" should no longer be listed.
data['State-gov'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: State-gov, dtype: int64

In [160]:
# One-hot encode every string column: each distinct value becomes its own
# indicator column named "<column>_<value>".
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# the default changed from uint8 to bool in pandas 2.0.
data = pd.get_dummies(data, dtype="uint8")

In [161]:
# List the indicator columns produced by the one-hot encoding.
data.columns.tolist()

['State-gov_Federal-gov',
 'State-gov_Local-gov',
 'State-gov_Never-worked',
 'State-gov_Private',
 'State-gov_Self-emp-inc',
 'State-gov_Self-emp-not-inc',
 'State-gov_State-gov',
 'State-gov_Without-pay',
 'Bachelors_10th',
 'Bachelors_11th',
 'Bachelors_12th',
 'Bachelors_1st-4th',
 'Bachelors_5th-6th',
 'Bachelors_7th-8th',
 'Bachelors_9th',
 'Bachelors_Assoc-acdm',
 'Bachelors_Assoc-voc',
 'Bachelors_Bachelors',
 'Bachelors_Doctorate',
 'Bachelors_HS-grad',
 'Bachelors_Masters',
 'Bachelors_Preschool',
 'Bachelors_Prof-school',
 'Bachelors_Some-college']

In [141]:
# Preview the first five one-hot encoded rows.
# NOTE(review): the table recorded below carries an older execution count
# (In [141]) than the "?" filter cell (In [158]) and still shows a
# State-gov_? column, so it appears to predate that filter — re-run this
# cell to refresh the output.
data.head()

Unnamed: 0,State-gov_?,State-gov_Federal-gov,State-gov_Local-gov,State-gov_Never-worked,State-gov_Private,State-gov_Self-emp-inc,State-gov_Self-emp-not-inc,State-gov_State-gov,State-gov_Without-pay,Bachelors_10th,...,Bachelors_9th,Bachelors_Assoc-acdm,Bachelors_Assoc-voc,Bachelors_Bachelors,Bachelors_Doctorate,Bachelors_HS-grad,Bachelors_Masters,Bachelors_Preschool,Bachelors_Prof-school,Bachelors_Some-college
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
