In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce

In [2]:
# Column headers supplied by hand, because the CSV is read with header=None.
col_names = [
    'age', 'work-class', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hrs-per-week', 'native-country',
    'income',
]

# Fields equal to " ?" (leading blank included) are parsed as NaN.
data = pd.read_csv(
    'adult.data.csv',
    names=col_names,
    header=None,
    na_values=" ?",
)

print(data.shape)
data.head(10)

(32561, 15)


Unnamed: 0,age,work-class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [3]:
## Search for NAN values:
null_mask = data.isnull()

# Non-null entries per column, restricted to the rows that have at least one
# missing field (so a fully populated column shows the number of such rows).
incomplete_rows = data[null_mask.any(axis=1)]
print(incomplete_rows.count())

# Total number of missing cells across the whole frame.
print(null_mask.values.sum())

# Keep only the rows without any missing value.
data = data.dropna(axis=0)
print(data.shape)

age               2399
work-class         563
fnlwgt            2399
education         2399
education-num     2399
marital-status    2399
occupation         556
relationship      2399
race              2399
sex               2399
capital-gain      2399
capital-loss      2399
hrs-per-week      2399
native-country    1816
income            2399
dtype: int64
4262
(30162, 15)


# Data Dictionary:
age: continuous.

workclass (column `work-class` in the DataFrame): Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.

fnlwgt: continuous.

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.

education-num: continuous.

marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.

sex: Female, Male.

capital-gain: continuous.

capital-loss: continuous.

hours-per-week (column `hrs-per-week` in the DataFrame): continuous.

native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.


# Handling Categorical Data

In [4]:
# Show each column's dtype; the object columns are the ones still holding text.
data.dtypes

age                int64
work-class        object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hrs-per-week       int64
native-country    object
income            object
dtype: object

In [5]:
## first the simplest ones
# Income: map >50K: 1, <=50K: 0
# Sex: map Male: 1, Female: 0
# The raw strings carry a leading blank, hence the keys below.
binary_codes = (
    ('income', {' >50K': 1, ' <=50K': 0}),
    ('sex', {' Male': 1, ' Female': 0}),
)

for column, codes in binary_codes:
    observed = set(data[column].unique())

    # Re-running this cell must not destroy the column: Series.map turns every
    # value that is not a key (including the already-encoded 0/1) into NaN.
    if observed <= set(codes.values()):
        continue

    # Fail loudly instead of silently producing NaN for unexpected categories.
    unexpected = observed - set(codes)
    if unexpected:
        raise ValueError(
            "Unexpected values in column {!r}: {!r}".format(column, sorted(map(str, unexpected)))
        )

    data[column] = data[column].map(codes)

data.head(n = 10)

Unnamed: 0,age,work-class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,United-States,1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,United-States,1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,United-States,1


In [6]:
##  Merge Never-worked & Without pay
# Both categories are folded into the single label 'Unpayed'.
unpaid_aliases = {' Without-pay': 'Unpayed', ' Never-worked': 'Unpayed'}
data['work-class'] = data['work-class'].replace(unpaid_aliases)

# Number of distinct work classes left, then the distinct values themselves.
print(data['work-class'].nunique())
data['work-class'].unique()

7


array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' Self-emp-inc', 'Unpayed'], dtype=object)

In [7]:
## Not many different categories so will use Label Encoding
# Codes are the 1-based positions in the sorted list of distinct values.
labels = data['work-class'].astype('category').cat.categories.tolist()
mapping = {'work-class': {label: code for code, label in enumerate(labels, start=1)}}

# Series.map builds a fresh integer column. DataFrame.replace(..., inplace=True)
# on an object column only ends up numeric through pandas' implicit
# downcasting, which recent pandas releases deprecate.
data['work-class'] = data['work-class'].map(mapping['work-class'])

data.head(n = 10)

Unnamed: 0,age,work-class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,6,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,5,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,3,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,3,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,3,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
5,37,3,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,0
6,49,3,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,Jamaica,0
7,52,5,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,United-States,1
8,31,3,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,United-States,1
9,42,3,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,United-States,1


In [8]:
# Drop the numeric education column; the textual 'education' column is kept.
# `columns=` already selects the axis, so no `axis` argument is needed, and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
data = data.drop(columns = ['education-num'], errors = 'ignore')
print(data['education'].value_counts())

data.head(n = 2)

 HS-grad         9840
 Some-college    6678
 Bachelors       5044
 Masters         1627
 Assoc-voc       1307
 11th            1048
 Assoc-acdm      1008
 10th             820
 7th-8th          557
 Prof-school      542
 9th              455
 12th             377
 Doctorate        375
 5th-6th          288
 1st-4th          151
 Preschool         45
Name: education, dtype: int64


Unnamed: 0,age,work-class,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,6,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,5,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0


In [9]:
# Collapse the individual school grades into three coarser buckets.
grade_buckets = {
    ' 10th': 'HS-Student',
    ' 11th': 'HS-Student',
    ' 12th': 'HS-Student',
    ' 7th-8th': 'Mid-Student',
    ' 9th': 'Mid-Student',
    ' 5th-6th': 'Elem-Student',
    ' 1st-4th': 'Elem-Student',
}
data['education'] = data['education'].replace(grade_buckets)

print(data['education'].value_counts())

 HS-grad         9840
 Some-college    6678
 Bachelors       5044
HS-Student       2245
 Masters         1627
 Assoc-voc       1307
Mid-Student      1012
 Assoc-acdm      1008
 Prof-school      542
Elem-Student      439
 Doctorate        375
 Preschool         45
Name: education, dtype: int64


In [10]:
# Label-encode education: codes are the 1-based positions in the sorted list
# of distinct values (alphabetical, not ordered by level of schooling).
labels = data['education'].astype('category').cat.categories.tolist()
mapping = {'education': {label: code for code, label in enumerate(labels, start=1)}}

# Series.map builds a fresh integer column. DataFrame.replace(..., inplace=True)
# on an object column only ends up numeric through pandas' implicit
# downcasting, which recent pandas releases deprecate.
data['education'] = data['education'].map(mapping['education'])

data.head(n = 10)

Unnamed: 0,age,work-class,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,6,77516,3,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,5,83311,3,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,3,215646,5,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,3,234721,11,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,3,338409,3,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
5,37,3,284582,6,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,0
6,49,3,160187,12,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,Jamaica,0
7,52,5,209642,5,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,United-States,1
8,31,3,45781,6,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,United-States,1
9,42,3,159449,3,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,United-States,1


In [11]:
# Label-encode the remaining low-cardinality text columns. Each column gets
# codes equal to the 1-based positions in its sorted list of distinct values.
for column in ['marital-status', 'relationship', 'race']:
    labels = data[column].astype('category').cat.categories.tolist()
    mapping = {column: {label: code for code, label in enumerate(labels, start=1)}}

    # Series.map builds a fresh integer column. DataFrame.replace(...,
    # inplace=True) on an object column only ends up numeric through pandas'
    # implicit downcasting, which recent pandas releases deprecate.
    data[column] = data[column].map(mapping[column])

data.head(n = 10)

Unnamed: 0,age,work-class,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,native-country,income
0,39,6,77516,3,5,Adm-clerical,2,5,1,2174,0,40,United-States,0
1,50,5,83311,3,3,Exec-managerial,1,5,1,0,0,13,United-States,0
2,38,3,215646,5,1,Handlers-cleaners,2,5,1,0,0,40,United-States,0
3,53,3,234721,11,3,Handlers-cleaners,1,3,1,0,0,40,United-States,0
4,28,3,338409,3,3,Prof-specialty,6,3,0,0,0,40,Cuba,0
5,37,3,284582,6,3,Exec-managerial,6,5,0,0,0,40,United-States,0
6,49,3,160187,12,4,Other-service,2,3,0,0,0,16,Jamaica,0
7,52,5,209642,5,3,Exec-managerial,1,5,1,0,0,45,United-States,1
8,31,3,45781,6,5,Prof-specialty,2,5,0,14084,0,50,United-States,1
9,42,3,159449,3,3,Exec-managerial,1,5,1,5178,0,40,United-States,1


In [12]:
## Occupation & Nativity have many categories > Binary Encode
# Each listed column is replaced by a group of <name>_<i> indicator columns.
high_cardinality_cols = ['occupation', 'native-country']
encoder = ce.BinaryEncoder(cols=high_cardinality_cols)
data = encoder.fit_transform(data)

data.head(10)

Unnamed: 0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,native-country_0,native-country_1,native-country_2,native-country_3,native-country_4,...,fnlwgt,education,marital-status,relationship,race,sex,capital-gain,capital-loss,hrs-per-week,income
0,0,0,0,0,1,0,0,0,0,0,...,77516,3,5,2,5,1,2174,0,40,0
1,0,0,0,1,0,0,0,0,0,0,...,83311,3,3,1,5,1,0,0,13,0
2,0,0,0,1,1,0,0,0,0,0,...,215646,5,1,2,5,1,0,0,40,0
3,0,0,0,1,1,0,0,0,0,0,...,234721,11,3,1,3,1,0,0,40,0
4,0,0,1,0,0,0,0,0,0,0,...,338409,3,3,6,3,0,0,0,40,0
5,0,0,0,1,0,0,0,0,0,0,...,284582,6,3,6,5,0,0,0,40,0
6,0,0,1,0,1,0,0,0,0,0,...,160187,12,4,2,3,0,0,0,16,0
7,0,0,0,1,0,0,0,0,0,0,...,209642,5,3,1,5,1,0,0,45,1
8,0,0,1,0,0,0,0,0,0,0,...,45781,6,5,2,5,0,14084,0,50,1
9,0,0,0,1,0,0,0,0,0,0,...,159449,3,3,1,5,1,5178,0,40,1


# The Model

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [14]:
# Target: the binary income column. Features: every other column.
labels = data['income']
# `columns=` already selects the axis, so no `axis` argument is needed.
features = data.drop(columns = ['income'])

# Only the last expression of a cell is displayed, so the former bare
# `labels.head(n = 5)` line produced no output and has been removed.
features.head(n = 5)

Unnamed: 0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,native-country_0,native-country_1,native-country_2,native-country_3,native-country_4,...,work-class,fnlwgt,education,marital-status,relationship,race,sex,capital-gain,capital-loss,hrs-per-week
0,0,0,0,0,1,0,0,0,0,0,...,6,77516,3,5,2,5,1,2174,0,40
1,0,0,0,1,0,0,0,0,0,0,...,5,83311,3,3,1,5,1,0,0,13
2,0,0,0,1,1,0,0,0,0,0,...,3,215646,5,1,2,5,1,0,0,40
3,0,0,0,1,1,0,0,0,0,0,...,3,234721,11,3,1,3,1,0,0,40
4,0,0,1,0,0,0,0,0,0,0,...,3,338409,3,3,6,3,0,0,0,40


In [15]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=2,
)
print(X_train.shape, X_test.shape, sep='\n')

(22621, 23)
(7541, 23)


In [16]:
# Gradient-boosted trees with the library's default hyper-parameters;
# fit() returns the fitted estimator itself.
model = XGBClassifier().fit(X_train, y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [17]:
# predict() already returns hard class labels, so they are scored directly,
# exactly as the test-set cell does; rounding them first was a no-op.
y_hat_train = model.predict(X_train)

train_accuracy = accuracy_score(y_train, y_hat_train)
print('Train Accuracy: ', train_accuracy)

Train Accuracy:  0.8629149904955572


  if diff:


In [18]:
# Accuracy on the held-out split, using the model's predicted class labels.
y_hat_test = model.predict(X_test)

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_hat_test)
print('Test Accuracy: ', test_accuracy)

Test Accuracy:  0.8630155151836626


  if diff:
