# Decision Tree Modelling: Predicting Incomes Using Social Factors

## Data Exploration

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
columns = ['age','workclass', 'fnlwgt', 'education', 'education_num',
           'marital_status', 'occupation', 'relationship', 'race', 'sex',
           'capital_gain', 'capital_loss', 'hours_per_week',
           'native_country', 'high_income']
data = pd.read_csv('adult.data',names = columns)

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
high_income       32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Data Cleaning
Missing values are filled with `?`

In [5]:
null_vals = dict()
for col in data.select_dtypes(include='object').columns:
    null_vals[col] = data[col].str.contains('\?').sum()

null_vals

{'workclass': 1836,
 'education': 0,
 'marital_status': 0,
 'occupation': 1843,
 'relationship': 0,
 'race': 0,
 'sex': 0,
 'native_country': 583,
 'high_income': 0}

In [6]:
cols_missing_vals = [key for key,value in null_vals.items() if value > 0]
cols_missing_vals

['workclass', 'occupation', 'native_country']

In [7]:
for col in cols_missing_vals:  
    print(data[col].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64
 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                    

In [8]:
data[(
    data.workclass.str.contains('\?')) & 
    (data.occupation.str.contains('\?') &
    (data.native_country.str.contains('\?')
    ))].shape[0] #rows missing the 3 values

27

For now, we will impute all the missing values with the mode. It could be an option to consider eliminating the 27 rows with the 3 missing values.

In [9]:
for col in cols_missing_vals:
    data[col] = data[col].str.replace('\?',data[col].mode()[0]).str.strip()
    print(data[col].value_counts())

Private             24532
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
Prof-specialty       5983
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64
United-States                 29753
Mexico                          643
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica

In [10]:
# Convert object cols (string) to numeric values
for col in data.select_dtypes(include='object').columns:
    data[col] = pd.Categorical(data[col]).codes
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


## Defining our Decision Tree Classifier

### Splitting data in train/test

In [11]:
np.random.seed(1)

data = data.reindex(np.random.permutation(data.index)) #reindex data randomly

split_factor = 0.8 # train/test data ratio, 80% of rows will belong to train
split_row = math.floor(data.shape[0] * split_factor)

train = data.iloc[:split_row]
test = data.iloc[split_row:]

### Predictions and Model Accuracy

The higher the AUC score, the more accurate our model is. The AUC score ranges from 0 to 1.

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


features = ["age", "workclass", "education_num", "marital_status", 
              "occupation", "relationship", "race", "sex", 
              "hours_per_week", "native_country"]
for n_split in range(2,15):
    clf = DecisionTreeClassifier(
        random_state=1,min_samples_split = n_split,
        max_depth = 7) # find best value
    clf.fit(train[features],train.high_income)
    predictions = clf.predict(test[features])

    error = roc_auc_score(y_true = test.high_income, y_score = predictions)
    print('min_samples_split = {}'.format(n_split))
    print('AUC Score = {:.2f}'.format(error))

min_samples_split = 2
AUC Score = 0.75
min_samples_split = 3
AUC Score = 0.75
min_samples_split = 4
AUC Score = 0.75
min_samples_split = 5
AUC Score = 0.75
min_samples_split = 6
AUC Score = 0.75
min_samples_split = 7
AUC Score = 0.75
min_samples_split = 8
AUC Score = 0.75
min_samples_split = 9
AUC Score = 0.75
min_samples_split = 10
AUC Score = 0.75
min_samples_split = 11
AUC Score = 0.75
min_samples_split = 12
AUC Score = 0.75
min_samples_split = 13
AUC Score = 0.75
min_samples_split = 14
AUC Score = 0.75


To-do - find optimal max depth above. create df or dict with both hyperparameters