# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [2]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [3]:
golden = pd.read_csv('../data/adult.test', index_col=False)

In [4]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
from sklearn import preprocessing

In [8]:
# Columns we want to transform
transform_columns = ['sex']

#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

## First let's try using `pandas.get_dummies()` to transform columns

In [9]:
dummies = pd.get_dummies(df[transform_columns])
dummies

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
32556,1,0
32557,0,1
32558,1,0
32559,0,1


In [10]:
dummies.shape

(32561, 2)

## sklearn has a similar process for OneHot Encoding features

In [11]:
onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False)
onehot.fit(df[transform_columns])



In [12]:
onehot.categories_

[array([' Female', ' Male'], dtype=object)]

In [13]:
sex = onehot.transform(df[transform_columns])
sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [14]:
sex.shape

(32561, 2)

## In addition to OneHot encoding there is Ordinal Encoding 

In [15]:
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["salary"]])
salary = enc.transform(df[["salary"]])
salary

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [16]:
enc.categories_[0]

array([' <=50K', ' >50K'], dtype=object)

In [17]:
x = df.copy()

# transformed = pd.get_dummies(df[transform_columns])


onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False).fit(df[transform_columns])

enc = preprocessing.OrdinalEncoder()

enc.fit(df[["salary"]])


transformed = onehot.transform(df[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)


x = pd.concat(
    [
        x.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)


x["salary"] = enc.transform(df[["salary"]])



In [18]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Female,Male
0,39,77516,13,2174,0,40,0.0,0.0,1.0
1,50,83311,13,0,0,13,0.0,0.0,1.0
2,38,215646,9,0,0,40,0.0,0.0,1.0
3,53,234721,7,0,0,40,0.0,0.0,1.0
4,28,338409,13,0,0,40,0.0,1.0,0.0


In [19]:
xt = golden.copy()

transformed = onehot.transform(xt[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)

x = pd.concat(
    [
        xt.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [20]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [21]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [23]:
model = RandomForestClassifier(criterion='entropy')

In [24]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [25]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

In [26]:
model.tree_.node_count

5007

In [27]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3343276915304919),
 ('education-num', 0.16185444800436688),
 ('capital-gain', 0.21032794354777545),
 ('capital-loss', 0.07295681480450657),
 ('hours-per-week', 0.16985108154415818),
 (' Female', 0.018131752065691266),
 (' Male', 0.03255026850300953)]

In [28]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3343276915304919),
 ('education-num', 0.16185444800436688),
 ('capital-gain', 0.21032794354777545),
 ('capital-loss', 0.07295681480450657),
 ('hours-per-week', 0.16985108154415818),
 (' Female', 0.018131752065691266),
 (' Male', 0.03255026850300953)]

In [29]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,Female,Male
0,25,7,0,0,40,0.0,1.0
1,38,9,0,0,50,0.0,1.0
2,28,12,0,0,40,0.0,1.0
3,44,10,7688,0,40,0.0,1.0
4,18,10,0,0,30,1.0,0.0


In [30]:
set(x.columns) - set(xt.columns)

{' Female', ' Male'}

In [31]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 ' Female',
 ' Male']

In [68]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- education
- marital-status
- native-country
- occupation
- race
- ...
Feature names seen at fit time, yet now missing:
-  Female
-  Male


In [77]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [None]:
accuracy_score(xt.salary, predictions)

In [33]:
accuracy_score(xt.salary, predictions)

NameError: name 'accuracy_score' is not defined

In [34]:
confusion_matrix(xt.salary, predictions)

NameError: name 'confusion_matrix' is not defined

In [35]:
print(classification_report(xt.salary, predictions))

NameError: name 'classification_report' is not defined

In [36]:
print(classification_report(xt.salary, predictions))

NameError: name 'classification_report' is not defined

In [37]:
accuracy_score(x.salary, predictionsx)

NameError: name 'accuracy_score' is not defined

In [38]:
confusion_matrix(x.salary, predictionsx)

NameError: name 'confusion_matrix' is not defined

In [39]:
print(classification_report(x.salary, predictionsx))

NameError: name 'classification_report' is not defined

In [40]:
print(classification_report(x.salary, predictionsx))

NameError: name 'classification_report' is not defined

# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

In [41]:
def salary_over_under(s):
    if s == ' <=50K.':
        return ' <=50K'
    elif s == ' >50K.':
        return ' >50K'

In [78]:
model = RandomForestClassifier(criterion='entropy')
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [42]:
# Using Golden DF & Test 
golden_ts = golden.copy()
golden_ts.head()
df_numeric = df[['salary','age','education-num','capital-gain','capital-loss','hours-per-week']]
golden_numeric = golden[['salary','age','education-num','capital-gain','capital-loss','hours-per-week']]
df_numeric['salary']
golden_numeric['salary']

0         <=50K.
1         <=50K.
2          >50K.
3          >50K.
4         <=50K.
          ...   
16276     <=50K.
16277     <=50K.
16278     <=50K.
16279     <=50K.
16280      >50K.
Name: salary, Length: 16281, dtype: object

In [43]:
golden_numeric['salary'] = golden_numeric['salary'].apply(salary_over_under)
golden_numeric['salary']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  golden_numeric['salary'] = golden_numeric['salary'].apply(salary_over_under)


0         <=50K
1         <=50K
2          >50K
3          >50K
4         <=50K
          ...  
16276     <=50K
16277     <=50K
16278     <=50K
16279     <=50K
16280      >50K
Name: salary, Length: 16281, dtype: object

In [44]:
# Random Forest 
randomforestmodel = RandomForestClassifier(criterion='entropy')
# Constructing off of DF
randomforestmodel.fit(df_numeric.drop(['salary'], axis=1), df_numeric.salary) 

In [97]:
list(zip(df_numeric.drop(['salary'], axis=1).columns, decisiontreemodel.feature_importances_))

[('age', 0.31635615584594295),
 ('education-num', 0.1253420692389576),
 ('capital-gain', 0.21296995497529878),
 ('capital-loss', 0.08111318372689252),
 ('hours-per-week', 0.14515856182924208)]

In [None]:
gn_rf_predictions = randomforestmodel.predict(golden_numeric.drop(['salary'], axis=1))
gn_rf_predictions
golden_numeric.salary

In [None]:
# Precision, Recall, and F1, 
print(classification_report(golden_numeric.salary, gn_rf_predictions))
# Confusion Matrix
print(confusion_matrix(golden_numeric.salary, gn_rf_predictions))

In [72]:
# Decision Tree 
decisiontreemodel = DecisionTreeClassifier(criterion='entropy', max_depth=None)
decisiontreemodel.fit(df_numeric.drop(['salary'], axis=1), df_numeric.salary)


In [73]:
decisiontreemodel.tree_.node_count

7475

In [74]:
list(zip(golden_numeric.drop(['salary'], axis=1).columns, decisiontreemodel.feature_importances_))

[('age', 0.32735661080544304),
 ('education-num', 0.1665411236366068),
 ('capital-gain', 0.24685739631071166),
 ('capital-loss', 0.09811226499316932),
 ('hours-per-week', 0.1611326042540692)]

In [75]:
gn_dt_predictions = decisiontreemodel.predict(golden_numeric.drop(['salary'], axis=1))
gn_dt_predictions

array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' >50K', ' <=50K'],
      dtype=object)

In [79]:
# Precision, Recall, and F1, 
print(classification_report(golden_numeric.salary, gn_dt_predictions))
# Confusion Matrix
print(confusion_matrix(golden_numeric.salary, gn_dt_predictions))

              precision    recall  f1-score   support

       <=50K       0.85      0.93      0.89     12435
        >50K       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281

[[11513   922]
 [ 2070  1776]]


In [80]:
# Based on the above statistics on prediciso, recall, f1 and the confusion matrices, the RandomForest has a slightly better performance.

# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

In [81]:
df_nonnumeric = df.drop(['salary','age','education-num','capital-gain','capital-loss','hours-per-week'], axis = 1)
df_nonnumeric

Unnamed: 0,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...,...
32556,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [82]:
import pandas as pd
df_nonnumeric['workclass']
workclass_cats = pd.unique(df_nonnumeric['workclass'])
workclass_cats

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [83]:
from sklearn.preprocessing import OneHotEncoder

In [84]:
df_nonnumeric['workclass']
autoencoder = OneHotEncoder(categories='auto')
workclass = autoencoder.fit_transform(df_nonnumeric[['workclass']])
print(workclass)
workclass_array = pd.DataFrame(workclass.toarray())
workclass_array

  (0, 7)	1.0
  (1, 6)	1.0
  (2, 4)	1.0
  (3, 4)	1.0
  (4, 4)	1.0
  (5, 4)	1.0
  (6, 4)	1.0
  (7, 6)	1.0
  (8, 4)	1.0
  (9, 4)	1.0
  (10, 4)	1.0
  (11, 7)	1.0
  (12, 4)	1.0
  (13, 4)	1.0
  (14, 4)	1.0
  (15, 4)	1.0
  (16, 6)	1.0
  (17, 4)	1.0
  (18, 4)	1.0
  (19, 6)	1.0
  (20, 4)	1.0
  (21, 4)	1.0
  (22, 1)	1.0
  (23, 4)	1.0
  (24, 4)	1.0
  :	:
  (32536, 4)	1.0
  (32537, 4)	1.0
  (32538, 4)	1.0
  (32539, 0)	1.0
  (32540, 7)	1.0
  (32541, 0)	1.0
  (32542, 0)	1.0
  (32543, 2)	1.0
  (32544, 4)	1.0
  (32545, 2)	1.0
  (32546, 4)	1.0
  (32547, 4)	1.0
  (32548, 6)	1.0
  (32549, 7)	1.0
  (32550, 6)	1.0
  (32551, 4)	1.0
  (32552, 4)	1.0
  (32553, 4)	1.0
  (32554, 4)	1.0
  (32555, 4)	1.0
  (32556, 4)	1.0
  (32557, 4)	1.0
  (32558, 4)	1.0
  (32559, 4)	1.0
  (32560, 5)	1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32557,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32558,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32559,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [85]:
workclass_array.columns = workclass_cats
workclass_array

Unnamed: 0,State-gov,Self-emp-not-inc,Private,Federal-gov,Local-gov,?,Self-emp-inc,Without-pay,Never-worked
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32557,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32558,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32559,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [86]:
df_numeric

Unnamed: 0,salary,age,education-num,capital-gain,capital-loss,hours-per-week
0,<=50K,39,13,2174,0,40
1,<=50K,50,13,0,0,13
2,<=50K,38,9,0,0,40
3,<=50K,53,7,0,0,40
4,<=50K,28,13,0,0,40
...,...,...,...,...,...,...
32556,<=50K,27,12,0,0,38
32557,>50K,40,9,0,0,40
32558,<=50K,58,9,0,0,40
32559,<=50K,22,9,0,0,20


In [87]:
df_num_workclass = pd.concat([df_numeric, workclass_array], axis=1)
df_num_workclass

Unnamed: 0,salary,age,education-num,capital-gain,capital-loss,hours-per-week,State-gov,Self-emp-not-inc,Private,Federal-gov,Local-gov,?,Self-emp-inc,Without-pay,Never-worked
0,<=50K,39,13,2174,0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,<=50K,50,13,0,0,13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,<=50K,38,9,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,<=50K,53,7,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,<=50K,28,13,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,<=50K,27,12,0,0,38,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32557,>50K,40,9,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32558,<=50K,58,9,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32559,<=50K,22,9,0,0,20,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [88]:
decisiontreemodel.fit(df_num_workclass.drop(['salary'], axis=1), df_num_workclass.salary)
decisiontreemodel.tree_.node_count

9959

In [89]:
# Creating Test Set + New Columns
golden['workclass']

golden['workclass']
## Already Created Autoencoder ## autoencoder = OneHotEncoder(categories='auto')
workclass_golden = autoencoder.fit_transform(golden[['workclass']])
print(workclass_golden)
workclass_golden_array = pd.DataFrame(workclass_golden.toarray())
workclass_golden_array.columns = workclass_cats
workclass_golden_array
golden_num_workclass = pd.concat([golden_numeric, workclass_golden_array], axis=1)
golden_num_workclass

  (0, 4)	1.0
  (1, 4)	1.0
  (2, 2)	1.0
  (3, 4)	1.0
  (4, 0)	1.0
  (5, 4)	1.0
  (6, 0)	1.0
  (7, 6)	1.0
  (8, 4)	1.0
  (9, 4)	1.0
  (10, 4)	1.0
  (11, 1)	1.0
  (12, 4)	1.0
  (13, 0)	1.0
  (14, 4)	1.0
  (15, 4)	1.0
  (16, 7)	1.0
  (17, 4)	1.0
  (18, 4)	1.0
  (19, 4)	1.0
  (20, 4)	1.0
  (21, 4)	1.0
  (22, 0)	1.0
  (23, 4)	1.0
  (24, 4)	1.0
  :	:
  (16256, 4)	1.0
  (16257, 5)	1.0
  (16258, 4)	1.0
  (16259, 4)	1.0
  (16260, 4)	1.0
  (16261, 4)	1.0
  (16262, 4)	1.0
  (16263, 4)	1.0
  (16264, 6)	1.0
  (16265, 2)	1.0
  (16266, 4)	1.0
  (16267, 4)	1.0
  (16268, 4)	1.0
  (16269, 4)	1.0
  (16270, 4)	1.0
  (16271, 4)	1.0
  (16272, 4)	1.0
  (16273, 4)	1.0
  (16274, 2)	1.0
  (16275, 4)	1.0
  (16276, 4)	1.0
  (16277, 0)	1.0
  (16278, 4)	1.0
  (16279, 4)	1.0
  (16280, 5)	1.0


Unnamed: 0,salary,age,education-num,capital-gain,capital-loss,hours-per-week,State-gov,Self-emp-not-inc,Private,Federal-gov,Local-gov,?,Self-emp-inc,Without-pay,Never-worked
0,<=50K,25,7,0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,<=50K,38,9,0,0,50,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,>50K,28,12,0,0,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,>50K,44,10,7688,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,<=50K,18,10,0,0,30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,<=50K,39,13,0,0,36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16277,<=50K,64,9,0,0,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16278,<=50K,38,13,0,0,50,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16279,<=50K,44,13,5455,0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [90]:
df_num_workclass_pred = decisiontreemodel.predict(golden_num_workclass.drop(['salary'], axis=1))

In [91]:
# Results with Workclass Columns
# Precision, Recall, and F1, 
print(classification_report(golden_num_workclass.salary, df_num_workclass_pred))
# Confusion Matrix
print(confusion_matrix(golden_num_workclass.salary, df_num_workclass_pred))

              precision    recall  f1-score   support

       <=50K       0.85      0.91      0.88     12435
        >50K       0.63      0.48      0.54      3846

    accuracy                           0.81     16281
   macro avg       0.74      0.70      0.71     16281
weighted avg       0.80      0.81      0.80     16281

[[11371  1064]
 [ 2016  1830]]


In [92]:
# Education
df_nonnumeric['education']
## autoencoder = OneHotEncoder(categories='auto')
education = autoencoder.fit_transform(df_nonnumeric[['education']])
## print(education)
education_array = pd.DataFrame(education.toarray())
ed_cats = pd.unique(df_nonnumeric['education'])
ed_cats
education_array.columns = ed_cats
education_array

df_num_ed = pd.concat([df_num_workclass, education_array], axis=1)
df_num_ed
# golden_num_ed = pd.concat([workclass_golden_array, education_array], axis=1)
# golden_num_ed

Unnamed: 0,salary,age,education-num,capital-gain,capital-loss,hours-per-week,State-gov,Self-emp-not-inc,Private,Federal-gov,...,Assoc-acdm,Assoc-voc,7th-8th,Doctorate,Prof-school,5th-6th,10th,1st-4th,Preschool,12th
0,<=50K,39,13,2174,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,<=50K,50,13,0,0,13,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,<=50K,38,9,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,<=50K,53,7,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,<=50K,28,13,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,<=50K,27,12,0,0,38,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32557,>50K,40,9,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32558,<=50K,58,9,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
32559,<=50K,22,9,0,0,20,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [93]:
# Creating Test Set + New Columns
golden['education']

## Already Created Autoencoder ## autoencoder = OneHotEncoder(categories='auto')
ed_golden = autoencoder.fit_transform(golden[['education']])
ed_golden_array = pd.DataFrame(ed_golden.toarray())
ed_golden_array.columns = ed_cats
ed_golden_array
golden_num_ed = pd.concat([golden_num_workclass, ed_golden_array], axis=1)
golden_num_ed

Unnamed: 0,salary,age,education-num,capital-gain,capital-loss,hours-per-week,State-gov,Self-emp-not-inc,Private,Federal-gov,...,Assoc-acdm,Assoc-voc,7th-8th,Doctorate,Prof-school,5th-6th,10th,1st-4th,Preschool,12th
0,<=50K,25,7,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,<=50K,38,9,0,0,50,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,>50K,28,12,0,0,40,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,>50K,44,10,7688,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,<=50K,18,10,0,0,30,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,<=50K,39,13,0,0,36,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
16277,<=50K,64,9,0,0,40,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16278,<=50K,38,13,0,0,50,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
16279,<=50K,44,13,5455,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
decisiontreemodel.fit(df_num_ed.drop(['salary'], axis=1), df_num_ed.salary)
decisiontreemodel.tree_.node_count

9907

In [95]:
df_num_ed_pred = decisiontreemodel.predict(golden_num_ed.drop(['salary'], axis=1))
df_num_ed_pred

array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' >50K', ' >50K'],
      dtype=object)

In [96]:
# Precision, Recall, and F1, 
print(classification_report(golden_num_ed.salary, df_num_ed_pred))
# Confusion Matrix
print(confusion_matrix(golden_num_ed.salary, df_num_ed_pred))

              precision    recall  f1-score   support

       <=50K       0.85      0.91      0.88     12435
        >50K       0.63      0.47      0.54      3846

    accuracy                           0.81     16281
   macro avg       0.74      0.69      0.71     16281
weighted avg       0.80      0.81      0.80     16281

[[11364  1071]
 [ 2023  1823]]


In [98]:
# Education
df_nonnumeric['marital-status']

0              Never-married
1         Married-civ-spouse
2                   Divorced
3         Married-civ-spouse
4         Married-civ-spouse
                ...         
32556     Married-civ-spouse
32557     Married-civ-spouse
32558                Widowed
32559          Never-married
32560     Married-civ-spouse
Name: marital-status, Length: 32561, dtype: object