In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [38]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [39]:
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.NaN if x=="?" else x)

In [40]:
df.describe(include="all")

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32561.0,30725,32561.0,32561,32561.0,32561,30718,32561,32561,32561,32561.0,32561.0,32561.0,31978,32561
unique,,8,,16,,7,14,6,5,2,,,,41,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


Melakukan preprocess untuk kolom-kolom berikut dengan encoding:
- education
- marital.status
- relationship
- race
- sex

In [41]:
adult_encoded = df.copy()
adult_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


Encoding kolom Education menggunakan OneHotEncoder

In [42]:
education_encoder = OneHotEncoder(sparse_output=False)
education_encoder.fit(pd.DataFrame(adult_encoded['education']))

In [43]:
education_encoder_result = education_encoder.transform(pd.DataFrame(adult_encoded['education']))
print(education_encoder_result)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [44]:
education_encoder.categories_

[array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
        'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
        'Masters', 'Preschool', 'Prof-school', 'Some-college'],
       dtype=object)]

In [45]:
education_encoder_result_df = pd.DataFrame(education_encoder_result, columns=education_encoder.categories_)
education_encoder_result_df.head()

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Mencoba melakukan Encoding kolom Education menggunakan Label Encoder

In [46]:
education_label_encoder = LabelEncoder()
education_label_encoder.fit(adult_encoded['education'])

In [47]:
adult_encoded['education'] = education_label_encoder.transform(adult_encoded['education'])
adult_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,11,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,11,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,15,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,5,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,15,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [48]:
inverse_education_label_encoder_result = education_label_encoder.inverse_transform(adult_encoded['education'])
print(inverse_education_label_encoder_result)

['HS-grad' 'HS-grad' 'Some-college' ... 'HS-grad' 'HS-grad' 'HS-grad']


Encoding kolom marital.status menggunakan OneHotEncoder

In [49]:
marital_status_one_hot_encoder = OneHotEncoder(sparse_output=False)
marital_status_one_hot_encoder.fit(pd.DataFrame(adult_encoded['marital.status']))

In [50]:
marital_status_one_hot_encoder_result = marital_status_one_hot_encoder.transform(pd.DataFrame(adult_encoded['marital.status']))
print(marital_status_one_hot_encoder_result)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 1. 0. 0.]]


In [51]:
marital_status_one_hot_encoder.categories_

[array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
       dtype=object)]

In [52]:
marital_status_one_hot_encoder_result_df = pd.DataFrame(marital_status_one_hot_encoder_result, columns=marital_status_one_hot_encoder.categories_)
marital_status_one_hot_encoder_result_df.head()

Unnamed: 0,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Mencoba melakukan Encoding kolom marital.status menggunakan Label Encoder

In [53]:
marital_status_label_encoder = LabelEncoder()
marital_status_label_encoder.fit(adult_encoded['marital.status'])

In [54]:
adult_encoded['marital.status'] = marital_status_label_encoder.transform(adult_encoded['marital.status'])
adult_encoded

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,11,9,6,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,11,9,6,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,15,10,6,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,5,4,0,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,15,10,5,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,15,10,4,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,7,12,2,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,11,9,2,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,11,9,6,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [55]:
marital_status_label_encoder_result = marital_status_label_encoder.inverse_transform(adult_encoded['marital.status'])
print(marital_status_label_encoder_result)

['Widowed' 'Widowed' 'Widowed' ... 'Married-civ-spouse' 'Widowed'
 'Never-married']


Encoding kolom relationship menggunakan OneHotEncoder

In [56]:
relationship_one_hot_encoder = OneHotEncoder(sparse_output=False)
relationship_one_hot_encoder.fit(pd.DataFrame(adult_encoded['relationship']))

In [57]:
relationship_one_hot_encoder_result = relationship_one_hot_encoder.transform(pd.DataFrame(adult_encoded['relationship']))
print(relationship_one_hot_encoder_result)

[[0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]]


In [58]:
relationship_one_hot_encoder.categories_

[array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
        'Unmarried', 'Wife'], dtype=object)]

In [59]:
relationship_one_hot_encoder_result_df = pd.DataFrame(relationship_one_hot_encoder_result, columns=relationship_one_hot_encoder.categories_)
relationship_one_hot_encoder_result_df.head()

Unnamed: 0,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0


Encoding kolom race menggunakan OneHotEncoder

In [60]:
race_one_hot_encoder = OneHotEncoder(sparse_output=False)
race_one_hot_encoder.fit(pd.DataFrame(adult_encoded['race']))

In [61]:
race_one_hot_encoder_result = race_one_hot_encoder.transform(pd.DataFrame(adult_encoded['race']))
print(race_one_hot_encoder_result)

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [62]:
race_one_hot_encoder.categories_

[array(['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
        'White'], dtype=object)]

In [63]:
race_one_hot_encoder_result_df = pd.DataFrame(race_one_hot_encoder_result, columns=race_one_hot_encoder.categories_)
race_one_hot_encoder_result_df.head()

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0


Encoding kolom sex menggunakan Label Binarizer

In [64]:
sex_label_binarizer = LabelBinarizer()
sex_label_binarizer.fit(adult_encoded['sex'])

In [65]:
adult_encoded['sex'] = sex_label_binarizer.transform(adult_encoded['sex'])
adult_encoded

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,11,9,6,,Not-in-family,White,0,0,4356,40,United-States,<=50K
1,82,Private,132870,11,9,6,Exec-managerial,Not-in-family,White,0,0,4356,18,United-States,<=50K
2,66,,186061,15,10,6,,Unmarried,Black,0,0,4356,40,United-States,<=50K
3,54,Private,140359,5,4,0,Machine-op-inspct,Unmarried,White,0,0,3900,40,United-States,<=50K
4,41,Private,264663,15,10,5,Prof-specialty,Own-child,White,0,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,15,10,4,Protective-serv,Not-in-family,White,1,0,0,40,United-States,<=50K
32557,27,Private,257302,7,12,2,Tech-support,Wife,White,0,0,0,38,United-States,<=50K
32558,40,Private,154374,11,9,2,Machine-op-inspct,Husband,White,1,0,0,40,United-States,>50K
32559,58,Private,151910,11,9,6,Adm-clerical,Unmarried,White,0,0,0,40,United-States,<=50K


In [73]:
workclass_label_binarizer = LabelBinarizer()
workclass_label_binarizer.fit(adult_encoded['workclass'])

ValueError: Unknown label type: (0            NaN
1        Private
2            NaN
3        Private
4        Private
          ...   
32556    Private
32557    Private
32558    Private
32559    Private
32560    Private
Name: workclass, Length: 32561, dtype: object,)

In [66]:
invesre_sex_label_binarizer = sex_label_binarizer.inverse_transform(adult_encoded['sex'])
invesre_sex_label_binarizer

array(['Female', 'Female', 'Female', ..., 'Male', 'Female', 'Male'],
      dtype='<U6')

In [67]:
X = adult_encoded.drop('income', axis=1)
y = adult_encoded['income']

In [71]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,,77053,11,9,6,,Not-in-family,White,0,0,4356,40,United-States
1,82,Private,132870,11,9,6,Exec-managerial,Not-in-family,White,0,0,4356,18,United-States
2,66,,186061,15,10,6,,Unmarried,Black,0,0,4356,40,United-States
3,54,Private,140359,5,4,0,Machine-op-inspct,Unmarried,White,0,0,3900,40,United-States
4,41,Private,264663,15,10,5,Prof-specialty,Own-child,White,0,0,3900,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,15,10,4,Protective-serv,Not-in-family,White,1,0,0,40,United-States
32557,27,Private,257302,7,12,2,Tech-support,Wife,White,0,0,0,38,United-States
32558,40,Private,154374,11,9,2,Machine-op-inspct,Husband,White,1,0,0,40,United-States
32559,58,Private,151910,11,9,6,Adm-clerical,Unmarried,White,0,0,0,40,United-States


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=10)

In [70]:
tree_model = DecisionTreeClassifier(max_depth=3, random_state=10)
tree_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Private'