data : adult.csv  
* target : income  
preprocess:  
* missing value : simple imputer with constant  
* one hot encoding : relationship, race, sex  
* binary encoding : workclass, marital status, occupation, native country    
* ordinal encoding : education (already encoded)  
* no treatment : numerical  
out : fnlwgt  
Random state 10, data splitting 70:30 model Tree(max depth 5,  criterion entropy)

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data

An individual’s annual income results from various factors. Intuitively, it is influenced by the individual’s education level, age, gender, occupation, and etc.
<br>
Fields:
<br>
The dataset contains 16 columns
<br>
Target filed: Income
<br>
-- The income is divide into two classes: 50K
<br>
Number of attributes: 14
<br>
-- These are the demographics and other features to describe a person

We will explore the possibility in predicting income level based on the individual’s personal information.


In [14]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [15]:
df['income'].value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

### Missing Value
Missing value in this dataset marked with "?"

In [16]:
df = df.replace('?', np.nan)

In [17]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

### Preprocessing

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [19]:
pipe_impute_binary = Pipeline([
    ('imputer',SimpleImputer(strategy = 'constant',fill_value = 'NA')),
    ('binary encoder',ce.BinaryEncoder())
])

transformer = ColumnTransformer([
    ('one hot encoder',OneHotEncoder(drop = 'first'),['relationship','race','sex']),
    ('binary encoder',pipe_impute_binary,['workclass','marital.status','occupation','native.country'])
], remainder = 'passthrough')

### Splitting Data

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x = df.drop(columns=['fnlwgt', 'income', 'education']) #education di drop karena sudah terwakili education.num
y = np.where(df['income']=='>50K', 1, 0)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, stratify=y, random_state=10)

### Data Tramsform

In [23]:
x_train.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
13761,63,Local-gov,9,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
7085,47,Private,13,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,El-Salvador
21420,90,Private,4,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
2774,64,Self-emp-not-inc,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,7298,0,45,United-States
21708,33,Private,13,Separated,Sales,Not-in-family,White,Female,0,0,50,United-States


In [24]:
x_train_preprocessed = transformer.fit_transform(x_train)
x_test_preprocessed = transformer.transform(x_test)

  elif pd.api.types.is_categorical(cols):


In [25]:
x_train_preprocessed = pd.DataFrame(x_train_preprocessed)
x_test_preprocessed = pd.DataFrame(x_test_preprocessed)

In [26]:
x_train_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,63.0,9.0,0.0,0.0,40.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,47.0,13.0,0.0,0.0,40.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,90.0,4.0,0.0,0.0,40.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,64.0,9.0,7298.0,0.0,45.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,13.0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24415,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,25.0,10.0,0.0,0.0,35.0
24416,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,51.0,15.0,3325.0,0.0,40.0
24417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,60.0,11.0,0.0,0.0,40.0
24418,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,23.0,10.0,0.0,0.0,30.0


In [27]:
transformer.transformers_

[('one hot encoder',
  OneHotEncoder(drop='first'),
  ['relationship', 'race', 'sex']),
 ('binary encoder',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='NA', strategy='constant')),
                  ('binary encoder', BinaryEncoder())]),
  ['workclass', 'marital.status', 'occupation', 'native.country']),
 ('remainder', 'passthrough', [0, 2, 8, 9, 10])]

In [28]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_Not-in-family', 'x0_Other-relative', 'x0_Own-child',
       'x0_Unmarried', 'x0_Wife', 'x1_Asian-Pac-Islander', 'x1_Black',
       'x1_Other', 'x1_White', 'x2_Male'], dtype=object)

In [29]:
transformer.transformers_[1][1]['binary encoder'].get_feature_names()

['0_0',
 '0_1',
 '0_2',
 '0_3',
 '0_4',
 '1_0',
 '1_1',
 '1_2',
 '1_3',
 '2_0',
 '2_1',
 '2_2',
 '2_3',
 '2_4',
 '3_0',
 '3_1',
 '3_2',
 '3_3',
 '3_4',
 '3_5',
 '3_6']

In [30]:
features = list(transformer.transformers_[0][1].get_feature_names()) + transformer.transformers_[1][1]['binary encoder'].get_feature_names() + ['age','education.num','capital.gain','capital.loss','hours.per.week']
len(features)

36

In [31]:
x_train_preprocessed.columns = features
x_test_preprocessed.columns = features

In [32]:
x_train_preprocessed

Unnamed: 0,x0_Not-in-family,x0_Other-relative,x0_Own-child,x0_Unmarried,x0_Wife,x1_Asian-Pac-Islander,x1_Black,x1_Other,x1_White,x2_Male,...,3_2,3_3,3_4,3_5,3_6,age,education.num,capital.gain,capital.loss,hours.per.week
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,63.0,9.0,0.0,0.0,40.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,47.0,13.0,0.0,0.0,40.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,90.0,4.0,0.0,0.0,40.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,64.0,9.0,7298.0,0.0,45.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,13.0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24415,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,25.0,10.0,0.0,0.0,35.0
24416,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,51.0,15.0,3325.0,0.0,40.0
24417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,60.0,11.0,0.0,0.0,40.0
24418,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,23.0,10.0,0.0,0.0,30.0


### Model Fitting and Evaluation

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [34]:
tree = DecisionTreeClassifier(max_depth = 5, criterion='entropy')
tree.fit(x_train_preprocessed, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [35]:
y_predict = tree.predict(x_test_preprocessed)
print('mse:', mean_squared_error(y_test,y_predict))
print('akurasi:',accuracy_score(y_test,y_predict))

mse: 0.15772018179584818
akurasi: 0.8422798182041519


### With Polynomial

In [42]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [38]:
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
x_train_poly = poly.fit_transform(x_train_preprocessed)
x_test_poly = poly.fit_transform(x_test_preprocessed)

In [39]:
len(poly.get_feature_names())

9138

In [40]:
pd.DataFrame(x_train_poly, columns=poly.get_feature_names())

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x33^3,x33^2 x34,x33^2 x35,x33 x34^2,x33 x34 x35,x33 x35^2,x34^3,x34^2 x35,x34 x35^2,x35^3
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,64000.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,64000.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,64000.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3.886973e+11,0.0,2.396736e+09,0.0,0.0,14778450.0,0.0,0.0,0.0,91125.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,125000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24415,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,42875.0
24416,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3.675995e+10,0.0,4.422250e+08,0.0,0.0,5320000.0,0.0,0.0,0.0,64000.0
24417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,64000.0
24418,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,27000.0


In [43]:
logreg = LogisticRegression(max_iter=100000)
logreg.fit(x_train_poly, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=100000)

In [44]:
y_pred = logreg.predict(x_test_poly)
print('akurasi:', accuracy_score(y_test, y_pred))

akurasi: 0.7592433361994841
