# **Recap Modeling Process**

**CONTENT :**
1. Load Data
2. Prepare Data
3. Built Pre-Processing
4. Built Model (KNN, LogReg, DT)
5. Best Model (Based on Accuracy)

**Target : Income Prediction**

In [128]:
# Library

#Standard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

#Data Pre-Processing
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler

#Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

#Data Output
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline #menggabungkan proces

warnings.filterwarnings('ignore')

In [129]:
#Defining Function

def dataDescription(df):
    tempList = []
    for col in df.columns:
        tempList.append(
            [col,
            df[col].dtype,
            df[col].isna().sum(),
            round(df[col].isna().sum()/len(df)*100,2),
            df[col].nunique(),
            #list(df[col].drop_duplicates().sample(5,replace=True).values)
            list(df[col].drop_duplicates().sort_values().values)
            ]
        )

    descData = pd.DataFrame(data = tempList,
                            columns = ['Col','Data Type','Missing Value', 'Pct Missing Value','Num Unique','Unique Sample']
                            )
    display(descData)

## **Data Pre-Processing**

In [130]:
#Load Data
df = pd.read_csv('adult.csv')
display(df.info(),df.describe(),df.describe(exclude='number'),df.isnull().sum(),df.head(),dataDescription(df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


**SKEMA PRE-PROCESSING**
1. Missing Value : Fill Value 'NC'
2. One Hot Encoding : Relationship, Race, Sex
3. Binary Encoding: Workclass, Marital Status, Occupation, Native Country
4. Ordinal Encoding: Education (Already Encdoed)
5. No Treatment : Size
6. Take Out : fnlwgt

In [132]:
#Change '?' to missing value
df.replace('?',np.nan, inplace=True)
dataDescription(df)

Unnamed: 0,Col,Data Type,Missing Value,Pct Missing Value,Num Unique,Unique Sample
0,age,int64,0,0.0,73,"[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2..."
1,workclass,object,1836,5.64,8,"[Federal-gov, Local-gov, Never-worked, Private..."
2,fnlwgt,int64,0,0.0,21648,"[12285, 13769, 14878, 18827, 19214, 19302, 193..."
3,education,object,0,0.0,16,"[10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, ..."
4,education.num,int64,0,0.0,16,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
5,marital.status,object,0,0.0,7,"[Divorced, Married-AF-spouse, Married-civ-spou..."
6,occupation,object,1843,5.66,14,"[Adm-clerical, Armed-Forces, Craft-repair, Exe..."
7,relationship,object,0,0.0,6,"[Husband, Not-in-family, Other-relative, Own-c..."
8,race,object,0,0.0,5,"[Amer-Indian-Eskimo, Asian-Pac-Islander, Black..."
9,sex,object,0,0.0,2,"[Female, Male]"


In [133]:
#Skema Pre-Processing
BE_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value='NC')),
    ('BE', ce.BinaryEncoder())
])

transformer = ColumnTransformer(
    [
        ('OHE', OneHotEncoder(drop='first'), ['relationship','race','sex']),
        ('Binary Enc', BE_pipeline,['workclass','marital.status','occupation','native.country'])
    ],
    remainder='passthrough' #LEWATKAN YANG GAK DIMENTION
)

In [134]:
transformer

In [135]:
# Data Spliting

x = df.drop(columns=['fnlwgt','education','income'])
y = np.where(df['income']=='>50K',1,0)

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    stratify= y,
    random_state=100,
    test_size= 0.2
)

In [136]:
# Transforming Data

xtrain_prepros = transformer.fit_transform(xtrain)
xtest_prepros = transformer.transform(xtest)
xtrain_prepros

array([[1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.500e+01],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 7.298e+03, 0.000e+00,
        4.000e+01],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        4.000e+01],
       ...,
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        5.000e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        4.000e+01],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        4.000e+01]])

In [137]:
xtrain_prepros = pd.DataFrame(xtrain_prepros)
xtest_prepros = pd.DataFrame(xtest_prepros)
xtrain_prepros

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,23.0,13.0,0.0,0.0,15.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,36.0,10.0,7298.0,0.0,40.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,33.0,9.0,0.0,0.0,40.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,51.0,13.0,0.0,1902.0,40.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,33.0,6.0,0.0,0.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,11.0,0.0,0.0,40.0
26044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,30.0,9.0,0.0,0.0,40.0
26045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,27.0,10.0,0.0,0.0,50.0
26046,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,27.0,10.0,0.0,0.0,40.0


In [138]:
feature = list(transformer.transformers_[0][1].get_feature_names_out()) + list(transformer.transformers_[1][1]['BE'].get_feature_names_out()) + ['age', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']

xtrain_prepros.columns = feature
xtest_prepros.columns = feature

xtrain_prepros



Unnamed: 0,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male,...,3_1,3_2,3_3,3_4,3_5,age,education.num,capital.gain,capital.loss,hours.per.week
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,23.0,13.0,0.0,0.0,15.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,36.0,10.0,7298.0,0.0,40.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,33.0,9.0,0.0,0.0,40.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,51.0,13.0,0.0,1902.0,40.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,33.0,6.0,0.0,0.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,11.0,0.0,0.0,40.0
26044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,30.0,9.0,0.0,0.0,40.0
26045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,27.0,10.0,0.0,0.0,50.0
26046,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,27.0,10.0,0.0,0.0,40.0


In [139]:
# GET BEST MODEL

#Logistics Regression
logreg = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=10)
tree = DecisionTreeClassifier(max_depth=10)

acc = []

for model in [logreg,knn,tree] :
    model.fit(xtrain_prepros,ytrain)
    ypred = model.predict(xtest_prepros)
    acc.append(accuracy_score(ytest,ypred))

pd.DataFrame({
    'Model' : ['Logistic Regression','KNN', 'Decission Tree'],
    'Accuracy Score': acc
}).sort_values('Accuracy Score', ascending=False)

Unnamed: 0,Model,Accuracy Score
2,Decission Tree,0.855213
1,KNN,0.849685
0,Logistic Regression,0.839398


In [141]:
model.predict_proba(xtest_prepros)

array([[9.99420290e-01, 5.79710145e-04],
       [5.19926426e-01, 4.80073574e-01],
       [5.19926426e-01, 4.80073574e-01],
       ...,
       [7.72727273e-01, 2.27272727e-01],
       [9.99420290e-01, 5.79710145e-04],
       [9.69696970e-01, 3.03030303e-02]])

In [144]:

# UNTUK MENGECEK FEATURE PALINGBBERPENGARUH
imp_table = pd.DataFrame({
    'importance Value' : model.feature_importances_
}, index=feature).sort_values('importance Value',ascending=False)

imp_table

Unnamed: 0,importance Value
1_1,0.327959
education.num,0.236438
capital.gain,0.199495
capital.loss,0.072484
age,0.053203
hours.per.week,0.04412
1_2,0.016612
2_0,0.00966
relationship_Wife,0.006725
2_1,0.004662


#