# **1. Import and EDA**

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

# statistical tests
from scipy import stats

# train test split
from sklearn.model_selection import train_test_split

# impute missing values
from sklearn.impute import SimpleImputer # mean, median, most_frequent (mode), constant
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # regresi
from sklearn.impute import KNNImputer # regresi KKN

# encoding
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder, BinaryEncoder

# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 

# column transformer & pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE 
from imblearn.under_sampling import RandomUnderSampler, NearMiss 

# cross validation
from sklearn.model_selection import cross_val_score

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [4]:
#Defining Function

def dataDescription(df):
    """Display a per-column summary table of ``df``.

    For every column the table lists its name, dtype, count and percentage
    of missing values, number of distinct non-null values, and the sorted
    list of its distinct values. The table is rendered with ``display``;
    the function itself returns nothing.
    """
    n_rows = len(df)

    def _summarise(name):
        # One summary row for the column called `name`.
        series = df[name]
        n_missing = series.isna().sum()
        return [
            name,
            series.dtype,
            n_missing,
            round(n_missing / n_rows * 100, 2),
            series.nunique(),
            list(series.drop_duplicates().sort_values().values),
        ]

    header = ['Col', 'Data Type', 'Missing Value', 'Pct Missing Value', 'Num Unique', 'Unique Sample']
    summary = pd.DataFrame(
        data=[_summarise(name) for name in df.columns],
        columns=header,
    )
    display(summary)

def normalCheckShapiro(data, alpha=0.05):
    """Run a Shapiro-Wilk normality test on ``data`` and print the verdict.

    Parameters
    ----------
    data : array-like
        Sample handed to ``scipy.stats.shapiro``.
    alpha : float, default 0.05
        Significance level. A p-value strictly greater than ``alpha`` is
        reported as normally distributed; anything else is reported as
        not normally distributed.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # `stats` is scipy.stats, imported in the notebook's import cell.
    _, p_value = stats.shapiro(data)

    if p_value > alpha:
        print("The data is normally distributed.")
    else:
        print("The data is not normally distributed.")


In [2]:
# load dataset
# Reads 'adult.csv' through a path relative to the kernel's working directory
# and binds the result to `df`, the frame later cells inspect and clean.
df = pd.read_csv('adult.csv')
# Bare last expression so the notebook renders the first rows as a table.
df.head() 

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [7]:
#Descriptive Analysis
# df.info() and dataDescription() emit their own output and return None, so
# they are called as statements; passing them to display() would render a
# stray "None" for each and show their output ahead of everything else.
df.info()
display(df.describe(), df.isnull().sum(), df.head())
dataDescription(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,Col,Data Type,Missing Value,Pct Missing Value,Num Unique,Unique Sample
0,age,int64,0,0.0,73,"[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2..."
1,workclass,object,0,0.0,9,"[?, Federal-gov, Local-gov, Never-worked, Priv..."
2,fnlwgt,int64,0,0.0,21648,"[12285, 13769, 14878, 18827, 19214, 19302, 193..."
3,education,object,0,0.0,16,"[10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, ..."
4,education.num,int64,0,0.0,16,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
5,marital.status,object,0,0.0,7,"[Divorced, Married-AF-spouse, Married-civ-spou..."
6,occupation,object,0,0.0,15,"[?, Adm-clerical, Armed-Forces, Craft-repair, ..."
7,relationship,object,0,0.0,6,"[Husband, Not-in-family, Other-relative, Own-c..."
8,race,object,0,0.0,5,"[Amer-Indian-Eskimo, Asian-Pac-Islander, Black..."
9,sex,object,0,0.0,2,"[Female, Male]"


None

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


None

# **2. Cleaning and Pre-Processing Schema**

In [8]:
#Change '?' to missing value
# The raw data marks unknown values with the literal string '?'. Rebind `df`
# to the replaced copy rather than mutating with inplace=True, so the cell
# stays a plain expression and re-running it is harmless.
df = df.replace('?', np.nan)
dataDescription(df)

Unnamed: 0,Col,Data Type,Missing Value,Pct Missing Value,Num Unique,Unique Sample
0,age,int64,0,0.0,73,"[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2..."
1,workclass,object,1836,5.64,8,"[Federal-gov, Local-gov, Never-worked, Private..."
2,fnlwgt,int64,0,0.0,21648,"[12285, 13769, 14878, 18827, 19214, 19302, 193..."
3,education,object,0,0.0,16,"[10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, ..."
4,education.num,int64,0,0.0,16,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
5,marital.status,object,0,0.0,7,"[Divorced, Married-AF-spouse, Married-civ-spou..."
6,occupation,object,1843,5.66,14,"[Adm-clerical, Armed-Forces, Craft-repair, Exe..."
7,relationship,object,0,0.0,6,"[Husband, Not-in-family, Other-relative, Own-c..."
8,race,object,0,0.0,5,"[Amer-Indian-Eskimo, Asian-Pac-Islander, Black..."
9,sex,object,0,0.0,2,"[Female, Male]"


In [None]:
#Pre Processing Schema
# Sub-pipeline for the categorical columns that are binary-encoded: missing
# values are first filled with the constant 'NC', then the column is
# binary-encoded. BinaryEncoder is imported by name in the import cell
# (there is no `ce` alias), so it is referenced directly.
BE_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value='NC')),
    ('BE', BinaryEncoder())
])

# NOTE(review): 'education' (and 'income', if it is still part of the frame
# being transformed) are object columns not listed below, so
# remainder='passthrough' forwards them unencoded — confirm they are dropped
# or encoded before a model is fitted.
transformer = ColumnTransformer(
    [
        ('OHE', OneHotEncoder(drop='first'), ['relationship','race','sex']),
        ('Binary Enc', BE_pipeline,['workclass','marital.status','occupation','native.country'])
    ],
    remainder='passthrough' # pass through, unchanged, every column not mentioned above
)