Dengan menggunakan data titanic:  
1. pilih feature yang akan digunakan untuk memprediksi apakah seseorang bisa selamat/tidak ['alive']
2. isi missing value pada feature jika ada (mean/median/modus)
3. buat skema preprocessing, jelaskan alasan skema kalian.
4. evaluasi performa 3 model berikut:
    * logistic regression
    * knn classifier
    * decision tree classifier
    

In [93]:
# Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Dataset

In [3]:
df = pd.read_csv('titanic.csv')
df

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,,Southampton,yes,True
3,female,35.0,0,53.1000,First,C,Southampton,yes,False
4,male,35.0,0,8.0500,Third,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,,Southampton,no,True
887,female,19.0,0,30.0000,First,B,Southampton,yes,True
888,female,,2,23.4500,Third,,Southampton,no,False
889,male,26.0,0,30.0000,First,C,Cherbourg,yes,True


In [4]:
df.isna().sum()

sex              0
age            177
parch            0
fare             0
class            0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [32]:
df['embark_town'].mode()

0    Southampton
dtype: object

In [6]:
from scipy.stats import normaltest
normaltest(df['age'].dropna())
#distribusi tidak normal

NormaltestResult(statistic=18.105032952089758, pvalue=0.00011709599657350757)

In [27]:
from numpy import nan
df['age'] = df['age'].fillna(df['age'].median())
df['embark_town'] = df['embark_town'].replace(nan,'Southampton')

In [28]:
df.isna().sum()

sex              0
age              0
parch            0
fare             0
class            0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [51]:
df['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown'], dtype=object)

# Preprocessing

1. OneHot encoding = sex, alone, alive
2. Ordinal encoding = class
3. Robust encoding = age, fare
4. Binary encoding = embark_town
5. No treatment = deck, parch  
target = alive

In [67]:
ordinal_mapping = [
    {'col':'class',
     'mapping':{None:0,'First':1,'Second':2,'Third':3}}
]

In [68]:
transformer = ColumnTransformer([
    ('one hot',OneHotEncoder(drop='first'),['sex','alone']), #karena pakai regresi (name, transform, kolom)
    ('ordinal', ce.OrdinalEncoder(mapping=ordinal_mapping),['class']),
    ('robust', RobustScaler(),['age','fare']),
    ('binary', ce.BinaryEncoder(),['embark_town'])
], remainder='passthrough')

In [82]:
pd.DataFrame(transformer.fit_transform(df))

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,0,3,-0.461538,-0.312011,0,0,1,0,,no
1,0,0,1,0.769231,2.46124,0,1,0,0,C,yes
2,0,1,3,-0.153846,-0.282777,0,0,1,0,,yes
3,0,0,1,0.538462,1.67373,0,0,1,0,C,yes
4,1,1,3,0.538462,-0.277363,0,0,1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...
886,1,1,2,-0.0769231,-0.0629807,0,0,1,0,,no
887,0,1,1,-0.692308,0.673281,0,0,1,0,B,yes
888,0,0,3,0,0.389604,0,0,1,2,,no
889,1,1,1,-0.153846,0.673281,0,1,0,0,C,yes


# Data Splitting

In [100]:
x = df.drop(columns=['alive','deck'])
y = df['alive'].replace({'yes':1,'no':0})

In [101]:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state = 10)

# Data Transform

In [102]:
x_train.head()

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone
578,female,28.0,0,14.4583,Third,Cherbourg,False
323,female,22.0,1,29.0,Second,Southampton,False
654,female,18.0,0,6.75,Third,Queenstown,True
157,male,30.0,0,8.05,Third,Southampton,True
692,male,28.0,0,56.4958,Third,Southampton,True


In [103]:
x_train_preprocessed = transformer.fit_transform(x_train)
x_test_preprocessed = transformer.transform(x_test)

  elif pd.api.types.is_categorical(cols):


In [104]:
x_train_preprocessed = pd.DataFrame(x_train_preprocessed)
x_test_preprocessed = pd.DataFrame(x_test_preprocessed)

In [105]:
x_train_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,3.0,0.000000,-0.001777,0.0,0.0,1.0,0.0
1,0.0,0.0,2.0,-0.461538,0.618007,0.0,1.0,0.0,1.0
2,0.0,1.0,3.0,-0.769231,-0.330314,0.0,1.0,1.0,0.0
3,1.0,1.0,3.0,0.153846,-0.274907,0.0,1.0,0.0,0.0
4,1.0,1.0,3.0,0.000000,1.789912,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
663,0.0,1.0,1.0,-0.307692,2.335642,0.0,0.0,1.0,0.0
664,1.0,1.0,3.0,-0.461538,-0.309004,0.0,1.0,0.0,0.0
665,1.0,1.0,1.0,0.000000,8.834489,0.0,1.0,0.0,0.0
666,1.0,0.0,3.0,-1.230769,-0.138873,0.0,0.0,1.0,0.0


In [106]:
transformer.transformers_

[('one hot', OneHotEncoder(drop='first'), ['sex', 'alone']),
 ('ordinal',
  OrdinalEncoder(cols=['class'],
                 mapping=[{'col': 'class',
                           'mapping': {None: 0, 'First': 1, 'Second': 2,
                                       'Third': 3}}]),
  ['class']),
 ('robust', RobustScaler(), ['age', 'fare']),
 ('binary', BinaryEncoder(), ['embark_town']),
 ('remainder', 'passthrough', [2])]

In [107]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_male', 'x1_True'], dtype=object)

In [108]:
transformer.transformers_[1][1].get_feature_names()

['class']

In [109]:
transformer.transformers_[3][1].get_feature_names()

['embark_town_0', 'embark_town_1', 'embark_town_2']

In [110]:
features = list(transformer.transformers_[0][1].get_feature_names()) + transformer.transformers_[1][1].get_feature_names() + ['age', 'fare'] + transformer.transformers_[3][1].get_feature_names() + ['parch']
features

['x0_male',
 'x1_True',
 'class',
 'age',
 'fare',
 'embark_town_0',
 'embark_town_1',
 'embark_town_2',
 'parch']

In [111]:
x_train_preprocessed.columns = features
x_test_preprocessed.columns = features

In [112]:
x_train_preprocessed

Unnamed: 0,x0_male,x1_True,class,age,fare,embark_town_0,embark_town_1,embark_town_2,parch
0,0.0,0.0,3.0,0.000000,-0.001777,0.0,0.0,1.0,0.0
1,0.0,0.0,2.0,-0.461538,0.618007,0.0,1.0,0.0,1.0
2,0.0,1.0,3.0,-0.769231,-0.330314,0.0,1.0,1.0,0.0
3,1.0,1.0,3.0,0.153846,-0.274907,0.0,1.0,0.0,0.0
4,1.0,1.0,3.0,0.000000,1.789912,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
663,0.0,1.0,1.0,-0.307692,2.335642,0.0,0.0,1.0,0.0
664,1.0,1.0,3.0,-0.461538,-0.309004,0.0,1.0,0.0,0.0
665,1.0,1.0,1.0,0.000000,8.834489,0.0,1.0,0.0,0.0
666,1.0,0.0,3.0,-1.230769,-0.138873,0.0,0.0,1.0,0.0


# Model Fitting and evaluation


In [130]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Logistic Regression

In [138]:
model = LogisticRegression()
model.fit(x_train_preprocessed, y_train)

LogisticRegression()

In [139]:
accuracy_score(y_test,model.predict(x_test_preprocessed)) 

0.8385650224215246

## KNN Classifier

In [128]:
knn = KNeighborsClassifier(n_neighbors=3) 
knn.fit(x_train_preprocessed,y_train)

KNeighborsClassifier(n_neighbors=3)

In [129]:
y_predict = knn.predict(x_test_preprocessed)
accuracy_score(y_test,y_predict)

0.820627802690583

## Decision Tree Classifier

In [144]:
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(x_train_preprocessed,y_train)

DecisionTreeClassifier(max_depth=3)

In [145]:
y_predict = tree.predict(x_test_preprocessed)
print('mse:', mean_squared_error(y_test,y_pred))

mse: 0.11332850291226242


In [146]:
accuracy_score(y_test,y_predict)

0.8340807174887892

## Decision Tree Regressor

In [147]:
tree = DecisionTreeRegressor(max_depth=3) 
tree.fit(x_train_preprocessed,y_train)

DecisionTreeRegressor(max_depth=3)

In [148]:
y_pred = tree.predict(x_test_preprocessed)
print('mse:', mean_squared_error(y_test,y_pred))

mse: 0.11332850291226242


In [149]:
accuracy_score(y_test,y_predict)

0.8340807174887892