In [1]:
# Import library pengolahan struktur data
import pandas as pd

# Import library pengolahan angka
import numpy as np

In [2]:
# Load Data
# Simpan dengan nama bank_df
bank_df = pd.read_csv("bank-data.csv")

In [3]:
# Tampilkan seluruh data
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [4]:
bank_df.shape

# Output
# (Jumlah observasi, jumlah kolom/fitur)

(45211, 17)

In [5]:
# cek data duplicate
duplicate_status = bank_df.duplicated()
duplicate_status

0        False
1        False
2        False
3        False
4        False
         ...  
45206    False
45207    False
45208    False
45209    False
45210    False
Length: 45211, dtype: bool

In [6]:
# Cari jumlah data duplikatnya
duplicate_status.sum()

# FALSE = 0 --> kalo tidak duplikat 
# TRUE = 1 --> kalo duplikat
# Kalau ada yang duplikat, maka jumlahnya > 0

0

In [7]:
bank_df = bank_df.drop_duplicates()

# Tidak ada yang di-drop karena tidak ada duplikat

In [8]:
bank_df.shape

# Selalu sanity check!
# Periksa ulang jumlah observasi

(45211, 17)

**Buat semuanya dalam fungsi**

1. Import data
2. Cek **Jumlah observasi** dan **Jumlah kolom**
3. Drop duplicate
4. Cek **Jumlah observasi** dan **Jumlah kolom** setelah di-drop
5. Return data setelah di-drop

In [9]:
# Kita ingin membuat fungsi yang isi perintahnya sebagai berikut
bank_df = pd.read_csv("bank-data.csv")
print("Data asli            : ", bank_df.shape, "- (#observasi, #kolom)")

bank_df = bank_df.drop_duplicates()
print("Data setelah di-drop : ", bank_df.shape, "- (#observasi, #kolom)")

Data asli            :  (45211, 17) - (#observasi, #kolom)
Data setelah di-drop :  (45211, 17) - (#observasi, #kolom)


In [10]:
def importData(filename):
    """
    Fungsi untuk import data & hapus duplikat
    :param filename: <string> nama file input (format .csv)
    :return df: <pandas dataframe> sampel data
    """

    # read data
    df = pd.read_csv(filename)
    print("Data asli            : ", df.shape, "- (#observasi, #kolom)")

    # drop duplicates
    df = df.drop_duplicates()
    print("Data setelah di-drop : ", df.shape, "- (#observasi, #kolom)")

    return df

# (filename) adalah argumen
# Argumen adalah sebuah variable. 
# Jika fungsi tsb. diberi argumen filename = "bank_data.csv", 
# maka semua variabel 'filename' di dalam fungsi 
# akan berubah menjadi "bank_data.csv"

In [11]:
# input
file_bank = "bank-data.csv"

# panggil fungsi
bank_df = importData(filename = file_bank)

Data asli            :  (45211, 17) - (#observasi, #kolom)
Data setelah di-drop :  (45211, 17) - (#observasi, #kolom)


In [12]:
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


## <b><font color='blue'> 2. Data Preprocessing:</font></b>
---
    * Input-Output Split, Train-Test Split
    * Processing Categorical
    * Imputation, Normalization, Drop Duplicates

### **Input-Output Split**

- Fitur `y` adalah output variabel dari data marketing
- yang lainnya menjadi input

**Buat data output**

In [13]:
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [14]:
output_data = bank_df["y"]

# buat data yang berisi data target
# pilih data dengan nama kolom `y`, lalu namakan sebagai output_data

In [15]:
output_data.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

**Buat data input**

- DATA = INPUT + OUTPUT
- DATA - OUTPUT = INPUT
- Jadi kalau dari data, kita drop VARIABLE OUTPUT, maka tersisa hanya variabel INPUT.

In [16]:
input_data = bank_df.drop(["y"], 
                          axis = 1)

In [17]:
input_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


**Buat semuanya jadi fungsi**
1. buat output_data
2. buat input_data
3. return input_data dan output_data

In [18]:
# isi perintah yang akan dimasukkan ke dalam fungsi
output_data = bank_df["y"]
input_data = bank_df.drop("y",
                          axis = 1)

In [19]:
def extractInputOutput(data,
                       output_column_name):
    """
    Fungsi untuk memisahkan data input dan output
    :param data: <pandas dataframe> data seluruh sample
    :param output_column_name: <string> nama kolom output
    :return input_data: <pandas dataframe> data input
    :return output_data: <pandas series> data output
    """
    output_data = data[output_column_name]
    input_data = data.drop(output_column_name,
                           axis = 1)
    
    return input_data, output_data

# (data, output_column_name) adalah argumen
# Argumen adalah sebuah variable. 
# Jika fungsi tsb. diberi argumen data = bank_df, 
# maka semua variabel 'data' di dalam fungsi akan berubah menjadi bank_df

In [20]:
# Jangan sampai salah urutan dalam penempatan return
X, y = extractInputOutput(data = bank_df,
                          output_column_name = "y")

In [21]:
X.head(2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown


In [22]:
y.head(2)

0    0
1    0
Name: y, dtype: int64

### **Train-Test Split**

- **Kenapa?**
  - Karena tidak mau overfit data training
  - Test data akan menjadi future data
  - Kita akan latih model ML di data training, dengan CV (Cross-validation)
  - Selanjutnya melakukan evaluasi di data testing

In [23]:
# Import train-test splitting library dari sklearn (scikit learn)
from sklearn.model_selection import train_test_split

**Train Test Split Function**
1. `X` adalah input
2. `y` adalah output (target)
3. `test_size` adalah seberapa besar proporsi data test dari keseluruhan data. Contoh `test_size = 0.2` artinya data test akan berisi 20% data.
4. `random_state` adalah kunci untuk random. Harus di-setting sama. Misal `random_state = 123`.
5. Output:
   - `X_train` = input dari data training
   - `X_test` = input dari data testing
   - `y_train` = output dari data training
   - `y_test` = output dari data testing
6. Urutan outputnya: `X_train, X_test, y_train, y_test`. Tidak boleh terbalik

> Readmore: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [24]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.25,
                                                    random_state = 12)

In [25]:
# Sanity check hasil splitting
print(X_train.shape)
print(X_test.shape)

(33908, 16)
(11303, 16)


In [26]:
# Ratio
X_test.shape[0] / X.shape[0]

# Hasil 0.25 - sesuai dengan test_size kita

0.25000552962774547

**Selamat!** - Anda sudah memiliki data train & test

> Selanjutnya, hanya **fokus** ke data **training**

### **Data Imputation**

- Proses pengisian data yang kosong (NaN)
- Ada 2 hal yang diperhatikan:
  - Numerical Imputation
  - Categorical Imputation

**Cek data yang kosong dari variabel input**

In [27]:
X_train.isnull().sum()

# Output: nama variabel, True/False.
# Jika True, maka ada data yang kosong

# Ada 2500-2700 data yang kosong

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

**Bedakan antara data categorical & numerical**

In [28]:
X_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
37156,35,management,single,tertiary,no,2749,no,no,cellular,13,may,127,1,-1,0,unknown
20494,30,management,single,tertiary,no,443,yes,yes,cellular,12,aug,80,2,-1,0,unknown
35272,39,management,single,tertiary,no,4239,yes,no,cellular,7,may,40,1,-1,0,unknown
22260,49,services,married,secondary,no,400,no,no,cellular,21,aug,151,3,-1,0,unknown
2728,28,technician,single,secondary,no,468,yes,no,unknown,13,may,152,3,-1,0,unknown


Data kategorikal:
- job
- marital
- education
- default
- housing
- loan
- contact
- month
- poutcome

Sisanya adalah numerical

**Numerical Imputation**

In [29]:
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [30]:
# Buat kolom numerik
numerical_column = ["age", "balance", "day", "duration", 
                    "campaign", "pdays", "previous"]

In [31]:
# Seleksi dataframe numerik
X_train_numerical = X_train[numerical_column]

In [32]:
X_train_numerical.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
37156,35,2749,13,127,1,-1,0
20494,30,443,12,80,2,-1,0
35272,39,4239,7,40,1,-1,0
22260,49,400,21,151,3,-1,0
2728,28,468,13,152,3,-1,0


**Cek apakah ada data numerik yang kosong**

In [33]:
X_train_numerical.isnull().any()

# Semua variabel numerical memiliki missing values

age         False
balance     False
day         False
duration    False
campaign    False
pdays       False
previous    False
dtype: bool

**Gunakan imputer dari sklearn untuk data imputation numerik saja**

In [34]:
from sklearn.impute import SimpleImputer

In [35]:
imputer = SimpleImputer(missing_values = np.nan,
                        strategy = "median")

# namakan function SimpleImputer menjadi imputer, jangan lupa tanda kurung ()
# missing_values adalah tanda missing values dalam data.
#   - bisa NaN, bisa 999, bisa "KOSONG"
# Strategy median adalah strategy imputasi, 
# jika data kosong, diganti dengan median target
# Strategi lainnya adalah: mean

- `fit` : imputer agar mengetahui mean atau median dari tiap kolom
- `transform` : isi data dengan median atau mean
- output dari transform adalah pandas dataframe
- namakan kolom `X_train_numerical_imputed` sesuai dengan `X_train_numerical`.
   - MENGAPA? karena kita kehilangan nama kolom setelah data imputation
- beri index dari `X_train_numerical_imputed` sesuai dengan `X_train_numerical`.
   - MENGAPA? karena kita kehilangan index setelah data imputation

In [36]:
# Isi perintah yang akan dibuat dalam fungsi

# Fit imputer
imputer.fit(X_train_numerical)

# Transform
imputed_data = imputer.transform(X_train_numerical)
X_train_numerical_imputed = pd.DataFrame(imputed_data)

X_train_numerical_imputed.columns = X_train_numerical.columns
X_train_numerical_imputed.index = X_train_numerical.index

In [37]:
X_train_numerical_imputed.isnull().any()

age         False
balance     False
day         False
duration    False
campaign    False
pdays       False
previous    False
dtype: bool

**Mari buat dalam fungsi**

In [38]:
from sklearn.impute import SimpleImputer

def numericalImputation(data, numerical_column):
    """
    Fungsi untuk melakukan imputasi data numerik
    :param data: <pandas dataframe> sample data input
    :param numerical_column: <list> list kolom numerik data
    :return X_train_numerical: <pandas dataframe> data numerik
    :return imputer_numerical: numerical imputer method
    """
    # Filter data numerik
    numerical_data = data[numerical_column]

    # Buat imputer
    imputer_numerical = SimpleImputer(missing_values = np.nan,
                                      strategy = "median")
    imputer_numerical.fit(numerical_data)

    # Transform
    imputed_data = imputer_numerical.transform(numerical_data)
    numerical_data_imputed = pd.DataFrame(imputed_data)

    numerical_data_imputed.columns = numerical_column
    numerical_data_imputed.index = numerical_data.index

    return numerical_data_imputed, imputer_numerical

In [39]:
# Input
numerical_column = ["age", "balance", "day", "duration", 
                    "campaign", "pdays", "previous"]

# Imputation Numeric
X_train_numerical, imputer_numerical = numericalImputation(data = X_train,
                                                           numerical_column = numerical_column)

In [40]:
X_train_numerical.isnull().any()

age         False
balance     False
day         False
duration    False
campaign    False
pdays       False
previous    False
dtype: bool

**Categorical Imputation**

In [41]:
# Ambil daftar nama kolom kategorikal
# Anda bisa langsung menuliskannya atau mengambil list jika jumlahnya banyak

X_train_column = list(X_train.columns)
categorical_column = list(set(X_train_column).difference(set(numerical_column)))

In [42]:
categorical_column

['month',
 'poutcome',
 'education',
 'contact',
 'default',
 'marital',
 'loan',
 'job',
 'housing']

In [43]:
# Periksa lagi missing value
categorical_data = X_train[categorical_column]
categorical_data.isnull().sum()

month        0
poutcome     0
education    0
contact      0
default      0
marital      0
loan         0
job          0
housing      0
dtype: int64

In [44]:
# Kita isi kolom kategorik dengan "KOSONG"
categorical_data = X_train[categorical_column]
categorical_data = categorical_data.fillna(value="KOSONG")

In [45]:
categorical_data.isnull().sum()

month        0
poutcome     0
education    0
contact      0
default      0
marital      0
loan         0
job          0
housing      0
dtype: int64

**Mari buat dalam bentuk function**

In [46]:
def categoricalImputation(data, categorical_column):
    """
    Fungsi untuk melakukan imputasi data kategorik
    :param data: <pandas dataframe> sample data input
    :param categorical_column: <list> list kolom kategorikal data
    :return categorical_data: <pandas dataframe> data kategorikal
    """
    # seleksi data
    categorical_data = data[categorical_column]

    # lakukan imputasi
    categorical_data = categorical_data.fillna(value="KOSONG")

    return categorical_data


In [47]:
X_train_categorical = categoricalImputation(data = X_train,
                                            categorical_column = categorical_column)

In [48]:
X_train_categorical.isnull().sum()

month        0
poutcome     0
education    0
contact      0
default      0
marital      0
loan         0
job          0
housing      0
dtype: int64

### **Preprocessing Categorical Variables**

- Kita tidak bisa memasukkan data categorical, jika tidak diubah menjadi numerical
- Solusi: One Hot Encoding (OHE)

In [49]:
categorical_ohe = pd.get_dummies(X_train_categorical)

In [50]:
categorical_ohe.head(2)

Unnamed: 0,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,...,job_management,job_retired,job_self,job_services,job_student,job_technician,job_unemployed,job_unknown,housing_no,housing_yes
37156,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
20494,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


**Mari buat menjadi fungsi**

In [51]:
def extractCategorical(data, categorical_column):
    """
    Fungsi untuk ekstrak data kategorikal dengan One Hot Encoding
    :param data: <pandas dataframe> data sample
    :param categorical_column: <list> list kolom kategorik
    :return categorical_ohe: <pandas dataframe> data sample dengan ohe
    """
    data_categorical = categoricalImputation(data = data,
                                             categorical_column = categorical_column)
    categorical_ohe = pd.get_dummies(data_categorical)

    return categorical_ohe

In [52]:
X_train_categorical_ohe = extractCategorical(data = X_train,
                                             categorical_column = categorical_column)

In [53]:
X_train_categorical_ohe.head()

Unnamed: 0,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,...,job_management,job_retired,job_self,job_services,job_student,job_technician,job_unemployed,job_unknown,housing_no,housing_yes
37156,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
20494,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
35272,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
22260,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2728,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [54]:
# Simpan kolom OHE untuk diimplementasikan dalam testing data
# Agar shape-nya konsisten
ohe_columns = X_train_categorical_ohe.columns

In [55]:
ohe_columns

Index(['month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'poutcome_failure', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'education_primary',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'contact_cellular', 'contact_telephone', 'contact_unknown',
       'default_no', 'default_yes', 'marital_divorced', 'marital_married',
       'marital_single', 'loan_no', 'loan_yes', 'job_admin', 'job_blue',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'housing_no', 'housing_yes'],
      dtype='object')

### **Join data Numerical dan Categorical**

- Data numerik & kategorik harus disatukan kembali
- Penyatuan dengan `pd.concat`

In [56]:
X_train_concat = pd.concat([X_train_numerical,
                            X_train_categorical_ohe],
                           axis = 1)

In [57]:
X_train_concat.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,month_apr,month_aug,month_dec,...,job_management,job_retired,job_self,job_services,job_student,job_technician,job_unemployed,job_unknown,housing_no,housing_yes
37156,35.0,2749.0,13.0,127.0,1.0,-1.0,0.0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
20494,30.0,443.0,12.0,80.0,2.0,-1.0,0.0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
35272,39.0,4239.0,7.0,40.0,1.0,-1.0,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
22260,49.0,400.0,21.0,151.0,3.0,-1.0,0.0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2728,28.0,468.0,13.0,152.0,3.0,-1.0,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [58]:
X_train_concat.isnull().any()

age                    False
balance                False
day                    False
duration               False
campaign               False
pdays                  False
previous               False
month_apr              False
month_aug              False
month_dec              False
month_feb              False
month_jan              False
month_jul              False
month_jun              False
month_mar              False
month_may              False
month_nov              False
month_oct              False
month_sep              False
poutcome_failure       False
poutcome_other         False
poutcome_success       False
poutcome_unknown       False
education_primary      False
education_secondary    False
education_tertiary     False
education_unknown      False
contact_cellular       False
contact_telephone      False
contact_unknown        False
default_no             False
default_yes            False
marital_divorced       False
marital_married        False
marital_single

### **Standardizing Variables**

- Menyamakan skala dari variabel input
- `fit`: imputer agar mengetahui mean dan standar deviasi dari setiap kolom
- `transform`: isi data dengan value yang sudah dinormalisasi
- output dari transform berupa pandas dataframe
- normalize dikeluarkan karena akan digunakan pada data test

In [59]:
from sklearn.preprocessing import StandardScaler

# Buat fungsi
def standardizerData(data):
    """
    Fungsi untuk melakukan standarisasi data
    :param data: <pandas dataframe> sampel data
    :return standardized_data: <pandas dataframe> sampel data standard
    :return standardizer: method untuk standardisasi data
    """
    data_columns = data.columns  # agar nama kolom tidak hilang
    data_index = data.index  # agar index tidak hilang

    # buat (fit) standardizer
    standardizer = StandardScaler()
    standardizer.fit(data)

    # transform data
    standardized_data_raw = standardizer.transform(data)
    standardized_data = pd.DataFrame(standardized_data_raw)
    standardized_data.columns = data_columns
    standardized_data.index = data_index

    return standardized_data, standardizer

In [60]:
X_train_clean, standardizer = standardizerData(data = X_train_concat)

In [61]:
X_train_clean.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,month_apr,month_aug,month_dec,...,job_management,job_retired,job_self,job_services,job_student,job_technician,job_unemployed,job_unknown,housing_no,housing_yes
37156,-0.561073,0.457432,-0.338619,-0.510952,-0.566774,-0.408474,-0.239753,-0.264238,-0.398711,-0.070775,...,1.937442,-0.230241,-0.190879,-0.316986,-0.143697,-0.447672,-0.172244,-0.079882,1.118049,-1.118049
20494,-1.031904,-0.303457,-0.458602,-0.694422,-0.246145,-0.408474,-0.239753,-0.264238,2.508082,-0.070775,...,1.937442,-0.230241,-0.190879,-0.316986,-0.143697,-0.447672,-0.172244,-0.079882,-0.894415,0.894415
35272,-0.184408,0.949074,-1.058516,-0.850565,-0.566774,-0.408474,-0.239753,-0.264238,-0.398711,-0.070775,...,1.937442,-0.230241,-0.190879,-0.316986,-0.143697,-0.447672,-0.172244,-0.079882,-0.894415,0.894415
22260,0.757254,-0.317645,0.621244,-0.417266,0.074484,-0.408474,-0.239753,-0.264238,2.508082,-0.070775,...,-0.516145,-0.230241,-0.190879,3.15471,-0.143697,-0.447672,-0.172244,-0.079882,1.118049,-1.118049
2728,-1.220236,-0.295208,-0.338619,-0.413363,0.074484,-0.408474,-0.239753,-0.264238,-0.398711,-0.070775,...,-0.516145,-0.230241,-0.190879,-0.316986,-0.143697,2.233776,-0.172244,-0.079882,-0.894415,0.894415


## <b><font color='blue'> 3. Training Machine Learning:</font></b>
---
    * Choose Score to optimize and Hyperparameter Space
    * Cross-Validation: Random vs Grid Search CV
    * Kita harus mengalahkan benchmark

### **Benchmark / Baseline**

- Baseline untuk evaluasi nanti
- Karena ini klasifikasi, bisa kita ambil dari proporsi kelas target yang terbesar
- Dengan kata lain, menebak hasil output marketing response dengan nilai "no" semua tanpa modeling

In [62]:
y_train.value_counts(normalize = True)

# baseline akurasi = 88%

0    0.882624
1    0.117376
Name: y, dtype: float64

### **1. Import Model**

- Misal kita gunakan 3 model ML untuk klasifikasi:
    - K-nearest neighbor (K-NN)
    - Logistic Regression
    - Random Forest

In [63]:
# Import dari sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### **2. Fitting Model**

- Cara fitting/training model mengikuti yang dokumentasi model

In [64]:
# Model K nearest neighbor
#knn = KNeighborsClassifier()
#knn.fit(X_train_clean, y_train)

In [65]:
# Model Logistic Regression
logreg = LogisticRegression(random_state = 123)
logreg.fit(X_train_clean, y_train)

LogisticRegression(random_state=123)

In [66]:
# Model Random Forest Classifier
random_forest = RandomForestClassifier(random_state = 123)
random_forest.fit(X_train_clean, y_train)

RandomForestClassifier(random_state=123)

In [67]:
# Model Random Forest Classifier 1
# Mari kita ubah hyperparameter dari random forest --> n_estimator
# Maksud & tujuan akan dijelaskan pada kelas Random Forest
# Tambahkan n_estimator = 500

#random_forest_1 = RandomForestClassifier(random_state = 123,
#                                         n_estimators = 500)
#random_forest_1.fit(X_train_clean, y_train)

### **3. Prediction**

- Saatnya melakukan prediksi

In [68]:
# Prediksi Logistic Regression
logreg.predict(X_train_clean)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [69]:
predicted_logreg = pd.DataFrame(logreg.predict(X_train_clean))
predicted_logreg

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
33903,0
33904,0
33905,0
33906,0


In [70]:
#predicted_knn = pd.DataFrame(knn.predict(X_train_clean))
#predicted_knn.head()

In [71]:
predicted_rf = pd.DataFrame(random_forest.predict(X_train_clean))
predicted_rf.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [72]:
#predicted_rf_1 = pd.DataFrame(random_forest_1.predict(X_train_clean))
#predicted_rf_1.head()

### **4. Cek performa model di data training**

In [73]:
benchmark = y_train.value_counts(normalize=True)[0]
benchmark

0.8826235696590775

In [74]:
# akurasi knn
#knn.score(X_train_clean, y_train)

In [75]:
# akurasi logistic regression
logreg.score(X_train_clean, y_train)

0.9013212221304707

In [76]:
# akurasi random forest
random_forest.score(X_train_clean, y_train)

0.9999705084345877

In [77]:
# akurasi random forest 1
#random_forest_1.score(X_train_clean, y_train)

### **5. Simpan model ke file pickle**

In [78]:
import joblib

# Simpan model logreg ke dalam folder yang sama dengan notebook
# dengan nama logreg.pkl
joblib.dump(logreg, "logreg.pkl")

#joblib.dump(knn, "knn.pkl")
joblib.dump(random_forest, "random_forest.pkl")
#joblib.dump(random_forest_1, "random_forest_1.pkl")

['random_forest.pkl']

### **6. Test Prediction**

1. Siapkan file test dataset
2. Lakukan preprocessing yang sama dengan yang dilakukan di train dataset
3. gunakan `imputer_numerical` dan `standardizer` yang telah di-fit di train dataset

In [79]:
def extractTest(data,
                numerical_column, categorical_column, ohe_column,
                imputer_numerical, standardizer):
    """
    Fungsi untuk mengekstrak & membersihkan test data 
    :param data: <pandas dataframe> sampel data test
    :param numerical_column: <list> kolom numerik
    :param categorical_column: <list> kolom kategorik
    :param ohe_column: <list> kolom one-hot-encoding dari data kategorik
    :param imputer_numerical: <sklearn method> imputer data numerik
    :param standardizer: <sklearn method> standardizer data
    :return cleaned_data: <pandas dataframe> data final
    """
    # Filter data
    numerical_data = data[numerical_column]
    categorical_data = data[categorical_column]

    # Proses data numerik
    numerical_data = pd.DataFrame(imputer_numerical.transform(numerical_data))
    numerical_data.columns = numerical_column
    numerical_data.index = data.index

    # Proses data kategorik
    categorical_data = categorical_data.fillna(value="KOSONG")
    categorical_data.index = data.index
    categorical_data = pd.get_dummies(categorical_data)
    categorical_data.reindex(index = categorical_data.index, 
                             columns = ohe_column)

    # Gabungkan data
    concat_data = pd.concat([numerical_data, categorical_data],
                             axis = 1)
    cleaned_data = pd.DataFrame(standardizer.transform(concat_data))
    cleaned_data.columns = concat_data.columns

    return cleaned_data


In [80]:
def testPrediction(X_test, y_test, classifier, compute_score):
    """
    Fungsi untuk mendapatkan prediksi dari model
    :param X_test: <pandas dataframe> input
    :param y_test: <pandas series> output/target
    :param classifier: <sklearn method> model klasifikasi
    :param compute_score: <bool> True: menampilkan score, False: tidak
    :return test_predict: <list> hasil prediksi data input
    :return score: <float> akurasi model
    """
    if compute_score:
        score = classifier.score(X_test, y_test)
        print(f"Accuracy : {score:.4f}")

    test_predict = classifier.predict(X_test)

    return test_predict, score

In [81]:
X_test_clean = extractTest(data = X_test,
                           numerical_column = numerical_column,
                           categorical_column = categorical_column,
                           ohe_column = ohe_columns,
                           imputer_numerical = imputer_numerical,
                           standardizer = standardizer)

In [82]:
X_test_clean.shape

(11303, 51)

In [83]:
# Logistic Regression Performance
logreg_test_predict, score = testPrediction(X_test = X_test_clean,
                                            y_test = y_test,
                                            classifier = logreg,
                                            compute_score = True)

Accuracy : 0.9036


In [84]:
# K nearest neighbor Performance
#knn_test_predict, score = testPrediction(X_test = X_test_clean,
#                                         y_test = y_test,
#                                         classifier = knn,
#                                         compute_score = True)

In [85]:
# Random Forest Performance
rf_test_predict, score = testPrediction(X_test = X_test_clean,
                                        y_test = y_test,
                                        classifier = random_forest,
                                        compute_score = True)

Accuracy : 0.9045


In [86]:
# Random Forest 1 Performance
#rf_1_test_predict, score = testPrediction(X_test = X_test_clean,
#                                          y_test = y_test,
#                                          classifier = random_forest_1,
#                                          compute_score = True)  