# Implementasi Naive Bayes Studi Kasus Peminjaman Uang (Tugas 8)

1. Baca data credit score

2. Buat data tes & data latih.

3. Implementasikan salah satu algoritma berikut untuk menentukan kelas dari sebuah data

* Gaussian Naive Bayes

* K - Nearest Neighbors

* Decision Tree

Tampilkan accuracy, precision, & recall dari setiap algoritma

## Persiapan Environment

* Impor pandas library

* Impor train_test_split dari sklearn

* Impor gaussian naive bayes dari sklearn

* Impor library tambahan yang dibutuhkan dari sklearn

In [None]:
import pandas as pd
import numpy as np
# Download the credit-score CSV from GitHub and parse it into a DataFrame.
# Requires network access; every later cell reads from `data` or from frames
# derived from it.
data = pd.read_csv('https://raw.githubusercontent.com/andreandriand/dataset/main/credit_score.csv')
# Bare expression: as the last line of the cell it makes the notebook render the frame.
data

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue,risk_rating
0,1,AGR-000001,295,YA,48,5,61 - 90 days,4
1,2,AGR-000011,271,YA,36,5,61 - 90 days,4
2,3,AGR-000030,159,TIDAK,12,0,0 - 30 days,1
3,4,AGR-000043,210,YA,12,3,46 - 60 days,3
4,5,AGR-000049,165,TIDAK,36,0,31 - 45 days,2
...,...,...,...,...,...,...,...,...
895,896,AGR-010739,112,YA,48,5,> 90 days,5
896,897,AGR-010744,120,YA,48,2,46 - 60 days,3
897,898,AGR-010758,166,TIDAK,24,2,0 - 30 days,1
898,899,AGR-010775,196,TIDAK,48,0,31 - 45 days,2


In [None]:
### Modelling 
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Silence warnings for the rest of the session
# NOTE(review): 'ignore' is installed without a category or module filter, so
# it hides every warning, including deprecation notices from pandas/sklearn.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Target column: the risk rating that the classifiers are trained to predict.
labels = data.loc[:, "risk_rating"]

In [None]:
# Feature frame: every column of the raw data except the target.
X = data.drop(labels="risk_rating", axis="columns")

# Preview the first five rows to confirm "risk_rating" is gone.
X.head(5)

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue
0,1,AGR-000001,295,YA,48,5,61 - 90 days
1,2,AGR-000011,271,YA,36,5,61 - 90 days
2,3,AGR-000030,159,TIDAK,12,0,0 - 30 days
3,4,AGR-000043,210,YA,12,3,46 - 60 days
4,5,AGR-000049,165,TIDAK,36,0,31 - 45 days


## Mengubah Fitur "rata_rata_overdue" Menjadi Tipe Data Numerik

Pecah kolom kategorikal "rata_rata_overdue" menjadi kolom indikator 0/1 (one-hot), satu kolom untuk setiap rentang hari keterlambatan.

In [None]:
# Overdue ranges (in days) found in the "rata_rata_overdue" column.
# Display only: the list is not assigned to a name. The labels are shorthand;
# the data itself spells the values like "0 - 30 days" and "> 90 days".
['>90', '0-30', '31-45', '46-60', '61-90']

['>90', '0-30', '31-45', '46-60', '61-90']

In [None]:
# One-hot encode the categorical overdue bucket: one indicator column per
# distinct value, named "overdue_<value>".
# dtype=int pins the indicators to 0/1 integers. The default dtype of
# get_dummies depends on the pandas version (boolean on pandas >= 2.0), which
# would turn the feature matrix into an object array once it is mixed with the
# float columns via `.values`.
split_overdue_X = pd.get_dummies(X["rata_rata_overdue"], prefix="overdue", dtype=int)
X = X.join(split_overdue_X)

# The original text column is redundant once its indicator columns exist.
X = X.drop(columns="rata_rata_overdue")

## Normalisasi Fitur "kpr_aktif"

Pecah kolom kategorikal "kpr_aktif" menjadi kolom indikator 0/1 (one-hot) berdasarkan nilai 'YA' dan 'TIDAK'.

In [None]:
# The two possible states of the "kpr_aktif" (active mortgage) column.
# Display only: the list is not assigned to a name.
# NOTE(review): the data stores these states as 'YA' / 'TIDAK', not 'yes' / 'no'.
['yes', 'no']

['yes', 'no']

In [None]:
# One-hot encode "kpr_aktif" into indicator columns named "KPR_<value>".
# dtype=int pins the indicators to 0/1 integers; the default dtype of
# get_dummies depends on the pandas version (boolean on pandas >= 2.0).
KPR_status = pd.get_dummies(X["kpr_aktif"], prefix="KPR", dtype=int)
X = X.join(KPR_status)

# remove the original "kpr_aktif" feature now that it is encoded
X = X.drop(columns="kpr_aktif")

In [None]:
# Display the feature frame after both categorical columns were one-hot encoded.
X

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA
0,1,AGR-000001,295,48,5,0,0,0,1,0,0,1
1,2,AGR-000011,271,36,5,0,0,0,1,0,0,1
2,3,AGR-000030,159,12,0,1,0,0,0,0,1,0
3,4,AGR-000043,210,12,3,0,0,1,0,0,0,1
4,5,AGR-000049,165,36,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,AGR-010739,112,48,5,0,0,0,0,1,0,1
896,897,AGR-010744,120,48,2,0,0,1,0,0,0,1
897,898,AGR-010758,166,24,2,1,0,0,0,0,1,0
898,899,AGR-010775,196,48,0,0,1,0,0,0,1,0


## Normalisasi Fitur "pendapatan_setahun_juta", "durasi_pinjaman_bulan", dan "jumlah_tanggungan"

Scale fitur dengan range value antara 0 sampai 1

In [None]:
# Numeric features that will be rescaled to the 0..1 range.
# Display only: the list is not assigned to a name.
['pendapatan_setahun_juta', 'durasi_pinjaman_bulan', 'jumlah_tanggungan']

['pendapatan_setahun_juta', 'durasi_pinjaman_bulan', 'jumlah_tanggungan']

In [None]:
# Numeric columns to rescale, and the names their rescaled copies will carry
# (each original name prefixed with "norm_").
old_normalize_feature_labels = [
    'pendapatan_setahun_juta',
    'durasi_pinjaman_bulan',
    'jumlah_tanggungan',
]
new_normalized_feature_labels = ['norm_' + column for column in old_normalize_feature_labels]

# Take the raw values from the untouched source frame.
normalize_feature = data.loc[:, old_normalize_feature_labels]

In [None]:
# Display the raw (unscaled) numeric columns selected for normalisation.
normalize_feature

Unnamed: 0,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan
0,295,48,5
1,271,36,5
2,159,12,0
3,210,12,3
4,165,36,0
...,...,...,...
895,112,48,5
896,120,48,2
897,166,24,2
898,196,48,0


In [None]:
# Learn each selected column's minimum and maximum; `fit` returns the
# estimator itself, so it can be bound in one step.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test rows influence the scaling range.
scaler = MinMaxScaler().fit(normalize_feature)
# Bare expression kept as the last line of the cell so the notebook echoes an
# estimator repr.
MinMaxScaler()

MinMaxScaler()

In [None]:
# Rescale the selected numeric columns with the fitted scaler; the result is
# a plain NumPy array.
normalized_feature = scaler.transform(normalize_feature)

# Wrap the array in a DataFrame under the "norm_*" names. The source frame's
# index is reused explicitly: the later `X.join(...)` aligns rows by index, so
# a fresh 0..n-1 index would only line up while the data keeps its default
# RangeIndex.
normalized_feature_data = pd.DataFrame(
    normalized_feature,
    columns=new_normalized_feature_labels,
    index=normalize_feature.index,
)
normalized_feature_data

Unnamed: 0,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan
0,0.978261,1.000000,0.833333
1,0.873913,0.666667,0.833333
2,0.386957,0.000000,0.000000
3,0.608696,0.000000,0.500000
4,0.413043,0.666667,0.000000
...,...,...,...
895,0.182609,1.000000,0.833333
896,0.217391,1.000000,0.333333
897,0.417391,0.333333,0.333333
898,0.547826,1.000000,0.000000


In [None]:
# Swap the raw numeric columns for their rescaled versions, then append the
# target column again so features and class travel in one frame.
X = (
    X.drop(columns=old_normalize_feature_labels)
    .join(normalized_feature_data)
    .join(labels)
)
X

Unnamed: 0.1,Unnamed: 0,kode_kontrak,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan,risk_rating
0,1,AGR-000001,0,0,0,1,0,0,1,0.978261,1.000000,0.833333,4
1,2,AGR-000011,0,0,0,1,0,0,1,0.873913,0.666667,0.833333,4
2,3,AGR-000030,1,0,0,0,0,1,0,0.386957,0.000000,0.000000,1
3,4,AGR-000043,0,0,1,0,0,0,1,0.608696,0.000000,0.500000,3
4,5,AGR-000049,0,1,0,0,0,1,0,0.413043,0.666667,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,AGR-010739,0,0,0,0,1,0,1,0.182609,1.000000,0.833333,5
896,897,AGR-010744,0,0,1,0,0,0,1,0.217391,1.000000,0.333333,3
897,898,AGR-010758,1,0,0,0,0,1,0,0.417391,0.333333,0.333333,1
898,899,AGR-010775,0,1,0,0,0,1,0,0.547826,1.000000,0.000000,2


In [None]:
# Row identifiers (saved index and contract code) are not used as features.
subject_lables = ["Unnamed: 0", "kode_kontrak"]
X = X.drop(labels=subject_lables, axis="columns")

# Fraction of the rows held out as test data (30 %).
percent_amount_of_test_data = 0.3
X

Unnamed: 0,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan,risk_rating
0,0,0,0,1,0,0,1,0.978261,1.000000,0.833333,4
1,0,0,0,1,0,0,1,0.873913,0.666667,0.833333,4
2,1,0,0,0,0,1,0,0.386957,0.000000,0.000000,1
3,0,0,1,0,0,0,1,0.608696,0.000000,0.500000,3
4,0,1,0,0,0,1,0,0.413043,0.666667,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...
895,0,0,0,0,1,0,1,0.182609,1.000000,0.833333,5
896,0,0,1,0,0,0,1,0.217391,1.000000,0.333333,3
897,1,0,0,0,0,1,0,0.417391,0.333333,0.333333,1
898,0,1,0,0,0,1,0,0.547826,1.000000,0.000000,2


## Hitung Data

* Pisahkan kolom “risk_rating” dari dataframe

* Ambil kolom “risk_rating” sebagai target kolom untuk kategori kelas

* Pisahkan data latih dengan data tes

In [None]:
# Create the train/test split here so the four names below exist when the
# notebook is run top to bottom; referencing them before any split would raise
# NameError. The call uses the same test fraction and random_state=0 as the
# split cell later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns="risk_rating").values,
    X["risk_rating"].values,
    test_size=percent_amount_of_test_data,
    random_state=0,
)

# training data (feature values)
X_train

# test data (feature values)
X_test

# training data (class labels)
y_train

# test data (class labels)
y_test

array([2, 1, 3, 2, 4, 4, 1, 3, 3, 5, 1, 2, 5, 2, 3, 3, 1, 3, 3, 4, 2, 3,
       3, 1, 5, 3, 3, 3, 3, 3, 2, 3, 5, 3, 1, 4, 4, 4, 4, 2, 4, 1, 1, 2,
       5, 3, 5, 2, 1, 1, 2, 5, 1, 1, 2, 5, 1, 3, 3, 3, 4, 2, 3, 5, 5, 3,
       3, 3, 4, 1, 5, 4, 2, 1, 1, 4, 3, 3, 3, 5, 1, 2, 3, 2, 4, 3, 1, 3,
       2, 1, 2, 3, 2, 2, 3, 1, 2, 5, 5, 1, 1, 1, 3, 1, 5, 4, 3, 5, 2, 2,
       3, 3, 1, 1, 2, 1, 4, 4, 2, 2, 5, 4, 3, 1, 4, 1, 3, 1, 1, 5, 1, 1,
       1, 3, 3, 2, 5, 3, 1, 4, 3, 5, 3, 5, 2, 3, 1, 2, 2, 3, 3, 3, 3, 4,
       5, 3, 3, 4, 3, 1, 2, 1, 1, 1, 2, 1, 3, 1, 4, 2, 1, 1, 4, 2, 2, 1,
       1, 1, 3, 3, 3, 4, 5, 4, 3, 1, 1, 3, 2, 2, 3, 3, 4, 2, 5, 3, 5, 1,
       3, 5, 2, 3, 2, 3, 3, 2, 3, 3, 3, 1, 1, 1, 3, 1, 1, 3, 2, 3, 2, 3,
       3, 1, 1, 1, 1, 5, 1, 3, 2, 1, 4, 1, 5, 4, 4, 3, 3, 3, 3, 3, 2, 3,
       1, 4, 4, 4, 3, 5, 3, 3, 1, 1, 4, 1, 5, 1, 5, 2, 1, 2, 2, 3, 4, 4,
       1, 2, 4, 2, 5, 3])

In [None]:
# Split the prepared frame by column position into features and target.

# feature values: columns at positions 0..9
matrices_X = X.iloc[:, :10].to_numpy()

# class labels: the column at position 10
matrices_Y = X.iloc[:, 10].to_numpy()

In [None]:
# Feature matrix (columns at positions 0..9) and class vector (last column)
# as NumPy arrays.
X_1 = X.iloc[:, :10].to_numpy()
Y_1 = X.iloc[:, -1].to_numpy()

In [None]:
# Hold out `percent_amount_of_test_data` of the rows for testing; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    Y_1,
    test_size=percent_amount_of_test_data,
    random_state=0,
)

### Contoh Data

In [None]:
# Example feature row for a customer named 'Budi'.
# Display only: the list is not assigned to a name.
# NOTE(review): ten values, presumably in the same order as the ten feature
# columns of X (five overdue indicators, KPR_TIDAK, KPR_YA, then the three
# normalised numbers) - confirm before feeding it to a model.
[0,	0,	0,	0,	0,	0,	1,	0.582609,	0.666667,	0]

[0, 0, 0, 0, 0, 0, 1, 0.582609, 0.666667, 0]

## Implementasi Gaussian Naive Bayes

In [None]:
# Train Gaussian Naive Bayes on the training split and classify the test split.
gaussian = GaussianNB().fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)

# Test-set evaluation.
# NOTE(review): with average='micro' on a single-label multiclass target,
# precision, recall and F1 should all coincide with accuracy - confirm this
# is intended, or consider 'macro' / 'weighted' averaging.
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentage scores rounded to two decimals: test accuracy and training accuracy.
accuracy_nb = round(accuracy * 100, 2)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)

print('Confusion matrix for Naive Bayes\n', cm)
print('accuracy_Naive Bayes: %.3f' % accuracy)
print('precision_Naive Bayes: %.3f' % precision)
print('recall_Naive Bayes: %.3f' % recall)
print('f1-score_Naive Bayes : %.3f' % f1)

Confusion matrix for Naive Bayes
 [[69  0  0  0  0]
 [ 0 49  0  0  0]
 [ 0  0 84  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 32]]
accuracy_Naive Bayes: 1.000
precision_Naive Bayes: 1.000
recall_Naive Bayes: 1.000
f1-score_Naive Bayes : 1.000
