# Naive_Bayes

## On Credit_Risk Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
cr = pd.read_csv("CreditRisk.csv")

In [4]:
cr.isnull().sum()

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
cr.Gender = cr.Gender.fillna("Male")
cr.Married = cr.Married.fillna("No")
cr.Dependents = cr.Dependents.fillna(0)
cr.Self_Employed = cr.Self_Employed.fillna("No")
cr.LoanAmount = cr.LoanAmount.fillna(cr.LoanAmount.mean())
cr.Loan_Amount_Term = cr.Loan_Amount_Term.fillna(cr.Loan_Amount_Term.mean())
cr.Credit_History = cr.Credit_History.fillna(0)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
cr.Gender = le.fit_transform(cr.Gender)
cr.Married = le.fit_transform(cr.Married)
cr.Education = le.fit_transform(cr.Education)
cr.Self_Employed = le.fit_transform(cr.Self_Employed)
cr.Property_Area = le.fit_transform(cr.Property_Area)

In [8]:
cr.Loan_Status = cr.Loan_Status.replace({ "Y" :1 , "N" :0 })

In [9]:
cr.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0.0,0,0,5849,0.0,142.51153,360.0,1.0,2,1
1,LP001003,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [10]:
cr_x = cr.iloc[ : , 1:12]
cr_y = cr.iloc[ : , 12]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
cr_x_train, cr_x_test, cr_y_train, cr_y_test = train_test_split(cr_x,cr_y, test_size = 0.2, random_state = 555)

In [13]:
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()

In [14]:
NB.fit(cr_x_train, cr_y_train)
pred_nb = NB.predict(cr_x_test)

In [15]:
from sklearn.metrics import confusion_matrix

tab_nb = confusion_matrix(pred_nb, cr_y_test)
tab_nb

array([[23, 59],
       [38, 77]], dtype=int64)

In [16]:
cr_acc = tab_nb.diagonal().sum() / tab_nb.sum() * 100
cr_acc

50.76142131979695

## On CTG Data

In [17]:
ctg = pd.read_csv("CTG.csv")

In [18]:
ctg.NSP.value_counts()

1    1655
2     295
3     176
Name: NSP, dtype: int64

In [19]:
ctg_x = ctg.iloc[ : , [0,1,2] ]
ctg_y = ctg.iloc[ : , 3]

In [20]:
ctg_x_train, ctg_x_test, ctg_y_train, ctg_y_test = train_test_split(ctg_x,ctg_y, test_size = 0.2)

In [21]:
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()

In [22]:
NB.fit(ctg_x_train,ctg_y_train) # Model Building
perd_ctg = NB.predict(ctg_x_test)

In [23]:
from sklearn.metrics import confusion_matrix

tab_nb = confusion_matrix(perd_ctg, ctg_y_test)
tab_nb

array([[335,  62,  29],
       [  0,   0,   0],
       [  0,   0,   0]], dtype=int64)

In [24]:
cr_acc = tab_nb.diagonal().sum() / tab_nb.sum() * 100
cr_acc

78.63849765258216

## Class balancing by Manually

In [25]:
ctg = pd.read_csv("CTG.csv")

In [26]:
ctg_train, ctg_test = train_test_split(ctg, test_size = 0.2)

In [27]:
print(ctg.shape)
print(ctg_train.shape)
print(ctg_test.shape)

(2126, 4)
(1700, 4)
(426, 4)


In [28]:
ctg_train.NSP.value_counts()  # Class imbalance is there

1    1325
2     235
3     140
Name: NSP, dtype: int64

In [29]:
ctg_2 = ctg[ctg.NSP == 2]
ctg_2
ctg_3 = ctg[ctg.NSP == 3]
ctg_3

Unnamed: 0,LB,AC,FM,NSP
5,134,0.001,0.0,3
6,134,0.001,0.0,3
7,122,0.0,0.0,3
8,122,0.0,0.0,3
9,122,0.0,0.0,3
20,129,0.0,0.34,3
22,128,0.0,0.334,3
23,128,0.0,0.0,3
24,128,0.0,0.0,3
25,124,0.0,0.0,3


In [30]:
ctg_train_new = pd.concat([ctg_train, ctg_2, ctg_2, ctg_3, ctg_3, ctg_3, ctg_3, ctg_3, ctg_3, ctg_3])

In [31]:
ctg_train_new.shape

(3522, 4)

In [32]:
ctg_train_new.NSP.value_counts()

3    1372
1    1325
2     825
Name: NSP, dtype: int64

In [33]:
ctg_train_new_x = ctg_train_new.iloc[ : , [0,1,2]]
ctg_train_new_y = ctg_train_new.iloc[ : , 3]

ctg_test_x  = ctg_test.iloc[ : , [0,1,2]]
ctg_test_y = ctg_test.iloc[ : , 3]

In [34]:
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()

In [35]:
NB.fit(ctg_train_new_x,ctg_train_new_y) # Model Building
perd_ctg = NB.predict(ctg_test_x)

In [36]:
from sklearn.metrics import confusion_matrix

tab_nb = confusion_matrix(perd_ctg, ctg_test_y)
tab_nb

array([[  5,   0,   0],
       [  0,   0,   0],
       [325,  60,  36]], dtype=int64)

In [37]:
cr_acc = tab_nb.diagonal().sum() / tab_nb.sum() * 100
cr_acc

9.624413145539906