In [66]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [67]:
def encoder(df):
    maps = None
    for col in df.columns:
        if df[col].dtype == 'object':
            labels = df[col].astype('category').cat.categories.tolist()
            replace_map_comp = {col : {k: v for k,v in zip(labels,list(range(0,len(labels))))}}
            maps = replace_map_comp
            df.replace(replace_map_comp, inplace=True)
            df[col].astype('float')
    return df , maps

In [68]:
def import_data(filename):
    balance_data = pd.read_csv( filename ,  sep= ',')
    attrq = ['a',  'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i' , 'j' , 'k',  'l',  'm', 'n']
    balance_data.columns =  attrq
    balance_data , maps = encoder(balance_data)
    balance_data = balance_data.sample(frac=1).reset_index(drop=True)
    balance_data = balance_data[['a',  'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i'  , 'k',  'l',  'm', 'n',  'j']]
    
    train_data = balance_data.iloc[:int(balance_data.shape[0]*0.8),:]
    validation_data = balance_data.iloc[int(balance_data.shape[0]*0.8) + 1:,:]
    
    train_data.columns = balance_data.columns
    validation_data.columns = balance_data.columns
    
    return train_data , validation_data , maps

In [69]:
train_data, val_data , maps = import_data("loan/data.csv")


In [70]:
print(train_data.head())

      a   b   c    d      e  f    g  h    i  k  l  m  n  j
0  3525  58  33   15  94583  4  0.9  2    0  0  0  0  0  0
1  2777  46  20  140  93106  2  6.3  1  380  0  0  1  1  0
2  3881  48  24   25  90024  4  0.5  2    0  0  0  0  0  0
3   389  54  30  100  95814  4  3.4  3    0  0  0  0  0  1
4   949  30   4   81  92037  1  2.9  3  259  0  0  1  1  0


In [77]:
class GBayes(object):
    def __init__():
        mean_0 = []
        sd_0 = []
        mean_1 = []
        sd_1 = []
        prob_0 = 0 
        prob_1 = 0 
        
    def fit(self , train_X,  train_Y):
        df = train_X.assign(target = train_Y)
        df0 =  df[df['target'] == 0]
        df1 = df[df['target'] == 1]
        self.mean_0 = list(df0.mean())
        self.sd_0 = list(df0.std())
        self.mean_1  = list(df1.mean())
        self.sd_1 = list(df1.std())
        del self.mean_0[-1]
        del self.sd_0[-1]
        del self.mean_1[-1]
        del self.sd_1[-1]
        self.prob_0 = float(df0.shape[0])/df.shape[0]
        self.prob_1 = float(df1.shape[0])/df.shape[0]

    def predict(self, val_X):
        val_0 = val_X.copy()
        val_1 = val_X.copy()
        val_0 = np.square(val_0.sub(list(self.mean_0),axis = "columns"))
        val_1 = np.square(val_1.sub(self.mean_1,  axis = "columns"))
        
        val_0 = val_0.mul(-1)
        val_1 = val_1.mul(-1)
        
        val_0 = np.exp(np.divide(val_0,  [float(i*i) for i  in self.sd_0]))
        val_1 = np.exp(np.divide(val_1 , [float(i*i) for i  in self.sd_1]))
        val_0 = np.divide(val_0, [i*4.4429 for i  in self.sd_0])
        val_1 = np.divide(val_1,  [i*4.4429 for i  in self.sd_1])
        val_0 = np.prod(val_0,  axis=1)*self.prob_0
        val_1 = np.prod(val_1 , axis=1)*self.prob_1

        
        prediction = []
        for i in range(val_0.shape[0]):
            if val_0.iloc[i,] > val_1.iloc[i,]:prediction.append(0)
            else : prediction.append(1)
                
        return prediction
        

In [72]:
def validate(train_df, val_df):
        
    train_X = train_df.iloc[:,:len(train_df.columns)-1]
    train_Y = train_df.iloc[:,-1]
    val_X = val_df.iloc[:,:len(val_df.columns) -1]
    val_Y = val_df.iloc[:,-1]
    
    model = GBayes()
    model.fit(train_X , train_Y)
    prediction = model.predict(val_X)
    
    conf_matrix = [[0]*2 for i  in range(2)]
    report  = None
    for i in range(len(prediction)):
        conf_matrix[val_Y.iloc[i,]][prediction[i]] += 1
    
    return conf_matrix
    

In [73]:
conf_matrix = validate(train_data , val_data)
print(conf_matrix)
tp = conf_matrix[0][0]
tn = conf_matrix[1][1]
fp = conf_matrix[1][0]
fn = conf_matrix[0][1]
tn = conf_matrix[1][1]
total = tp + tn + fp + fn
precision  = float(tp)/(tp + fp+0.001)
recall = float(tp)/(tp + fn +0.001)
f1_score = 2*precision*recall/(precision + recall)

print("acccuracy : " , float(tp + tn)/total)
print("precision : "+ str(precision))
print("recall : "+ str(recall))
print("F1 score : "+ str(f1_score))

[[740, 89], [17, 53]]
('acccuracy : ', 0.882091212458287)
precision : 0.977541641292
recall : 0.892640660265
F1 score : 0.933164018709


In [74]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def validate_using_sklearn(train_df , val_df):
    
    train_X = train_df.iloc[:,:len(train_df.columns)-1]
    train_Y = train_df.iloc[:,-1]
    val_X = val_df.iloc[:,:len(val_df.columns) -1]
    val_Y = val_df.iloc[:,-1]
    
    model = GaussianNB()
    model.fit(train_X,train_Y)
    prediction = model.predict(val_X)
    
    return confusion_matrix(val_Y , prediction) , classification_report(val_Y,prediction)


In [75]:
cm,cr = validate_using_sklearn(train_data, val_data)

In [76]:
print(cm)
print(cr)

[[762  67]
 [ 28  42]]
             precision    recall  f1-score   support

          0       0.96      0.92      0.94       829
          1       0.39      0.60      0.47        70

avg / total       0.92      0.89      0.90       899

