In [1]:
import pandas as pd
import numpy as np
import math
from pprint import pprint

In [2]:
df = pd.read_csv("Dataset/LoanDataset/data.csv", header=None)
df = df.iloc[1:, 1:]
train = df.sample(frac=0.8, random_state=200)
validation = df.drop(train.index)

In [3]:
def mean(data):
    if len(data) != 0:
        return float(sum(data))/len(data)
    return 0

def std(data):
    return np.std(data)

def pdf(n, m, s):
    sd = math.pow(float(s),2)
    denominator = math.pow(2 * math.pi * sd, 0.5)
    numerator = math.exp(-1*math.pow(float(n)-float(m), 2)/(2*sd))
    return numerator/denominator

def process_numerical(data, categories, label):
    output = {}
    for cat in categories:
        output[cat] = {}
        output[cat]['mean'] = {}
        output[cat]['std'] = {}
        for val in data[label].unique():
            output[cat]['mean'][val] = mean(data[cat][data[label] == val])
            output[cat]['std'][val] = std(data[cat][data[label] == val])
    return output

def process_categorical(data, categories, label):
    output = {}
    for cat in categories:
        output[cat] = {}
        for num in data[cat].unique():
            output[cat][num] = {}
            for val in data[label].unique():
                numerator = len(data[cat][data[cat] == num][data[label] == val])
                denominator = len(data[label][data[label] == val])
                output[cat][num][val] = (float(numerator)/denominator)
    return output

def stats(predicted, actual):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(actual)):
        if predicted[i] == actual[i] and predicted[i] == 1.0:
            tp += 1
        elif predicted[i] == actual[i] and predicted[i] == 0.0:
            tn += 1
        elif predicted[i] != actual[i] and predicted[i] == 1.0:
            fp += 1
        else:
            fn += 1
    
    total = float(tp) + float(tn) + float(fn) + float(fp)
    accuracy = (float(tp) + float(tn))/total
    precision = (float(tp))/(float(tp) + float(fp))
    recall = (float(tp))/(float(tp) + float(fn))
    f1 = 2/((1/precision) + (1/recall))
    print "Accuracy: ", accuracy
    print "Precision: ", precision
    print "Recall: ", recall
    print "F1: ", f1
            

In [4]:
def nb(train, validation, label):
    numerical = [1, 2, 3, 5, 6, 8]
    categorical = [7, 10, 11, 12, 13]
    processed_categorical = process_categorical(train, categorical, label)
    processed_numerical = process_numerical(train, numerical, label)
    
#     print "Categorical: "
#     pprint(processed_categorical)
    
#     print "Numerical: "
#     pprint(processed_numerical)
    
    yes = float(len(train[train[label] == 1.0]))/len(train[label])
    no = float(len(train[train[label] == 0.0]))/len(train[label])
    
    predicted = []
    
    for index, row in validation.iterrows():
        prob_pos = yes
        prob_neg = no
        
        for i in range(1, 13):
            if i == 4 or i == label:
                continue
            if i in numerical:
                prob_pos *= pdf(row[i], processed_numerical[i]['mean'][1.0], processed_numerical[i]['std'][1.0])
                prob_neg *= pdf(row[i], processed_numerical[i]['mean'][0.0], processed_numerical[i]['std'][0.0])
            else:
                prob_pos *= processed_categorical[i][row[i]][1.0]
                prob_neg *= processed_categorical[i][row[i]][0.0]
                
        if prob_pos > prob_neg:
            predicted.append(1.0)
        else:
            predicted.append(0.0)
            
    actual = validation[label].tolist()
    
    stats(predicted, actual)
    
nb(train, validation, 9)

Accuracy:  0.927777777778
Precision:  0.59
Recall:  0.710843373494
F1:  0.644808743169
