In [30]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn import preprocessing

In [31]:
filename = 'project3_dataset1.txt'
dataframe = pd.read_csv(filename,delimiter='\t',header=None)

## Handling Categorical values

In [32]:
for x in range(dataframe.shape[1]):
        if(dataframe[x].dtypes == 'object'):
            dataframe[x] = dataframe[x].astype('category')
            dataframe[x] = dataframe[x].cat.codes

In [33]:
dataframe_orig = dataframe
dataframe = pd.DataFrame(preprocessing.normalize(dataframe.iloc[:,0:len(dataframe.columns)-1]))
dataframe[len(dataframe.columns)] = dataframe_orig.iloc[:,len(dataframe.columns):]

In [34]:
dataframe.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.009344,0.013114,0.060903,0.585355,4.5e-05,4.8e-05,6.7e-05,4.5e-05,8.1e-05,2.6e-05,...,0.017756,0.071951,0.803528,5.4e-05,8.9e-05,0.000149,7.6e-05,0.000119,3.1e-05,1
1,0.013945,0.017586,0.092486,0.534398,0.000143,0.00019,0.000177,9.1e-05,0.000234,8.5e-05,...,0.026602,0.115818,0.830664,0.000201,0.000588,0.0006,0.000195,0.000446,0.000139,1
2,0.018747,0.033232,0.122739,0.656151,0.000134,0.000197,0.000154,9.3e-05,0.000432,0.000104,...,0.037178,0.130315,0.728576,0.000171,0.000307,0.000257,0.000155,0.000492,0.000117,0
3,0.018393,0.024765,0.117567,0.643353,0.000155,0.000111,5.6e-05,4.6e-05,0.000279,9.4e-05,...,0.035434,0.128622,0.743444,0.000191,0.000265,0.000225,0.000136,0.000431,0.000109,0
4,0.013505,0.014503,0.085667,0.614589,7e-05,3.2e-05,1.3e-05,1.7e-05,0.000151,4.9e-05,...,0.020104,0.095853,0.777179,9.3e-05,6.6e-05,4.4e-05,5.4e-05,0.000234,5.3e-05,0
5,0.012577,0.011691,0.084037,0.577521,8.8e-05,0.000175,0.00017,8e-05,0.000207,5.8e-05,...,0.015643,0.102566,0.804216,0.000114,0.000488,0.000517,0.000196,0.000383,8.2e-05,1
6,0.018166,0.026523,0.116422,0.665528,0.000187,0.00011,9.1e-05,0.000113,0.000308,9e-05,...,0.029014,0.121402,0.721752,0.000208,0.000128,0.000109,0.000137,0.000339,9.2e-05,0
7,0.014765,0.025524,0.095075,0.633546,8.6e-05,7.3e-05,5e-05,2.5e-05,0.00019,5.8e-05,...,0.028284,0.105233,0.758108,0.000101,0.000155,0.000146,6.6e-05,0.000286,6.8e-05,0
8,0.011752,0.020059,0.07637,0.558434,8e-05,7.6e-05,9.6e-05,5.1e-05,0.000128,5e-05,...,0.025888,0.094449,0.818247,0.000129,0.000184,0.000314,0.000119,0.000207,7.4e-05,1
9,0.008901,0.012227,0.059078,0.53674,4.3e-05,7.9e-05,7.7e-05,3.5e-05,8.6e-05,2.9e-05,...,0.014273,0.074552,0.837443,7e-05,0.000304,0.000281,8.2e-05,0.00017,5.2e-05,1


## Data Preparation

In [35]:
def prepareData(k,dataframe,fold_no):
    length = int(len(dataframe)/k)
    test = pd.DataFrame(dataframe,index=range(fold_no*length,fold_no*length + length))
    train = dataframe.loc[~dataframe.index.isin(test.index)]
    return train,test   

## Naive Bayes Code

In [36]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [57]:
def mean(numbers):
    return np.mean(numbers)

In [58]:
def stdev(numbers):
    return np.std(numbers)

In [59]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [60]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

In [61]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [62]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

In [63]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [64]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

## Calculate Scores

In [65]:
def calcScores(TP,TN,FP,FN):
    accuracy = (float(TP) + TN) / (TP + TN + FP + FN)
    precision = (float(TP) / (TP + FP))
    f_score = (2.0 * TP) / (2 * TP + FP + FN)
    recall = (float(TP)) / (TP + FN)
    scores = [accuracy,precision,f_score,recall]
    print("Accuracy: " + str(accuracy) + "\tPrecision: " + str(precision))
    print("F-Score: " + str(f_score) + "\tRecall: " + str(recall)+ "\n")
    return scores

## Build Confusion matrix

In [66]:
def buildConfusionMatrix(testSet, predictions):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for x in range(len(testSet) - 1):
        if testSet[x][-1] == predictions[x] and testSet[x][-1] == 1.0:
            TP += 1
        elif testSet[x][-1] == predictions[x] and testSet[x][-1] == 0.0:
            TN += 1
        elif testSet[x][-1] != predictions[x] and testSet[x][-1] == 1.0:
            FN += 1
        else:
            FP += 1
    return ([TP, TN, FP, FN])

## Evaluate Algorithm

In [168]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, n_folds, *args):
    folds_scores = dict()
    counter = 0
    sumAccuracy = 0
    sumPrecision = 0
    sumFScore = 0 
    sumRecall = 0
    for x in range(n_folds):
        train_set,test_set = prepareData(n_folds,dataset,x)
        train_set = train_set.values.tolist()
        test_set = test_set.values.tolist()
        test = pd.DataFrame(test_set)
        test = test.values.tolist()
        print('Train set: ' + repr(len(train_set)))
        print('Test set: ' + repr(len(test_set)) + '\n')    
        for y in range(len(test_set)):
            summaries = summarizeByClass(train_set)
            predictions = getPredictions(summaries, test_set)
        [TP, TN, FP, FN] = buildConfusionMatrix(test_set, predictions)
        print('Scores for fold ' + str(x+1) + ' are: \n')
        folds_scores[counter] = calcScores(TP,TN,FP,FN)
        counter += 1
    for z in range(counter):
        sumAccuracy += folds_scores[z][0]
        sumPrecision += folds_scores[z][1]
        sumFScore += folds_scores[z][2]
        sumRecall += folds_scores[z][3]
    print('Average accuracy is: ' + repr(sumAccuracy/n_folds))
    print('Average precision is: ' + repr(sumPrecision/n_folds))
    print('Average f-score is: ' + repr(sumFScore/n_folds))
    print('Average recall is: ' + repr(sumRecall/n_folds))
    return folds_scores

In [169]:
def main():
    n_folds = 10
    folds_scores = evaluate_algorithm(dataframe, n_folds)

In [170]:
main()

Train set: 513
Test set: 56

Scores for fold 1 are: 

Accuracy: 0.8	Precision: 0.6774193548387096
F-Score: 0.7924528301886793	Recall: 0.9545454545454546

Train set: 513
Test set: 56

Scores for fold 2 are: 

Accuracy: 0.7818181818181819	Precision: 0.6153846153846154
F-Score: 0.7272727272727273	Recall: 0.8888888888888888

Train set: 513
Test set: 56

Scores for fold 3 are: 

Accuracy: 0.7454545454545455	Precision: 0.45
F-Score: 0.5625	Recall: 0.75

Train set: 513
Test set: 56

Scores for fold 4 are: 

Accuracy: 0.7636363636363637	Precision: 0.6206896551724138
F-Score: 0.7346938775510204	Recall: 0.9

Train set: 513
Test set: 56

Scores for fold 5 are: 

Accuracy: 0.8	Precision: 0.6666666666666666
F-Score: 0.7843137254901961	Recall: 0.9523809523809523

Train set: 513
Test set: 56

Scores for fold 6 are: 

Accuracy: 0.7818181818181819	Precision: 0.6666666666666666
F-Score: 0.7272727272727273	Recall: 0.8

Train set: 513
Test set: 56

Scores for fold 7 are: 

Accuracy: 0.8545454545454545	Pre

References: https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/