In [0]:
import numpy as np
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [0]:
# Read the dataset
data = []
with open('spambase.data') as inputFile:
  data = inputFile.readlines()

In [0]:
# preprocess the data
dataSplit = [line.split(",") for line in data]
spamData = [[float(d) for d in datapoint] for datapoint in dataSplit]

# split to features and labels
X = [datapoint[:-1] for datapoint in spamData]
y = [datapoint[-1] for datapoint in spamData]

In [0]:
#Create a Gaussian Classifier
model = GaussianNB()

In [0]:
def k_fold_generator(X, y, k_fold):
    subset_size = len(X) // k_fold  # Cast to int if using Python 3
    for k in range(k_fold):
        X_train = X[:k * subset_size] + X[(k + 1) * subset_size:]
        X_test = X[k * subset_size:][:subset_size]
        y_train = y[:k * subset_size] + y[(k + 1) * subset_size:]
        y_test = y[k * subset_size:][:subset_size]

        yield X_train, y_train, X_test, y_test


In [80]:
Perform k-fold cross validation for k=10
k_fold = 10
print("Numer of samples in each k-fold : ", len(X)//k_fold)
spamMetrics = []
k = 1
# Train the model using the training sets using k-fold cross validation
for X_train, y_train, X_test, y_test in k_fold_generator(X, y, k_fold):
  X_train, y_train, X_test, y_test = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test) 
  model.fit(X_train,y_train)
  
  #Predict the response for test dataset
  y_pred = model.predict(X_test)
  
  # false positives, false negatives, overall error rates 
  FP = 0
  FN = 0
  OE = 0
  for i in range(len(y_pred)):
    if y_pred[i]!=y_test[i]:
      OE += 1
      if y_pred[i]==1:
        FP += 1
      else:
        FN += 1
  FP=FP/len(y_pred)
  FN=FN/len(y_pred)
  OE=OE/len(y_pred)
  spamMetrics.append([FP, FN, OE])
  
  # Model Accuracy, how often is the classifier correct?
  print("Accuracy for k=", k, " :",metrics.accuracy_score(y_test, y_pred), "FP : ", FP, ", FN : ", FN, ", OE : ", OE)
  k += 1

avgFP, avgFN, avgOE = 0, 0, 0
for v in spamMetrics:
  avgFP+=v[0]
  avgFN+=v[1]
  avgOE+=v[2]
  
avgFP = avgFP/k_fold
avgFN = avgFN/k_fold
avgOE = avgOE/k_fold

spamMetrics.append([avgFP, avgFN, avgOE])


Numer of samples in each k-fold :  460
Accuracy for k= 1  : 0.9739130434782609 FP :  0.0 , FN :  0.02608695652173913 , OE :  0.02608695652173913
Accuracy for k= 2  : 0.9652173913043478 FP :  0.0 , FN :  0.034782608695652174 , OE :  0.034782608695652174
Accuracy for k= 3  : 0.967391304347826 FP :  0.0 , FN :  0.03260869565217391 , OE :  0.03260869565217391
Accuracy for k= 4  : 0.9021739130434783 FP :  0.015217391304347827 , FN :  0.08260869565217391 , OE :  0.09782608695652174
Accuracy for k= 5  : 0.7782608695652173 FP :  0.2217391304347826 , FN :  0.0 , OE :  0.2217391304347826
Accuracy for k= 6  : 0.8173913043478261 FP :  0.1826086956521739 , FN :  0.0 , OE :  0.1826086956521739
Accuracy for k= 7  : 0.8 FP :  0.2 , FN :  0.0 , OE :  0.2
Accuracy for k= 8  : 0.7391304347826086 FP :  0.2608695652173913 , FN :  0.0 , OE :  0.2608695652173913
Accuracy for k= 9  : 0.7673913043478261 FP :  0.2326086956521739 , FN :  0.0 , OE :  0.2326086956521739
Accuracy for k= 10  : 0.46304347826086956 FP

In [81]:
print(" ************** EVALUATING THE RESULTS ***************")

print("_"*65)
print("| Fold number | False Positive | False Negative | Overall Error |")
print("_"*65)
for k in range(k_fold):
  print("|     % 3d     |       % 1.5f |       % 5.5f |      % 1.5f |" %(k+1, spamMetrics[k][0], spamMetrics[k][1], spamMetrics[k][2]))
print("_"*65)
print("|     Avg     |       % 1.5f |       % 5.5f |      % 1.5f |" %( spamMetrics[k][0], spamMetrics[k][1], spamMetrics[k][2]))
print("_"*65)


 ************** EVALUATING THE RESULTS ***************
_________________________________________________________________
| Fold number | False Positive | False Negative | Overall Error |
_________________________________________________________________
|      1      |        0.00000 |        0.02609 |       0.02609 |
|      2      |        0.00000 |        0.03478 |       0.03478 |
|      3      |        0.00000 |        0.03261 |       0.03261 |
|      4      |        0.01522 |        0.08261 |       0.09783 |
|      5      |        0.22174 |        0.00000 |       0.22174 |
|      6      |        0.18261 |        0.00000 |       0.18261 |
|      7      |        0.20000 |        0.00000 |       0.20000 |
|      8      |        0.26087 |        0.00000 |       0.26087 |
|      9      |        0.23261 |        0.00000 |       0.23261 |
|     10      |        0.53696 |        0.00000 |       0.53696 |
_________________________________________________________________
|     Avg     |      