In [1]:
import numpy as np
from math import log ,factorial
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score,f1_score
from itertools import combinations
import matplotlib.pyplot as plt
%matplotlib inline
import io

In [2]:
filename = "spambase.data"
r = io.open(filename, encoding='utf8').readlines()
X = []
Y = []
for i in r:
    x = i.split(',')
    X.append(map(float,x[1:len(x)-1]))
    Y.append([int(x[-1])])
Y = np.array(Y)
X = np.array(X)

In [3]:
def binarizer(data):
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] != 0:
                data[i][j] = 1.0
    return data

In [4]:
def find_alpha(class_labels):
    alpha1 = {}
    class_count1 = {}
    for i in class_labels:
        if i[0] not in class_count1.keys():
            class_count1[i[0]] = 1
        else:
            class_count1[i[0]] += 1
    classes1 = class_count1.keys()
    for j in  class_count1:
        alpha1[j] = class_count1[j]*1.0/len(class_labels)
    return classes1,class_count1,alpha1

In [5]:
def segregate_data(data,labels,clabels):
    multi_X = {}
    for i in range(len(clabels)):
        if clabels[i] not in multi_X.keys():
            multi_X[clabels[i]] = []
        for j in range(len(labels)):
            if labels[j][0] == clabels[i]:
                multi_X[clabels[i]].append(data[j]) 
    for i in multi_X:
        multi_X[i] = np.array(multi_X[i])
    return multi_X

In [6]:
def find_mean_multi(data):
    multi_mean1 = {}
    multi_s ={}
    for i in data:
        multi_mean1[i] = np.mean(data[i],axis =0)
        multi_s[i] = np.sum(data[i],axis = 0)
    return multi_s,multi_mean1

In [7]:
def smoothing(msum,sdata,sv):
    smean = {}
    for i in msum:
        smean[i]= []
        for j in range(len(msum[i])):
            smean[i].append(float(msum[i][j]+sv)/(len(sdata[i])+2*sv))
    for i in smean:
        smean[i] = np.array(smean[i])
    return smean

In [8]:
def find_covariance_multi(data_split,split_mean,n):
    multi_intrim= {}
    covar = {}
    for i in data_split:
        if i not in multi_intrim.keys():
            multi_intrim[i] = []
        for j in data_split[i]:
            multi_intrim[i].append(j-split_mean[i])
    for i in multi_intrim:
        covar[i] = np.dot(np.array(multi_intrim[i]).transpose(),np.array(multi_intrim[i]))/float(n[i])
    return covar

In [9]:
def training(x,y,sv):
    cl,n,alph = find_alpha(y)
    data_seg = segregate_data(x,y,cl)
    mean_s, mean_cl = find_mean_multi(data_seg)
    sme = smoothing(mean_s,data_seg,sv)
    return alph,sme

In [10]:
def membership_naive(data,mean,prior):
    mem = []
    for i in mean:
        c = 0
        for j in range(len(mean[i])):
            if data[j] == 1:
                a = log(mean[i][j])
            elif data[j] == 0:
                a = log(1-mean[i][j])   
            c= c+a
        c = c+prior[i]
        mem.append(c)
    return mem


In [11]:
def determinist(data):
    return data.index(max(data))

In [12]:
def find_class(data,clabels):
    return clabels[data]

In [13]:
def prediction(data,mean,prior,clabels):
    member = []
    for i in data:
        member.append(membership_naive(i,mean,prior))
    determine =[]
    for j in member:
        determine.append(determinist(j))
    predict = [find_class(i,clabels) for i in determine]
    return predict

In [14]:
def mean_squrae_error(pred,y):
    return sum([(i-j)**2 for i,j in zip(pred,y)])/float(len(pred))

In [15]:
def find_confusion_matrix(clabels,actual,predicted):
    cm= []
    for i in clabels:
        tmp =[0]*len(clabels)
        for j in range(len(actual)):
            if actual[j][0] == i and actual[j][0] == predicted[j]:
                tmp[clabels.index(i)] += 1
            elif actual[j][0] == i and actual[j][0] != predicted[j]:
                tmp[clabels.index(predicted[j])] += 1
        cm.append(tmp)
    return np.array(cm)

In [16]:
def find_accuracy(matrix):
    return np.trace(matrix)*1.0/np.sum(matrix)

In [17]:
def find_precision(matrix):
    pres = []
    x = np.sum(matrix,axis=0)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                pres.append(matrix[i][j]*1.0/x[i])
    return pres

In [18]:
def find_recall(matrix):
    rec = []
    x = np.sum(matrix,axis=1)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                rec.append(matrix[i][j]*1.0/x[i])
    return rec

In [19]:
def find_fmeasure(prec,rec):
    tmp = []
    for i,j in zip(prec,rec):
        tmp.append(2.0*(i*j)/(i+j))
    return tmp

In [20]:
def roc(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    precision = find_precision(confmatrix)
    recall = find_recall(confmatrix)
    return precision,recall

In [32]:
def cross_validation(data, labels,sv,clabels, n_folds=10,MSE = False):
    cv = KFold(len(labels), n_folds,shuffle= True)
    accuracies = []
    training_MSE_list =[]
    testing_MSE_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_alpha,train_mean = training(data[train_ind], labels[train_ind],sv)
        training_MSE = mean_squrae_error(prediction(data[train_ind],train_mean,train_alpha,clabels), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = prediction(data[test_ind],train_mean,train_alpha,clabels)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
        accuracies.append(accuracy_score(labels[test_ind], predict))
        
    if MSE == True:
        for i in range(len(testing_MSE_list)):
            print 'Fold',i,'Testing Error',testing_MSE_list[i]
        print "Average Mean Square Error"
        print "Training Error \t Testing Error"
        print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    else:
        for i in range(len(accuracies)):
            print 'Fold',i,'Accuracy',accuracies[i]
        print "Average Accuracy ", np.mean(accuracies)

In [38]:
def cross_validation(data, labels,sv,clabels, n_folds=10,MSE = False):
    cv = KFold(len(labels), n_folds,shuffle= True)
    accuracies = []
    precision_list = []
    recall_list = []
    training_MSE_list =[]
    testing_MSE_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_alpha,train_mean = training(data[train_ind], labels[train_ind],sv)
        training_MSE = mean_squrae_error(prediction(data[train_ind],train_mean,train_alpha,clabels), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = prediction(data[test_ind],train_mean,train_alpha,clabels)
        p,r = roc(clabels,labels[test_ind],predict)
        precision_list.append(p)
        recall_list.append(r)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
        accuracies.append(accuracy_score(labels[test_ind], predict))
        
    if MSE == True:
        for i in range(len(testing_MSE_list)):
            print 'Fold',i,'Testing Error',testing_MSE_list[i]
        print "Average Mean Square Error"
        print "Training Error \t Testing Error"
        print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    else:
        for i in range(len(accuracies)):
            print 'Fold',i,'Accuracy',accuracies[i]
        print "Average Accuracy ", np.mean(accuracies)
    return precision_list,recall_list

In [22]:
def evaluation(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    print "Confusion Matrix"
    print confmatrix
    accuracy = find_accuracy(confmatrix)
    print "Accuracy", accuracy
    precision = find_precision(confmatrix)
    print "Precision", precision
    recall = find_recall(confmatrix)
    print "Recall", recall
    f_score =find_fmeasure(precision,recall)
    print "F_score", f_score

In [23]:
X = binarizer(X)

In [24]:
classes,class_count,alpha = find_alpha(Y)
print "The Classses are", classes
print "The Classes Count ", class_count
print "The prior probabiliy", alpha
multi_X_split =segregate_data(X,Y,classes)

The Classses are [0, 1]
The Classes Count  {0: 2788, 1: 1813}
The prior probabiliy {0: 0.6059552271245382, 1: 0.39404477287546186}


In [25]:
multi_sum,multi_mean = find_mean_multi(multi_X_split)
smoothing_value = 1
smoothed_mean = smoothing(multi_sum,multi_X_split,smoothing_value)
print "Smoothed Mean for mulivariate features"
for i in smoothed_mean:
    print i,smoothed_mean[i]

Smoothed Mean for mulivariate features
0 [ 0.09820789  0.27741935  0.00322581  0.22043011  0.11433692  0.01577061
  0.07383513  0.07849462  0.17060932  0.05125448  0.42401434  0.11935484
  0.04551971  0.01792115  0.090681    0.09569892  0.12580645  0.58064516
  0.0172043   0.34336918  0.00824373  0.02795699  0.01971326  0.37311828
  0.28136201  0.27706093  0.15555556  0.12939068  0.16200717  0.10430108
  0.07311828  0.12365591  0.07383513  0.15770609  0.17491039  0.26129032
  0.01863799  0.11577061  0.09032258  0.05304659  0.11541219  0.10430108
  0.10071685  0.29569892  0.16129032  0.01612903  0.06738351  0.18637993
  0.5516129   0.14336918  0.26810036  0.1046595   0.08243728  0.99964158
  0.99964158  0.99964158]
1 [ 0.34490358  0.61487603  0.02203857  0.62534435  0.37575758  0.4214876
  0.3415978   0.30633609  0.45619835  0.31294766  0.63030303  0.28705234
  0.12782369  0.15867769  0.54545455  0.384573    0.37961433  0.88650138
  0.20826446  0.80826446  0.05289256  0.3322314   0.3757

In [26]:
a,m = training(X,Y,smoothing_value)
predictions = prediction(X,m,a,classes)
print "Predicted Value \t True Value "
for i in range(10,21):
    print predictions[i],"\t\t\t",Y[i]

Predicted Value 	 True Value 
1 			[1]
1 			[1]
1 			[1]
1 			[1]
1 			[1]
1 			[1]
0 			[1]
1 			[1]
1 			[1]
1 			[1]
0 			[1]


In [27]:
cross_validation(X, Y,smoothing_value,classes)

Fold 0 Accuracy 0.889370932755
Fold 1 Accuracy 0.89347826087
Fold 2 Accuracy 0.90652173913
Fold 3 Accuracy 0.889130434783
Fold 4 Accuracy 0.889130434783
Fold 5 Accuracy 0.880434782609
Fold 6 Accuracy 0.913043478261
Fold 7 Accuracy 0.882608695652
Fold 8 Accuracy 0.886956521739
Fold 9 Accuracy 0.865217391304
Average Accuracy  0.889589267189


In [39]:
cross_validation(X, Y, smoothing_value,classes,MSE = True)

Fold 0 Testing Error [ 0.11930586]
Fold 1 Testing Error [ 0.09782609]
Fold 2 Testing Error [ 0.09782609]
Fold 3 Testing Error [ 0.1]
Fold 4 Testing Error [ 0.10434783]
Fold 5 Testing Error [ 0.11956522]
Fold 6 Testing Error [ 0.12391304]
Fold 7 Testing Error [ 0.12173913]
Fold 8 Testing Error [ 0.11086957]
Fold 9 Testing Error [ 0.11304348]
Average Mean Square Error
Training Error 	 Testing Error
0.110120977103 	0.110843629162


([[0.85858585858585856, 0.92073170731707321],
  [0.90666666666666662, 0.89375000000000004],
  [0.91078066914498146, 0.89005235602094246],
  [0.88513513513513509, 0.92682926829268297],
  [0.90969899665551834, 0.86956521739130432],
  [0.88235294117647056, 0.87765957446808507],
  [0.89198606271777003, 0.8497109826589595],
  [0.8896551724137931, 0.85882352941176465],
  [0.89456869009584661, 0.87755102040816324],
  [0.89323843416370108, 0.87709497206703912]],
 [[0.95149253731343286, 0.78238341968911918],
  [0.94117647058823528, 0.83625730994152048],
  [0.92105263157894735, 0.87628865979381443],
  [0.95620437956204385, 0.81720430107526887],
  [0.92832764505119458, 0.83832335329341312],
  [0.9125475285171103, 0.8375634517766497],
  [0.90780141843971629, 0.8258426966292135],
  [0.91489361702127658, 0.8202247191011236],
  [0.93959731543624159, 0.79629629629629628],
  [0.91941391941391937, 0.83957219251336901]])

In [29]:
evaluation(classes,Y,predictions)

Confusion Matrix
[[2594  194]
 [ 314 1499]]
Accuracy 0.889589219735
Precision [0.89202200825309486, 0.88541051388068515]
Recall [0.93041606886657102, 0.82680639823496971]
F_score [0.91081460674157311, 0.85510553337136341]
