In [2]:
import numpy as np
import math
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score,f1_score
import io

In [3]:
# Parse the comma-separated data file into feature rows (X) and label rows (Y).
# For every line the first field is skipped, the last field is the class label
# and every field in between is an integer feature.
filename = "breast-cancer-wisconsin.data.txt"
# `with` guarantees the file handle is closed once the lines have been read.
with io.open(filename, encoding='utf8') as handle:
    r = handle.readlines()
X = []
Y = []
for i in r:
    if not i.strip():
        # A blank line (e.g. a trailing newline) carries no sample; skip it
        # instead of crashing on int('') below.
        continue
    x = i.split(',')
    features = []
    for j in range(1, len(x) - 1):
        try:
            features.append(int(x[j]))
        except ValueError:
            # Fields that are not valid integers are imputed as 0.
            features.append(0)
    # Append a real list (not a lazy `map` object) so np.array(X) builds a
    # numeric 2-D array on both Python 2 and Python 3.
    X.append(features)
    Y.append([int(x[-1])])

In [4]:
# Turn the parsed Python lists into NumPy arrays; later cells index them with
# arrays of row indices (see cross_validation).
X = np.array(X)
Y = np.array(Y)

In [5]:
def find_alpha(class_labels):
    """Count class occurrences and derive the prior probability of each class.

    Parameters
    ----------
    class_labels : sequence of indexable rows
        Each row is indexed with ``[0]`` to obtain its class label.

    Returns
    -------
    tuple
        ``(classes, counts, priors)`` where ``classes`` is ``counts.keys()``,
        ``counts`` maps label -> number of occurrences and ``priors`` maps
        label -> occurrences / ``len(class_labels)``.
    """
    counts = {}
    for row in class_labels:
        label = row[0]
        counts[label] = counts.get(label, 0) + 1
    total = len(class_labels)
    priors = {label: counts[label] * 1.0 / total for label in counts}
    return counts.keys(), counts, priors

In [6]:
def find_mean(data,labels,n):
    """Compute the per-class mean of the first feature of every sample.

    Parameters
    ----------
    data : sequence of indexable rows
        Only ``data[i][0]`` is read for each sample.
    labels : sequence of indexable rows
        ``labels[i][0]`` is the class of sample ``i``.
    n : dict
        Maps class label -> number of samples in that class; also determines
        which classes appear in the result.

    Returns
    -------
    dict
        Maps class label -> sum of first-feature values / ``n[label]``.
    """
    totals = {}
    for idx, row in enumerate(labels):
        cls = row[0]
        value = data[idx][0]
        if cls in totals:
            totals[cls] += value
        else:
            totals[cls] = value
    return {cls: totals[cls] * 1.0 / n[cls] for cls in n}

In [7]:
def find_covariance(data,labels,mean,n):
    """Compute the per-class mean squared deviation of the first feature.

    For one feature this is the (biased) variance of each class, not a
    standard deviation.

    Parameters
    ----------
    data : sequence of indexable rows
        Only ``data[i][0]`` is read for each sample.
    labels : sequence of indexable rows
        ``labels[i][0]`` is the class of sample ``i``.
    mean : dict
        Maps class label -> mean of the feature for that class.
    n : dict
        Maps class label -> number of samples in that class.

    Returns
    -------
    dict
        Maps class label -> sum((value - mean[label])**2) / ``n[label]``.
    """
    squared_dev = {}
    for idx, row in enumerate(labels):
        cls = row[0]
        deviation = (data[idx][0] - mean[cls]) ** 2
        if cls in squared_dev:
            squared_dev[cls] += deviation
        else:
            squared_dev[cls] = deviation
    return {cls: squared_dev[cls] * 1.0 / n[cls] for cls in squared_dev}

In [8]:
def training(x,y):
    """Fit the per-class model parameters from samples ``x`` and labels ``y``.

    Returns
    -------
    tuple
        ``(priors, means, variances)``: the prior dict from ``find_alpha`` and
        the per-class dicts from ``find_mean`` and ``find_covariance``.
    """
    # The class-key list returned by find_alpha is not needed here.
    _, counts, priors = find_alpha(y)
    means = find_mean(x, y, counts)
    variances = find_covariance(x, y, means, counts)
    return priors, means, variances

In [9]:
def membership(data,sigma,mean,prior):
    """Evaluate the Gaussian log-discriminant of one sample for every class.

    For each class ``c`` in ``mean`` the score is::

        log(prior[c]) - 0.5 * log(var_c) - (data - mean[c])**2 / (2 * var_c)

    Parameters
    ----------
    data : number
        Scalar feature value of the sample.
    sigma : dict
        Maps class label -> *variance* of the feature for that class. Despite
        the parameter name, this is what ``find_covariance`` produces (a mean
        squared deviation) and what every caller in this notebook passes, so
        it must not be squared again and its log must be halved.
    mean : dict
        Maps class label -> mean of the feature for that class.
    prior : dict
        Maps class label -> prior probability of that class.

    Returns
    -------
    list
        One score per class, in the iteration order of ``mean``.
    """
    mem = []
    for i in mean:
        variance = sigma[i]
        # Quadratic term of the Gaussian exponent, using the variance directly.
        quad = ((data - mean[i]) ** 2) * 1.0 / (2 * variance)
        # log(std) == 0.5 * log(variance)
        mem.append(math.log(prior[i]) - 0.5 * math.log(variance) - quad)
    return mem

In [10]:
def determinist(data):
    """Return the position of the largest value in the list ``data``.

    When the maximum occurs more than once, the first position is returned.
    """
    best = max(data)
    return data.index(best)

In [11]:
def find_class(data,clabels):
    """Look up the class label stored at position ``data`` of ``clabels``."""
    label = clabels[data]
    return label

In [12]:
def prediction(data,sigma,mean,prior,clabels):
    """Predict a class label for every sample in ``data``.

    Only the first feature (``sample[0]``) of each sample is used. Each sample
    is scored per class with ``membership``, the best-scoring position is
    picked with ``determinist`` and mapped to a label through ``clabels``.

    Returns
    -------
    list
        One predicted label per sample, in input order.
    """
    # Stage 1: per-class scores for every sample.
    scores = [membership(sample[0], sigma, mean, prior) for sample in data]
    # Stage 2: position of the best score for every sample.
    winners = [determinist(row) for row in scores]
    # Stage 3: translate positions into class labels.
    return [find_class(winner, clabels) for winner in winners]

In [13]:
def mean_squrae_error(pred,y):
    """Return the sum of squared differences of paired items over ``len(pred)``.

    Items are paired with ``zip``, so pairing stops at the shorter input while
    the divisor is always ``len(pred)``.
    """
    total = 0
    for predicted, actual in zip(pred, y):
        # Plain `+` (not `+=`) mirrors what the built-in sum() does.
        total = total + (predicted - actual) ** 2
    return total / float(len(pred))

In [14]:
def cross_validation(data, labels,clabels, n_folds=10,MSE = False):
    cv = KFold(len(labels), n_folds)
    accuracies = []
    training_MSE_list =[]
    testing_MSE_list = []
    precision_list = []
    recall_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_alpha,train_mean,train_covar = training(data[train_ind], labels[train_ind])
        training_MSE = mean_squrae_error(prediction(data[train_ind],train_covar,train_mean,train_alpha,clabels), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = prediction(data[test_ind],train_covar,train_mean,train_alpha,clabels)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
        accuracies.append(accuracy_score(labels[test_ind], predict))
        p,r = roc(clabels,labels[test_ind],predict)
        precision_list.append(p)
        recall_list.append(r)
    if MSE == True:
        for i in range(len(testing_MSE_list)):
            print 'Fold',i,'Testing Error',testing_MSE_list[i]
        print "Average Mean Square Error"
        print "Training Error \t Testing Error"
        print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    else:
        for i in range(len(accuracies)):
            print 'Fold',i,'Accuracy',accuracies[i]
        print "Average Accuracy ", np.mean(accuracies)
    return precision_list,recall_list

In [15]:
def find_confusion_matrix(clabels,actual,predicted):
    """Build a confusion matrix with true classes as rows, predictions as columns.

    Parameters
    ----------
    clabels : list
        Class labels; their positions define the row and column order.
    actual : sequence of indexable rows
        ``actual[j][0]`` is the true label of sample ``j``.
    predicted : sequence
        ``predicted[j]`` is the predicted label of sample ``j``.

    Returns
    -------
    numpy.ndarray
        ``len(clabels)`` x ``len(clabels)`` array of counts.
    """
    rows = []
    for cls in clabels:
        counts = [0] * len(clabels)
        for idx in range(len(actual)):
            if actual[idx][0] != cls:
                continue
            # Correct or not, the sample is counted in the column of its
            # predicted label (for a correct one that is the diagonal cell).
            counts[clabels.index(predicted[idx])] += 1
        rows.append(counts)
    return np.array(rows)
            

In [16]:
def find_accuracy(matrix):
    """Return the diagonal sum of ``matrix`` divided by the sum of all entries."""
    correct = np.trace(matrix)
    total = np.sum(matrix)
    return correct * 1.0 / total

In [17]:
def find_precision(matrix):
    """Return, per class, the diagonal entry divided by its column total.

    ``matrix`` is indexed as ``matrix[k][k]`` and summed along axis 0, so
    column ``k`` holds everything predicted as class ``k``.
    """
    column_totals = np.sum(matrix, axis=0)
    return [matrix[k][k] * 1.0 / column_totals[k] for k in range(len(matrix))]

In [18]:
def find_recall(matrix):
    """Return, per class, the diagonal entry divided by its row total.

    ``matrix`` is indexed as ``matrix[k][k]`` and summed along axis 1, so
    row ``k`` holds every sample whose true class is ``k``.
    """
    row_totals = np.sum(matrix, axis=1)
    return [matrix[k][k] * 1.0 / row_totals[k] for k in range(len(matrix))]

In [19]:
def find_fmeasure(prec,rec):
    """Return the per-class F-measure, ``2 * p * r / (p + r)``.

    Parameters
    ----------
    prec, rec : sequences of numbers
        Per-class precision and recall, paired position by position.

    Returns
    -------
    list
        One F-measure per pair. A class whose precision and recall sum to
        zero gets ``0.0`` instead of a division by zero.
    """
    scores = []
    for p, r in zip(prec, rec):
        denom = p + r
        if denom == 0:
            # Harmonic mean of two zeros: define as 0 rather than raising
            # ZeroDivisionError (plain floats) or yielding NaN (NumPy floats).
            scores.append(0.0)
        else:
            scores.append(2.0 * (p * r) / denom)
    return scores

In [20]:
def roc(clabels,acutal,predicted):
    """Return the per-class ``(precision, recall)`` lists for the predictions.

    Despite the name, no ROC curve is computed: the confusion matrix is built
    once and fed to ``find_precision`` and ``find_recall``.
    """
    cm = find_confusion_matrix(clabels, acutal, predicted)
    return find_precision(cm), find_recall(cm)

In [21]:
def evaluation(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    print "Confusion Matrix"
    print confmatrix
    accuracy = find_accuracy(confmatrix)
    print "Accuracy", accuracy
    precision = find_precision(confmatrix)
    print "Precision", precision
    recall = find_recall(confmatrix)
    print "Recall", recall
    f_score =find_fmeasure(precision,recall)
    print "F_score", f_score

In [22]:
# Class labels, per-class sample counts and prior probabilities computed over
# the full label array Y.
classes,class_count,alpha = find_alpha(Y)
print "The Classses are", classes
print "The Classes Count ", class_count
print "The prior probabiliy", alpha

The Classses are [2, 4]
The Classes Count  {2: 458, 4: 241}
The prior probabiliy {2: 0.6552217453505007, 4: 0.3447782546494993}


In [23]:
# Univariate data set: keep only the feature at column index 2 of X, with each
# value wrapped in its own one-element row.
X_uni = np.array([[row[2]] for row in X])

In [24]:
mean_class = find_mean(X_uni,Y,class_count)
print "The Mean for each class"
for i in mean_class:
    print i,"\t",mean_class[i]

The Mean for each class
2 	1.44323144105
4 	6.5601659751


In [25]:
covariance_class = find_covariance(X_uni,Y,mean_class,class_count)
print "The sigma for each class"
for i in covariance_class:
    print i,"\t",covariance_class[i]

The sigma for each class
2 	0.993502221544
4 	6.53683648698


In [26]:
predictions = prediction(X_uni,covariance_class,mean_class,alpha,classes)
print "Predicted Value \t True Value "
for i in range(11,21):
    print predictions[i],"\t\t\t",Y[i]

Predicted Value 	 True Value 
2 			[2]
2 			[4]
2 			[2]
4 			[4]
4 			[4]
2 			[2]
2 			[2]
4 			[4]
2 			[2]
2 			[4]


In [27]:
# 10-fold cross-validation (default n_folds); prints per-fold accuracy and
# keeps the per-fold precision/recall lists for the table further down.
pre, rec = cross_validation(data=X_uni, labels=Y, clabels=classes)

Fold 0 Accuracy 0.842857142857
Fold 1 Accuracy 0.928571428571
Fold 2 Accuracy 0.971428571429
Fold 3 Accuracy 0.914285714286
Fold 4 Accuracy 0.842857142857
Fold 5 Accuracy 0.914285714286
Fold 6 Accuracy 0.928571428571
Fold 7 Accuracy 0.985714285714
Fold 8 Accuracy 0.957142857143
Fold 9 Accuracy 0.942028985507
Average Accuracy  0.922774327122


In [28]:
# Same cross-validation, reporting mean squared error instead of accuracy.
# The returned (precision, recall) lists are echoed as the cell output because
# the call is the cell's last expression.
cross_validation(data=X_uni, labels=Y, clabels=classes, MSE=True)

Fold 0 Testing Error [ 0.62857143]
Fold 1 Testing Error [ 0.28571429]
Fold 2 Testing Error [ 0.11428571]
Fold 3 Testing Error [ 0.34285714]
Fold 4 Testing Error [ 0.62857143]
Fold 5 Testing Error [ 0.34285714]
Fold 6 Testing Error [ 0.28571429]
Fold 7 Testing Error [ 0.05714286]
Fold 8 Testing Error [ 0.17142857]
Fold 9 Testing Error [ 0.23188406]
Average Mean Square Error
Training Error 	 Testing Error
0.309011532541 	0.308902691511


([[0.80000000000000004, 0.90000000000000002],
  [0.91304347826086951, 0.95833333333333337],
  [0.97727272727272729, 0.96153846153846156],
  [0.93333333333333335, 0.90000000000000002],
  [0.80952380952380953, 0.8928571428571429],
  [0.94444444444444442, 0.8125],
  [0.94117647058823528, 0.89473684210526316],
  [0.98333333333333328, 1.0],
  [0.95918367346938771, 0.95238095238095233],
  [1.0, 0.76470588235294112]],
 [[0.91428571428571426, 0.77142857142857146],
  [0.97674418604651159, 0.85185185185185186],
  [0.97727272727272729, 0.96153846153846156],
  [0.875, 0.94736842105263153],
  [0.91891891891891897, 0.75757575757575757],
  [0.94444444444444442, 0.8125],
  [0.95999999999999996, 0.84999999999999998],
  [1.0, 0.90909090909090906],
  [0.97916666666666663, 0.90909090909090906],
  [0.9285714285714286, 1.0]])

In [29]:
# Reference confusion matrix from scikit-learn, to compare with the hand-rolled
# find_confusion_matrix output shown by evaluation().
confusion_matrix(y_true=Y, y_pred=predictions)

array([[436,  22],
       [ 32, 209]])

In [30]:
# Print the hand-rolled confusion matrix, accuracy, precision, recall and
# F-score for the predictions made on the full data set.
evaluation(classes,Y,predictions)

Confusion Matrix
[[436  22]
 [ 32 209]]
Accuracy 0.922746781116
Precision [0.93162393162393164, 0.90476190476190477]
Recall [0.95196506550218341, 0.86721991701244816]
F_score [0.94168466522678185, 0.88559322033898302]


In [33]:
print "Precision"
print"Class2 \t\t Class4"
for i in pre:
    print i[0]," \t\t ",i[1]
print "Mean Precision"
print np.mean(pre,axis = 0)

Precision
Class2 		 Class4
0.8  		  0.9
0.913043478261  		  0.958333333333
0.977272727273  		  0.961538461538
0.933333333333  		  0.9
0.809523809524  		  0.892857142857
0.944444444444  		  0.8125
0.941176470588  		  0.894736842105
0.983333333333  		  1.0
0.959183673469  		  0.952380952381
1.0  		  0.764705882353
Mean Precision
[ 0.92613113  0.90370526]


In [34]:
# scikit-learn per-class precision (average=None), as a cross-check of
# find_precision; Y is flattened to a plain list of labels first.
precision_score(Y[:, 0].tolist(), predictions, pos_label=None, average=None)

array([ 0.93162393,  0.9047619 ])

In [35]:
# scikit-learn per-class recall (average=None), as a cross-check of
# find_recall; Y is flattened to a plain list of labels first.
recall_score(Y[:, 0].tolist(), predictions, pos_label=None, average=None)

array([ 0.95196507,  0.86721992])

In [36]:
# scikit-learn per-class F1 (average=None), as a cross-check of
# find_fmeasure; Y is flattened to a plain list of labels first.
f1_score(Y[:, 0].tolist(), predictions, pos_label=None, average=None)

array([ 0.94168467,  0.88559322])