In [1]:
import numpy as np
import math
from copy import deepcopy
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score,f1_score
import matplotlib.pyplot as plt
%matplotlib inline
import io

In [2]:
# Load the banknote-authentication data set.  Every non-blank line is a
# comma-separated record: the feature values first, the integer class label in
# the last field.
filename = "data_banknote_authentication.txt"
# A context manager closes the file handle as soon as the lines are read.
with io.open(filename, encoding='utf8') as data_file:
    r = data_file.readlines()
X = []
Y = []
for line in r:
    if not line.strip():
        # A blank (e.g. trailing) line carries no record; int('') would fail.
        continue
    fields = line.split(',')
    features = []
    for raw_value in fields[:-1]:
        try:
            features.append(float(raw_value))
        except ValueError:
            # Feature values that cannot be parsed are replaced by 0.0.
            features.append(0.0)
    # Keep each row as a real list: create_data() calls insert() on the rows.
    X.append(features)
    Y.append([int(fields[-1])])
Y = np.array(Y)

In [107]:
def create_data(data):
    """Prepend the constant 1 to every row of ``data``.

    Each row is modified in place through its ``insert`` method, and the very
    same ``data`` object that was passed in is returned.
    """
    for row in data:
        row.insert(0, 1)
    return data

In [4]:
def find_alpha(class_labels):
    """Summarise the class distribution of ``class_labels``.

    ``class_labels`` is a sequence of rows whose first element is the label.

    Returns a 3-tuple ``(classes, counts, proportions)``:
      * ``classes``     - the keys of ``counts`` (``dict.keys()``),
      * ``counts``      - dict mapping label -> number of occurrences,
      * ``proportions`` - dict mapping label -> count / len(class_labels).
    """
    counts = {}
    for row in class_labels:
        label = row[0]
        counts[label] = counts.get(label, 0) + 1

    total = len(class_labels)
    proportions = {}
    for label, count in counts.items():
        proportions[label] = count * 1.0 / total
    return counts.keys(), counts, proportions

In [5]:
def map_high_dimension(Data,degree):
    """Return ``Data`` passed through ``PolynomialFeatures(degree).fit_transform``."""
    return PolynomialFeatures(degree).fit_transform(Data)

In [6]:
def segregate_data(data,labels,clabels):
    """Group the rows of ``data`` by class.

    ``labels[j][0]`` is the class of ``data[j]``.  For every class in
    ``clabels`` the matching rows are collected in their original order.

    Returns a dict mapping each class in ``clabels`` to a numpy array of its
    rows (an empty array when no row has that class).
    """
    grouped = {}
    for cls in clabels:
        bucket = grouped.setdefault(cls, [])
        for j in range(len(labels)):
            if labels[j][0] == cls:
                bucket.append(data[j])
    for cls in grouped:
        grouped[cls] = np.array(grouped[cls])
    return grouped

In [7]:
def find_sigmoid(theta,data):
    """Logistic activation of every row of ``data`` under weights ``theta``.

    For each row the value ``1 / (1 + exp(-theta^T . row))`` is computed.

    Returns a numpy array holding one single-element entry per row.
    """
    weights = theta.transpose()
    activations = [[1.0 / (1 + np.exp(-np.dot(weights, row)))] for row in data]
    return np.array(activations)

In [8]:
def gradient_descent(x, y, theta_assume, learning_rate, interation_count):
    """Fit logistic-regression weights by batch gradient descent.

    x                -- feature rows; ``len(x[0])`` fixes the number of weights
    y                -- labels, one single-element row per row of ``x``
    theta_assume     -- initial value given to every weight
    learning_rate    -- step size applied to the summed gradient
    interation_count -- number of update steps to perform

    Returns the weight vector after the last step.
    """
    theta = np.ones(len(x[0]))
    theta.fill(theta_assume)
    for _ in range(interation_count):
        residual = find_sigmoid(theta, x) - y
        # Summed (not averaged) gradient over all rows.
        step = learning_rate * (np.sum(residual * x, axis=0))
        theta = theta - step
    return theta

In [9]:
def prediction(theta,data):
    """Classify every row of ``data`` as 0 or 1 with logistic weights ``theta``.

    A row is labelled 1 when its activation from ``find_sigmoid`` is strictly
    greater than 0.5 and 0 in every other case.  An activation of exactly 0.5
    therefore yields class 0 instead of being dropped, so the result always
    has exactly one entry per activation and stays aligned with the labels.

    Returns a numpy array of single-element rows (``[1]`` or ``[0]``).
    """
    activations = find_sigmoid(theta, data)
    labels = [[1] if row[0] > 0.5 else [0] for row in activations]
    return np.array(labels)

In [10]:
def mean_squrae_error(pred,y):
    """Sum of squared differences of paired ``pred``/``y`` entries, divided by ``len(pred)``."""
    total = 0
    for predicted, actual in zip(pred, y):
        total = total + (predicted - actual) ** 2
    return total / float(len(pred))

In [11]:
def find_confusion_matrix(clabels,actual,predicted):
    """Build the confusion matrix of ``predicted`` against ``actual``.

    clabels   -- iterable of class labels; fixes the row/column order.  Any
                 iterable is accepted (list, tuple, dict keys view, ...).
    actual    -- true labels, one single-element row per sample
    predicted -- predicted labels, one single-element row per sample

    Returns a numpy array in which entry ``[r][c]`` counts the samples whose
    true class is ``clabels[r]`` and whose predicted class is ``clabels[c]``.
    Samples whose true label is not in ``clabels`` are ignored.

    Raises ValueError when a counted sample has a predicted label that is not
    in ``clabels``.
    """
    # Materialise once so that .index() works for every kind of iterable.
    labels = list(clabels)
    matrix = []
    for cls in labels:
        row = [0] * len(labels)
        for j in range(len(actual)):
            if actual[j][0] == cls:
                # Correct and wrong predictions both land in the column of the
                # predicted label, so no separate branches are needed.
                row[labels.index(predicted[j][0])] += 1
        matrix.append(row)
    return np.array(matrix)

In [12]:
def find_accuracy(matrix):
    """Return the trace of ``matrix`` divided by the sum of all its entries."""
    on_diagonal = np.trace(matrix)
    everything = np.sum(matrix)
    return on_diagonal * 1.0 / everything

In [13]:
def find_precision(matrix):
    """Per-class precision of a confusion matrix.

    Entry ``k`` of the returned list is the diagonal element ``matrix[k][k]``
    divided by the sum of column ``k``.
    """
    column_totals = np.sum(matrix, axis=0)
    return [matrix[k][k] * 1.0 / column_totals[k] for k in range(len(matrix))]

In [14]:
def find_recall(matrix):
    """Per-class recall of a confusion matrix.

    Entry ``k`` of the returned list is the diagonal element ``matrix[k][k]``
    divided by the sum of row ``k``.
    """
    row_totals = np.sum(matrix, axis=1)
    recalls = []
    for k in range(len(matrix)):
        recalls.append(matrix[k][k] * 1.0 / row_totals[k])
    return recalls

In [15]:
def find_fmeasure(prec,rec):
    """Per-class F-measure (harmonic mean of precision and recall).

    prec -- sequence of precision values
    rec  -- sequence of recall values, paired with ``prec`` by position

    Returns a list with ``2 * p * r / (p + r)`` for every pair.  When
    ``p + r`` is 0 the score is defined as 0.0 instead of dividing by zero.
    """
    scores = []
    for p, r in zip(prec, rec):
        denominator = p + r
        if denominator == 0:
            scores.append(0.0)
        else:
            scores.append(2.0 * (p * r) / denominator)
    return scores

In [16]:
def roc(clabels,acutal,predicted):
    """Return ``(precision, recall, confusion_matrix)`` for a set of predictions.

    The confusion matrix comes from ``find_confusion_matrix``; the per-class
    precision and recall lists are derived from it with ``find_precision`` and
    ``find_recall``.
    """
    matrix = find_confusion_matrix(clabels, acutal, predicted)
    per_class_precision = find_precision(matrix)
    per_class_recall = find_recall(matrix)
    return per_class_precision, per_class_recall, matrix

In [50]:
def cross_validation(data, labels,clabels, n_folds=10,MSE = False):
    cv = KFold(len(labels), n_folds,shuffle= True)
    accuracies = []
    precision_list = []
    confusion = []
    recall_list = []
    training_MSE_list =[]
    testing_MSE_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_theta = gradient_descent(data[train_ind], labels[train_ind],0.0001,0.0001,300)
        training_MSE = mean_squrae_error(prediction(train_theta,data[train_ind]), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = prediction(train_theta,data[test_ind])
        p,r,cm = roc(clabels,labels[test_ind],predict)
        precision_list.append(p)
        recall_list.append(r)
        confusion.append(cm)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
        accuracies.append(accuracy_score(labels[test_ind], predict))
        
    if MSE == True:
        for i in range(len(testing_MSE_list)):
            print 'Fold',i,'Testing Error',testing_MSE_list[i]
        print "Average Mean Square Error"
        print "Training Error \t Testing Error"
        print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    else:
        for i in range(len(accuracies)):
            print 'Fold',i,'Accuracy',accuracies[i]
        print "Average Accuracy ", np.mean(accuracies)
    return precision_list,recall_list,training_MSE_list,confusion

In [130]:
def softmax(theta,data):
    """Softmax class probabilities of every row of ``data``.

    theta -- dict mapping class label -> weight vector
    data  -- iterable of feature rows

    Returns a dict mapping each class label to a list with one single-element
    entry ``[probability]`` per row, in row order.  The dict is empty when
    ``data`` or ``theta`` is empty.
    """
    soft = {}
    if len(theta) == 0:
        return soft
    for row in data:
        scores = {}
        for label in theta:
            scores[label] = np.dot(theta[label].transpose(), row)
        # Subtracting the largest score leaves the probabilities unchanged
        # mathematically but keeps exp() from overflowing to inf (inf/inf=nan).
        shift = max(scores.values())
        exponentials = {}
        normaliser = 0
        for label in theta:
            exponentials[label] = np.exp(scores[label] - shift)
            normaliser = normaliser + exponentials[label]
        for label in theta:
            soft.setdefault(label, []).append([(exponentials[label] * 1.0) / normaliser])
    return soft

In [79]:
def indicator(y,cl):
    """One-hot membership flags of the labels ``y`` for every class in ``cl``.

    ``y`` is a sequence of rows whose first element is the label.

    Returns a dict mapping each class to a list with one entry per row of
    ``y``: ``[1]`` when the row's label equals the class, ``[0]`` otherwise.
    A class only gets a key once at least one row has been processed for it.
    """
    flags = {}
    for cls in cl:
        for row in y:
            flag = 1 if row[0] == cls else 0
            flags.setdefault(cls, []).append([flag])
    return flags

In [152]:
def gradient_descent_soft(x, y, theta_assume, learning_rate, interation_count,classes):
    """Fit one weight vector per class for softmax regression by gradient descent.

    x                -- numpy array of feature rows
    y                -- label rows (first element is the class label)
    theta_assume     -- initial value given to every weight of every class
    learning_rate    -- step size applied to the summed gradient
    interation_count -- number of update steps
    classes          -- iterable of class labels

    Returns a dict mapping class label -> weight vector.
    """
    thetas = {}
    for label in classes:
        # Every class gets its own array; nothing is shared between classes.
        initial = np.ones(len(x[0]))
        initial.fill(theta_assume)
        thetas[label] = initial
    indicators = indicator(y,classes)
    for _ in range(interation_count):
        # Probabilities are computed once per step from the current weights,
        # then every class is updated with its own error term.
        h = softmax(thetas,x)
        for label in h:
            error = np.array(h[label]) - np.array(indicators[label])
            step = learning_rate * (np.sum(error * x, axis=0))
            thetas[label] = thetas[label] - step
    return thetas

In [65]:
def predict_kclass(theta,data,cl):
    """Predict the class of every row of ``data`` from per-class weights.

    theta -- dict mapping class label -> weight vector
    data  -- iterable of feature rows
    cl    -- accepted for interface compatibility; not used

    Returns a list with one single-element entry ``[label]`` per row, where
    ``label`` is the class whose weights give the largest dot product.
    """
    def best_label(sample):
        scored = [(label, np.dot(theta[label].transpose(), sample)) for label in theta]
        winner = max(scored, key=lambda pair: pair[1])
        return winner[0]

    return [[best_label(sample)] for sample in data]

In [96]:
def cross_validation_kclass(data, labels,clabels, n_folds=10):
    cv = KFold(len(labels), n_folds,shuffle= True)
    precision_list = []
    confusion = []
    recall_list = []
    training_MSE_list =[]
    testing_MSE_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_theta = gradient_descent_soft(data[train_ind], labels[train_ind],0.0000001,0.0001,300,clabels)
        training_MSE = mean_squrae_error(predict_kclass(train_theta,data[train_ind],clabels), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = predict_kclass(train_theta,data[test_ind],clabels)
        p,r,cm = roc(clabels,labels[test_ind],predict)
        precision_list.append(p)
        recall_list.append(r)
        confusion.append(cm)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
    for i in range(len(testing_MSE_list)):
        print 'Fold',i,'Testing Error',testing_MSE_list[i]
    print "Average Mean Square Error"
    print "Training Error \t Testing Error"
    print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    return precision_list,recall_list,training_MSE_list,confusion

In [22]:
def evaluation(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    print "Confusion Matrix"
    print confmatrix
    accuracy = find_accuracy(confmatrix)
    print "Accuracy", accuracy
    precision = find_precision(confmatrix)
    print "Precision", precision
    recall = find_recall(confmatrix)
    print "Recall", recall
    f_score =find_fmeasure(precision,recall)
    print "F_score", f_score

In [23]:
# Z is a copy of the feature rows with a constant 1 prepended to every row;
# X keeps the original columns.  Both end up as numpy arrays.
Z = np.array(create_data(deepcopy(X)))
X = np.array(X)

# Class labels, per-class counts and class proportions of Y.
classes, class_count, alpha = find_alpha(Y)

# Rows of each feature matrix grouped by class.
X_split = segregate_data(X, Y, classes)
Z_split = segregate_data(Z, Y, classes)

In [24]:
# Fit the logistic model on X (the features without the constant-1 column):
# initial weights 0.01, learning rate 0.001, 300 iterations.  Then print the
# predicted and the true label of rows 11..20 side by side.
logistic_theta = gradient_descent(X,Y,0.01,0.001,300)
predictions = prediction(logistic_theta,X)
print "Predicted Value \t True Value "
for i in range(11,21):
    print predictions[i],"\t\t\t",Y[i]

Predicted Value 	 True Value 
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]
[0] 			[0]


In [137]:
# Show the fitted weights (one weight per column of X).
print logistic_theta

[-2.68534191 -1.64560428 -1.74869086 -0.82080437]


In [25]:
# Cross-validate the logistic model on X with the default 10 folds.
# NOTE(review): the third returned value is bound to `test_error`; confirm it
# holds the held-out-fold error and not the training error.
pre_roc, recall_roc,test_error,conf_mat = cross_validation(X, Y,classes)

Fold 0 Accuracy 0.927536231884
Fold 1 Accuracy 0.920289855072
Fold 2 Accuracy 0.905109489051
Fold 3 Accuracy 0.912408759124
Fold 4 Accuracy 0.963503649635
Fold 5 Accuracy 0.890510948905
Fold 6 Accuracy 0.941605839416
Fold 7 Accuracy 0.897810218978
Fold 8 Accuracy 0.941605839416
Fold 9 Accuracy 0.978102189781
Average Accuracy  0.927848302126


In [146]:
# Print the per-fold values returned by the cross-validation on X, then their mean.
# NOTE(review): the label printed is "Testing Error"; confirm `test_error`
# (third value returned by cross_validation) really holds test-fold errors.
for i in range(len(test_error)):
        print 'Fold',i,'Testing Error',test_error[i]
print "Average Mean Square Error"
print np.mean(test_error)

Fold 0 Testing Error [ 0.06401945]
Fold 1 Testing Error [ 0.05996759]
Fold 2 Testing Error [ 0.06072874]
Fold 3 Testing Error [ 0.06072874]
Fold 4 Testing Error [ 0.06477733]
Fold 5 Testing Error [ 0.05910931]
Fold 6 Testing Error [ 0.07287449]
Fold 7 Testing Error [ 0.06963563]
Fold 8 Testing Error [ 0.06558704]
Fold 9 Testing Error [ 0.06882591]
Average Mean Square Error
0.0646254240513


In [26]:
# Same fit, full-data predictions and 10-fold cross-validation as for X, but
# on Z, i.e. the features with the constant-1 column prepended.
logistic_thetaz = gradient_descent(Z,Y,0.01,0.001,300)
predictions_z = prediction(logistic_thetaz,Z)
pre_rocz, recall_rocz,test_errorz,conf_matz = cross_validation(Z, Y,classes)

Fold 0 Accuracy 0.985507246377
Fold 1 Accuracy 0.985507246377
Fold 2 Accuracy 0.985401459854
Fold 3 Accuracy 0.978102189781
Fold 4 Accuracy 0.992700729927
Fold 5 Accuracy 0.992700729927
Fold 6 Accuracy 0.992700729927
Fold 7 Accuracy 0.963503649635
Fold 8 Accuracy 0.978102189781
Fold 9 Accuracy 0.978102189781
Average Accuracy  0.983232836137


In [147]:
# Print the per-fold values returned by the cross-validation on Z, then their mean.
# NOTE(review): the label printed is "Testing Error"; confirm `test_errorz`
# (third value returned by cross_validation) really holds test-fold errors.
for i in range(len(test_errorz)):
        print 'Fold',i,'Testing Error',test_errorz[i]
print "Average Mean Square Error"
print np.mean(test_errorz)

Fold 0 Testing Error [ 0.01863857]
Fold 1 Testing Error [ 0.01134522]
Fold 2 Testing Error [ 0.0194332]
Fold 3 Testing Error [ 0.01214575]
Fold 4 Testing Error [ 0.02024291]
Fold 5 Testing Error [ 0.01214575]
Fold 6 Testing Error [ 0.02024291]
Fold 7 Testing Error [ 0.01214575]
Fold 8 Testing Error [ 0.01781377]
Fold 9 Testing Error [ 0.0194332]
Average Mean Square Error
0.0163587031411


In [27]:
# Confusion matrix and per-class metrics of the Z model, measured on the same
# rows it was fitted on (predictions_z comes from the full Z).
evaluation(classes,Y,predictions_z)

Confusion Matrix
[[752  10]
 [  3 607]]
Accuracy 0.990524781341
Precision [0.99602649006622512, 0.98379254457050247]
Recall [0.98687664041994749, 0.9950819672131147]
F_score [0.99143045484508896, 0.98940505297473513]


In [28]:
# Degree-2 polynomial feature expansion of X.
X_hd = map_high_dimension(X,2)

In [99]:
# Fit (initial weights 0.0001, learning rate 0.0001, 300 iterations), predict
# on the full data and run the 10-fold cross-validation on the polynomial
# features X_hd.
logistic_theta_poly = gradient_descent(X_hd,Y,0.0001,0.0001,300)
predictions_poly = prediction(logistic_theta_poly,X_hd)
pre_rocp, recall_rocp,test_errorp,conf_matp = cross_validation(X_hd, Y,classes)

Fold 0 Accuracy 0.992753623188
Fold 1 Accuracy 1.0
Fold 2 Accuracy 1.0
Fold 3 Accuracy 0.992700729927
Fold 4 Accuracy 0.992700729927
Fold 5 Accuracy 0.992700729927
Fold 6 Accuracy 1.0
Fold 7 Accuracy 1.0
Fold 8 Accuracy 0.992700729927
Fold 9 Accuracy 0.992700729927
Average Accuracy  0.995625727282


In [148]:
# Mean of the per-fold values returned by the cross-validation on X_hd.
# NOTE(review): confirm `test_errorp` (third value returned by
# cross_validation) holds test-fold errors before reading this as a test MSE.
print "Average Mean Square Error"
print np.mean(test_errorp)

Average Mean Square Error
0.0034012690372


In [32]:
# Confusion matrix and per-class metrics of the polynomial-feature model,
# measured on the same rows it was fitted on.
evaluation(classes,Y,predictions_poly)

Confusion Matrix
[[762   0]
 [  8 602]]
Accuracy 0.99416909621
Precision [0.98961038961038961, 1.0]
Recall [1.0, 0.9868852459016394]
F_score [0.99477806788511747, 0.99339933993399332]


In [119]:
# Load the iris data set: comma-separated feature values followed by the
# species name in the last field.  Only the first 150 lines are read.
filename = "iris.data.txt"
# A context manager closes the file handle as soon as the lines are read.
with io.open(filename, encoding='utf8') as data_file:
    r = data_file.readlines()
# Integer code used for each species name.
species_codes = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
A = []
B = []
for i in r[0:150]:
    x = i.split(',')
    # strip() makes the match independent of the line ending ("\n", "\r\n" or
    # none on the last line).
    species = x[-1].strip()
    if species not in species_codes:
        # Skip the whole record so that A and B always stay the same length.
        continue
    A.append([float(value) for value in x[0:len(x)-1]])
    B.append([species_codes[species]])
# ZA is a copy of the feature rows with a constant 1 prepended to every row.
ZA = deepcopy(A)
ZA = np.array(create_data(ZA))
A = np.array(A)
B = np.array(B)

In [120]:
# Class labels, per-class counts and class proportions of the iris labels B,
# plus the iris feature rows of A grouped by class.
classes_B,class_count_B,alpha_B = find_alpha(B)
multi_X_splitA =segregate_data(A,B,classes_B)

In [121]:
# Fit the softmax model on A: initial weights 0.0001, learning rate 0.0001,
# 300 iterations, one weight vector per class in classes_B.
soft_theta = gradient_descent_soft(A,B,0.0001,0.0001,300,classes_B)

In [122]:
# Predicted class of every row of A.
# NOTE(review): `xx` is not referenced by any later cell visible here.
xx = predict_kclass(soft_theta,A,classes_B)

In [123]:
# Confusion matrix and per-class metrics of the softmax model, measured on the
# same rows it was fitted on.
evaluation(classes_B,B,predict_kclass(soft_theta,A,classes_B))

Confusion Matrix
[[50  0  0]
 [ 0 38 12]
 [ 0  0 50]]
Accuracy 0.92
Precision [1.0, 1.0, 0.80645161290322576]
Recall [1.0, 0.76000000000000001, 1.0]
F_score [1.0, 0.86363636363636365, 0.89285714285714279]


In [133]:
# Cross-validate the softmax model on A (no constant-1 column), default 10 folds.
# NOTE(review): the third returned value is bound to `test_errork`; confirm it
# holds the held-out-fold error and not the training error.
pre_rock, recall_rock,test_errork,conf_matk = cross_validation_kclass(A, B,classes_B)

Fold 0 Testing Error [ 0.13333333]
Fold 1 Testing Error [ 0.]
Fold 2 Testing Error [ 0.13333333]
Fold 3 Testing Error [ 0.06666667]
Fold 4 Testing Error [ 0.06666667]
Fold 5 Testing Error [ 0.06666667]
Fold 6 Testing Error [ 0.2]
Fold 7 Testing Error [ 0.06666667]
Fold 8 Testing Error [ 0.06666667]
Fold 9 Testing Error [ 0.4]
Average Mean Square Error
Training Error 	 Testing Error
0.107407407407 	0.12


In [134]:
# Accuracy of every fold's confusion matrix from the run on A, then the mean
# over the folds.
acc = []
for i in conf_matk:
    acc.append(find_accuracy(i))
print "Mean Accuracy", np.mean(acc)

Mean Accuracy 0.88


In [131]:
# Cross-validate the softmax model on ZA (features with the constant-1 column
# prepended), default 10 folds.
# NOTE(review): confirm `test_errork1` (third returned value) holds the
# held-out-fold error and not the training error.
pre_rock1, recall_rock1,test_errork1,conf_matk1 = cross_validation_kclass(ZA, B,classes_B)

Fold 0 Testing Error [ 0.2]
Fold 1 Testing Error [ 0.]
Fold 2 Testing Error [ 0.13333333]
Fold 3 Testing Error [ 0.06666667]
Fold 4 Testing Error [ 0.13333333]
Fold 5 Testing Error [ 0.]
Fold 6 Testing Error [ 0.33333333]
Fold 7 Testing Error [ 0.]
Fold 8 Testing Error [ 0.13333333]
Fold 9 Testing Error [ 0.06666667]
Average Mean Square Error
Training Error 	 Testing Error
0.0962962962963 	0.106666666667


In [132]:
# Accuracy of every fold's confusion matrix from the run on ZA, then the mean
# over the folds.
accz = []
for i in conf_matk1:
    accz.append(find_accuracy(i))
print "Mean Accuracy", np.mean(accz)

Mean Accuracy 0.893333333333


In [149]:
# Show the confusion matrix of every fold of the run on ZA.
for i in conf_matk1:
    print i

[[3 0 0]
 [0 3 3]
 [0 0 6]]
[[4 0 0]
 [0 5 0]
 [0 0 6]]
[[9 0 0]
 [0 2 2]
 [0 0 2]]
[[6 0 0]
 [0 4 1]
 [0 0 4]]
[[3 0 0]
 [0 4 2]
 [0 0 6]]
[[4 0 0]
 [0 4 0]
 [0 0 7]]
[[3 0 0]
 [0 3 5]
 [0 0 4]]
[[7 0 0]
 [0 3 0]
 [0 0 5]]
[[4 0 0]
 [0 4 2]
 [0 0 5]]
[[7 0 0]
 [0 2 1]
 [0 0 5]]


In [151]:
# Element-wise sum of the per-fold confusion matrices (one matrix for all folds).
np.sum(conf_matk1,axis = 0)

array([[50,  0,  0],
       [ 0, 34, 16],
       [ 0,  0, 50]])