# Part1 code and knn implementations

In [26]:
import random
import math 
import numpy as np
import time
import csv
import sys
from sklearn import svm

def readIrisCsv():
    """Load ``iris.csv`` and return its labels and features in shuffled order.

    Every non-blank row is expected to hold the numeric features first and
    the class label in the last column. The rows are shuffled with the
    module-level ``random`` generator so that consecutive slices of the
    result can serve as folds for k-fold cross-validation.

    Returns:
        tuple: ``(dataset_labels, dataset_features)``. ``dataset_labels`` is
        a list of label strings and ``dataset_features`` is a list of float
        lists, in matching order.

    Raises:
        OSError: if ``iris.csv`` cannot be opened.
        ValueError: if a feature value cannot be converted to ``float``.
    """
    # The context manager closes the file even when parsing fails;
    # newline='' is what the csv module asks for when given a file object.
    with open('iris.csv', newline='') as csv_file:
        # Blank lines come back from csv.reader as empty lists; skip them
        # instead of failing later when the label is extracted.
        dataset = [row for row in csv.reader(csv_file) if row]
    random.shuffle(dataset)  # shuffle once so folds are random samples

    dataset_labels = []
    dataset_features = []
    for row in dataset:
        *values, label = row  # last column is the class label
        dataset_labels.append(label)
        # csv yields strings, the classifiers need numbers
        dataset_features.append([float(value) for value in values])
    return dataset_labels, dataset_features

def readLeafCsv():
    """Load ``leaf.csv`` and return its labels and features in shuffled order.

    Every non-blank row is expected to start with the class label, followed
    by the specimen number (discarded, it carries no information about the
    class) and then the numeric features. The rows are shuffled with the
    module-level ``random`` generator so that consecutive slices of the
    result can serve as folds for k-fold cross-validation.

    Returns:
        tuple: ``(dataset_labels, dataset_features)``. ``dataset_labels`` is
        a list of label strings and ``dataset_features`` is a list of float
        lists, in matching order.

    Raises:
        OSError: if ``leaf.csv`` cannot be opened.
        ValueError: if a non-blank row has fewer than two columns or a
            feature value cannot be converted to ``float``.
    """
    # The context manager closes the file even when parsing fails;
    # newline='' is what the csv module asks for when given a file object.
    with open('leaf.csv', newline='') as csv_file:
        # Blank lines come back from csv.reader as empty lists; skip them
        # instead of failing later when the label is extracted.
        dataset = [row for row in csv.reader(csv_file) if row]
    random.shuffle(dataset)  # shuffle once so folds are random samples

    dataset_labels = []
    dataset_features = []
    for row in dataset:
        # column 0: class label, column 1: specimen number (dropped)
        label, _specimen_number, *values = row
        dataset_labels.append(label)
        # csv yields strings, the classifiers need numbers
        dataset_features.append([float(value) for value in values])
    return dataset_labels, dataset_features

def split(labels, features, n, k):
    """Split a dataset into train and test parts for fold ``n`` of ``k``.

    Sample ``i`` belongs to the test part of fold ``n`` when
    ``len(labels) * (n - 1) / k <= i < len(labels) * n / k``; every other
    sample goes to the training part. The relative order of the samples is
    preserved in both parts.

    Args:
        labels: sequence of class labels.
        features: sequence of feature vectors, aligned with ``labels``.
        n: 1-based number of the fold used as test data (``1 <= n <= k``).
        k: total number of folds; must be positive.

    Returns:
        tuple: ``(train_labels, train_features, test_labels, test_features)``.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive number of folds")

    total = len(labels)
    train_labels = []
    train_features = []
    test_labels = []
    test_features = []

    for i, label in enumerate(labels):
        # Same condition as above, multiplied through by k: comparing
        # integers avoids the rounding errors of float fold boundaries,
        # which could otherwise put a boundary sample in the wrong fold.
        if total * (n - 1) <= i * k < total * n:
            test_labels.append(label)
            test_features.append(features[i])
        else:
            train_labels.append(label)
            train_features.append(features[i])

    return train_labels, train_features, test_labels, test_features
    
def euclideanDistance(start, end):
    """Return the Euclidean distance between two points.

    Args:
        start: a list or tuple of coordinates, or a single number.
        end: a list or tuple of coordinates, or a single number.

    Returns:
        ``abs(start - end)`` when either argument is not a list/tuple (the
        one-dimensional case); ``-99999`` when the two points have different
        numbers of coordinates; otherwise the square root of the summed
        squared coordinate differences, as a float.

    Note:
        The ``-99999`` sentinel is smaller than any real distance, so a
        caller that sorts by distance must make sure all points have the
        same dimension.
    """
    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses as coordinate vectors.
    if not isinstance(start, (list, tuple)) or not isinstance(end, (list, tuple)):
        return abs(start - end)
    if len(start) != len(end):  # points must live in the same space
        return -99999

    squared_sum = 0.0
    for a, b in zip(start, end):
        squared_sum += (a - b) ** 2
    return math.sqrt(squared_sum)

def doVotes(votes):
    """Return the class that occurs most often among the given votes.

    Args:
        votes: non-empty sequence of ``[distance, class]`` pairs; only the
            class (index 1) of each pair is used. Classes must be hashable.

    Returns:
        The class with the highest number of votes. When several classes
        tie, the one that appears first in ``votes`` wins.

    Raises:
        IndexError: if ``votes`` is empty.
    """
    if not votes:
        raise IndexError("doVotes() needs at least one vote")

    # Tally every class. dicts keep insertion order, so iterating the tally
    # visits classes in order of their first appearance in ``votes``.
    tally = {}
    for vote in votes:
        label = vote[1]
        tally[label] = tally.get(label, 0) + 1

    # max() returns the first maximal element, which gives the tie-break
    # described above.
    return max(tally, key=tally.get)
            

def knn(labels, features, predict, method, k):
    """Classify one sample with k-nearest-neighbours over a training set.

    Args:
        labels: class labels of the training samples.
        features: feature vectors of the training samples, aligned with
            ``labels``.
        predict: feature vector of the sample to classify.
        method: distance metric, ``'euclidean'`` or ``'manhattan'``.
        k: number of nearest training samples that take part in the vote.

    Returns:
        The class chosen by ``doVotes`` among the ``k`` nearest samples.

    Raises:
        ValueError: if ``method`` is not a supported metric name.
    """
    # Resolve the metric once instead of re-testing ``method`` per sample.
    # An unknown name used to produce entries without a distance, which
    # were then sorted by their label; fail loudly instead.
    if method == 'euclidean':
        distance = euclideanDistance
    elif method == 'manhattan':
        distance = manhattanDistance
    else:
        raise ValueError(
            "Unknown distance method %r (expected 'euclidean' or 'manhattan')"
            % (method,))

    # One [distance, class] pair per training sample.
    distances_labels = [[distance(feature, predict), label]
                        for feature, label in zip(features, labels)]

    # sorted() is stable, so equally distant samples keep training order.
    sorted_distances_labels = sorted(distances_labels, key=lambda pair: float(pair[0]))
    votes = sorted_distances_labels[:k]  # the k nearest neighbours
    return doVotes(votes)

def findLabels(labels):
    """Return the distinct labels in order of first appearance.

    The result fixes the row/column order of the confusion matrices.

    Args:
        labels: iterable of class labels (compared with ``==``).

    Returns:
        list: every distinct label exactly once, in the order in which it
        first occurs in ``labels``.
    """
    classes = []
    for label in labels:
        # Plain membership test instead of list.index() wrapped in a bare
        # ``except``, which would also swallow unrelated errors.
        if label not in classes:
            classes.append(label)
    return classes

    
def testDataKnn(train_labels, train_features, test_labels, test_features,
                distance_method, classes, k=5):
    """Classify the test samples with kNN and return the accuracy.

    Args:
        train_labels: class labels of the training samples.
        train_features: feature vectors of the training samples.
        test_labels: true class labels of the test samples.
        test_features: feature vectors of the test samples.
        distance_method: metric name passed on to ``knn``.
        classes: list of all class labels; fixes the confusion-matrix order.
        k: number of neighbours used by ``knn`` (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        float: correct predictions divided by all predictions.

    Raises:
        ZeroDivisionError: if there are no test samples.
        ValueError: if a predicted or true label is missing from ``classes``.
    """
    size = len(classes)
    # confusion_matrix[predicted class index][true class index]
    confusion_matrix = [[0] * size for _ in range(size)]
    for actual, sample in zip(test_labels, test_features):
        prediction = knn(train_labels, train_features, sample, distance_method, k)
        confusion_matrix[classes.index(prediction)][classes.index(actual)] += 1

    # Correct predictions sit on the diagonal (predicted == true).
    correct_predictions = sum(confusion_matrix[i][i] for i in range(size))
    total_predictions = sum(sum(row) for row in confusion_matrix)
    return correct_predictions / total_predictions

def hw1(part, dataset, k):
    """Run one homework part with k-fold cross-validation and print results.

    Args:
        part: 1 (kNN, Euclidean), 2 (kNN, Manhattan), 3 or 4 (SVM). Parts 3
            and 4 both call ``testDataSvm``; the kernel that is used depends
            on the ``svmTrain`` definition in effect when this runs.
        dataset: ``"iris"`` or ``"leaf"``.
        k: number of cross-validation folds (at least 1). The number of kNN
            neighbours is not controlled by this argument.

    Returns:
        ``-1`` (after printing a message) when an argument is invalid,
        otherwise ``None``. The per-fold accuracies, their average, the
        elapsed time and the sample count are printed, not returned.
    """
    start = time.time()  # start time

    # Validate every argument before touching the file system, so a bad
    # call fails fast instead of after reading and shuffling the data.
    if dataset not in ("iris", "leaf"):
        print("Please choose between iris and leaf")
        return -1
    if part not in (1, 2, 3, 4):
        print("This homework only has 4 parts")
        return -1
    if k < 1:
        # With no folds there is nothing to average.
        print("Please choose k of at least 1")
        return -1

    if dataset == "iris":
        labels, features = readIrisCsv()  # read iris.csv
    else:
        labels, features = readLeafCsv()  # read leaf.csv

    classes = findLabels(labels)
    accuracies = []
    for fold in range(1, k + 1):  # each fold is the test set exactly once
        train_labels, train_features, test_labels, test_features = split(labels, features, fold, k)
        if part == 1:
            result = testDataKnn(train_labels, train_features, test_labels, test_features, "euclidean", classes)
        elif part == 2:
            result = testDataKnn(train_labels, train_features, test_labels, test_features, "manhattan", classes)
        else:  # part 3 or 4
            result = testDataSvm(train_labels, train_features, test_labels, test_features, classes)
        accuracies.append(result)

    accuracy = sum(accuracies) / len(accuracies)  # average over the folds
    end = time.time()  # end time

    titles = {
        1: ".csv with euclidean distance:",
        2: ".csv with manhattan distance:",
        3: ".csv with Svm Linear:",
        4: ".csv with Svm Polynomial:",
    }
    print(dataset, titles[part])

    print("accuracies:",accuracies)
    print("Average accuracy:",accuracy,
          " Performance time:",end-start,"s","Total samples:",len(labels))

# Part 1: kNN with Euclidean distance on both datasets, 5 folds each.
for dataset_name in ("leaf", "iris"):
    hw1(1, dataset_name, 5)

leaf .csv with euclidean distance:
accuracies: [0.5588235294117647, 0.5441176470588235, 0.5, 0.5735294117647058, 0.6029411764705882]
Average accuracy: 0.5558823529411765  Performance time: 1.781332015991211 s Total samples: 340
iris .csv with euclidean distance:
accuracies: [0.9666666666666667, 1.0, 0.9666666666666667, 0.9666666666666667, 0.9]
Average accuracy: 0.9600000000000002  Performance time: 0.20313239097595215 s Total samples: 150


As we can see, although leaf.csv has many more samples, its accuracy is much lower than that of iris.csv. The cause is that iris has only 3 classes, so we can train on it much more effectively than on leaf. Every class in iris has at least 20-25 samples, but in leaf every class has only 7-8 samples. The difference between the performance times is also very large. The reason is that leaf has 14 dimensions while iris has 4 (and leaf has more than twice as many samples), so calculating Euclidean distances for the leaf dataset takes much more time than for the iris dataset.

# Part2: the missing manhattanDistance implementation is here

In [2]:
def manhattanDistance(start, end):
    """Return the Manhattan (city-block) distance between two points.

    Args:
        start: a list or tuple of coordinates, or a single number.
        end: a list or tuple of coordinates, or a single number.

    Returns:
        ``abs(start - end)`` when either argument is not a list/tuple (the
        one-dimensional case); ``-99999`` when the two points have different
        numbers of coordinates; otherwise the sum of the absolute coordinate
        differences.

    Note:
        The ``-99999`` sentinel is smaller than any real distance, so a
        caller that sorts by distance must make sure all points have the
        same dimension.
    """
    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses as coordinate vectors.
    if not isinstance(start, (list, tuple)) or not isinstance(end, (list, tuple)):
        return abs(start - end)
    if len(start) != len(end):  # points must live in the same space
        return -99999

    result = 0
    for a, b in zip(start, end):
        result += abs(a - b)
    return result

# Part 2: kNN with Manhattan distance on both datasets, 5 folds each.
for dataset_name in ("leaf", "iris"):
    hw1(2, dataset_name, 5)

leaf .csv with manhattan distance:
accuracies: [0.5588235294117647, 0.5882352941176471, 0.6176470588235294, 0.7205882352941176, 0.6176470588235294]
Average accuracy: 0.6205882352941177  Performance time: 1.1496827602386475 s Total samples: 340
iris .csv with manhattan distance:
accuracies: [0.8666666666666667, 1.0, 0.9666666666666667, 1.0, 1.0]
Average accuracy: 0.9666666666666668  Performance time: 0.17537522315979004 s Total samples: 150


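As we can see, Manhattan distance is faster than Euclidean distance. The reason is that Euclidean distance squares each difference and takes the square root of the sum, whereas Manhattan distance only sums the absolute
differences. We might expect to lose some accuracy because Manhattan distance is not as complex as Euclidean, but any such loss is not large (in the runs shown above, Manhattan was actually slightly more accurate on both datasets; results vary between runs because the data is shuffled randomly). If we want a prediction very fast, we can use Manhattan distance rather than Euclidean.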

# Part3 SVM code implementation for the linear kernel

In [7]:
def svmTrain(train_labels, train_features, classes, kernel='linear', C=100, degree=3):
    """Fit a support-vector classifier on the training data.

    Args:
        train_labels: class labels of the training samples.
        train_features: feature vectors of the training samples.
        classes: list of all class labels; each label is encoded as its
            index in this list, so predictions are indices into ``classes``.
        kernel: kernel name passed to ``svm.SVC`` (default ``'linear'``).
        C: regularization parameter passed to ``svm.SVC`` (default 100).
        degree: polynomial degree passed to ``svm.SVC``; 3 is the value SVC
            uses when the argument is omitted.

    Returns:
        The fitted ``svm.SVC`` classifier.

    Raises:
        ValueError: if a training label is missing from ``classes``.
    """
    X = np.array(train_features)
    # All classes mapped to 0, 1, 2, ... by their position in ``classes``.
    y = [classes.index(label) for label in train_labels]
    clf = svm.SVC(kernel=kernel, C=C, degree=degree)
    clf.fit(X, y)
    return clf

def testDataSvm(train_labels, train_features, test_labels, test_features, classes):
    """Train an SVM via ``svmTrain`` and return its accuracy on the test data.

    A confusion matrix indexed as ``[predicted class index][true class
    index]`` is filled from the classifier's predictions; the accuracy is
    the sum of its diagonal divided by the sum of all its entries.
    """
    class_count = len(classes)
    confusion_matrix = [[0 for _ in range(class_count)] for _ in range(class_count)]

    model = svmTrain(train_labels, train_features, classes)
    predictions = model.predict(test_features)

    # Predictions are already class indices; true labels still need mapping.
    for position, actual in enumerate(test_labels):
        predicted_index = predictions[position]
        actual_index = classes.index(actual)
        confusion_matrix[predicted_index][actual_index] += 1

    hits = 0
    seen = 0
    for row_index, row in enumerate(confusion_matrix):
        hits += row[row_index]  # diagonal entry: predicted == true
        seen += sum(row)        # every prediction recorded in this row

    return hits / seen


# Part 3: SVM evaluation on both datasets, 5 folds each.
for dataset_name in ("leaf", "iris"):
    hw1(3, dataset_name, 5)

leaf .csv with Svm Linear:
accuracies: [0.6617647058823529, 0.6470588235294118, 0.75, 0.8088235294117647, 0.6764705882352942]
Average accuracy: 0.7088235294117646  Performance time: 0.1405041217803955 s Total samples: 340
iris .csv with Svm Linear:
accuracies: [1.0, 1.0, 0.9333333333333333, 0.9333333333333333, 1.0]
Average accuracy: 0.9733333333333334  Performance time: 0.01562643051147461 s Total samples: 150


As we can see, the linear SVM works better than the kNN implementations in both performance time and accuracy. I varied C from 0.1 to 1000.
As C gets bigger, the accuracy on leaf.csv increases, but the accuracy on iris.csv decreases. C = 100 is a good parameter for these datasets.

I couldn't figure out ROC curves with multiple classes, but I did k-fold cross-validation and used confusion matrices anyway.

# Part4 with polynomial svm

In [27]:
def svmTrain(train_labels, train_features, classes, kernel='poly', C=100, degree=3):
    """Fit a support-vector classifier (polynomial kernel by default).

    Args:
        train_labels: class labels of the training samples.
        train_features: feature vectors of the training samples.
        classes: list of all class labels; each label is encoded as its
            index in this list, so predictions are indices into ``classes``.
        kernel: kernel name passed to ``svm.SVC`` (default ``'poly'``).
        C: regularization parameter passed to ``svm.SVC`` (default 100).
        degree: polynomial degree passed to ``svm.SVC`` (default 3).

    Returns:
        The fitted ``svm.SVC`` classifier.

    Raises:
        ValueError: if a training label is missing from ``classes``.
    """
    X = np.array(train_features)
    # All classes mapped to 0, 1, 2, ... by their position in ``classes``.
    y = [classes.index(label) for label in train_labels]
    clf = svm.SVC(kernel=kernel, C=C, degree=degree)
    clf.fit(X, y)
    return clf

# Part 4: SVM evaluation (polynomial svmTrain above) on both datasets, 5 folds each.
for dataset_name in ("leaf", "iris"):
    hw1(4, dataset_name, 5)

leaf .csv with Svm Polynomial:
accuracies: [0.5882352941176471, 0.6470588235294118, 0.6029411764705882, 0.6470588235294118, 0.4852941176470588]
Average accuracy: 0.5941176470588235  Performance time: 0.14404535293579102 s Total samples: 340
iris .csv with Svm Polynomial:
accuracies: [0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9]
Average accuracy: 0.9533333333333334  Performance time: 0.4468722343444824 s Total samples: 150


As we can see, a polynomial kernel of degree 3 works worse than the linear SVM on our datasets.