# Importing libraries


In [None]:
from math import sqrt, pi, exp
from random import seed, randrange
import pandas as pd
import numpy as np
from copy import deepcopy
import operator

# Reading and normalizing of the dataset

In [None]:
# Reading the dataset with pandas
# Both CSV paths are relative, so they are resolved against the current
# working directory; pd.read_csv raises FileNotFoundError if one is missing.
dataset = pd.read_csv('glass.csv')
dataset2 = pd.read_csv('Concrete_Data_Yeh.csv')

# Rebind `dataset` to the underlying NumPy array (it is sliced with
# `[:, i]` below). `dataset2` stays a DataFrame and is accessed via `.iloc`.
dataset = dataset.values

FileNotFoundError: ignored

In [None]:
# Min-max normalization
# Work on a copy so the raw `dataset` array keeps its original values.
normalized_dataset = np.copy(dataset)

# Rescale each of the first nine columns to the [0, 1] range.
for col in range(9):
    column = normalized_dataset[:, col]
    low, high = column.min(), column.max()
    normalized_dataset[:, col] = (column - low) / (high - low)

In [None]:
# Min-max normalization
# Copy first: a plain assignment would only alias `dataset2`, so scaling
# would silently overwrite the raw data as well.
normalized_dataset2 = dataset2.copy()

# Rescale the first eight columns to [0, 1]; the remaining columns are left
# as they are. Each column is cast to float before scaling and then replaced
# as a whole, so integer-typed columns can hold the fractional results.
for name in normalized_dataset2.columns[:8]:
    v = normalized_dataset2[name].astype(float)
    normalized_dataset2[name] = (v - v.min()) / (v.max() - v.min())

# Classification

In [None]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    The last element of each row is skipped (only the first
    ``len(row1) - 1`` positions are compared).
    """
    squared_sum = 0.0
    feature_count = len(row1) - 1
    for position in range(feature_count):
        delta = row1[position] - row2[position]
        squared_sum += delta ** 2
    return sqrt(squared_sum)

In [None]:
def make_weights(k, distances):
    """Return inverse-distance weights for the first ``k`` distances.

    The result is a float32 array of length ``k``. A distance of exactly
    zero is replaced by 0.000000001 so the division stays defined.
    """
    weights = np.zeros(k, dtype=np.float32)

    for pos in range(k):
        dist = distances[pos]
        # Substitute a tiny denominator for exact matches.
        denominator = 0.000000001 if dist == 0 else dist
        weights[pos] += 1 / denominator

    return weights

In [None]:
def get_weighted_pred(k, k_near_dists, data, ordering):
    """Return the label with the largest summed weight among the k neighbours.

    ``ordering[i]`` is the index into ``data`` of the i-th nearest row and
    ``k_near_dists[i]`` is its distance; the label is the row's last element.

    Illustrative example for k = 5:
        k_near_dists = [10, 20, 25, 40, 55]
        ordering     = [50, 1, 2, 4, 3]    -> data[50], data[1], data[2], ...
        labels       = [1, 2, 1, 3, 2]     -> data[50][-1], data[1][-1], ...
        weights      = [20, 10, 8, 5, 4]
    Labels are accumulated under their string form and the winner is
    converted back with ``float``.
    """
    wts = make_weights(k, k_near_dists)

    # Sum the weights per label (keyed by the label's string form).
    tally = {}
    for rank in range(k):
        label = str(data[ordering[rank]][-1])
        if label in tally:
            tally[label] = tally[label] + wts[rank]
        else:
            tally[label] = wts[rank]

    # Pick the first label whose total strictly exceeds every earlier one.
    best_label = -999999999
    best_total = -999999999
    for label, total in tally.items():
        if total > best_total:
            best_total, best_label = total, label

    return float(best_label)

In [None]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Find the ``num_neighbors`` training rows closest to ``test_row``.

    Returns a tuple ``(neighbors, weighted_prediction)``: the list of
    nearest training rows (closest first) and the result of
    ``get_weighted_pred`` for those rows.
    """
    # Distance from the test row to every training row.
    distances = np.zeros(len(train))
    for row_idx, train_row in enumerate(train):
        distances[row_idx] = euclidean_distance(test_row, train_row)

    ordering = distances.argsort()

    # Indices of the closest rows, nearest first.
    nearest = [ordering[rank] for rank in range(num_neighbors)]
    neighbors = [train[idx] for idx in nearest]
    k_near_dists = np.array([distances[idx] for idx in nearest], dtype=np.float64)

    weighted_prediction = get_weighted_pred(num_neighbors, k_near_dists, train, ordering)
    return neighbors, weighted_prediction

In [None]:
# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    """Classify ``test_row`` by majority vote and by weighted vote.

    Returns ``(prediction, weighted_prediction)``: the most frequent label
    among the nearest neighbours (the label is each row's last element) and
    the weighted prediction supplied by ``get_neighbors``.
    """
    nearest_rows, weighted_prediction = get_neighbors(train, test_row, num_neighbors)
    labels = [neighbor[-1] for neighbor in nearest_rows]
    # On ties, max() keeps the label that appears first in the list.
    majority_label = max(labels, key=lambda label: labels.count(label))

    return majority_label, weighted_prediction

In [None]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
    replacement; leftover rows are not assigned to any fold. The input
    itself is not modified.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / n_folds)

    dataset_split = []
    for _ in range(n_folds):
        # Draw rows one at a time, removing each from the pool as it is picked.
        fold = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        dataset_split.append(fold)

    return dataset_split

In [None]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``."""
    hits = sum(
        1 for pos, expected in enumerate(actual) if expected == predicted[pos]
    )
    return hits / len(actual) * 100

In [None]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with cross validation over ``n_folds`` folds.

    For each fold, the other folds form the training set and the fold
    itself, with every row's last element blanked out, forms the test set.
    ``algorithm(train_set, test_set, *args)`` must return a pair of
    prediction lists (plain, weighted).

    Returns ``(scores, weighted_scores)``: one accuracy percentage per fold
    for each of the two prediction lists.
    """
    folds = cross_validation_split(dataset, n_folds)

    scores, weighted_scores = [], []
    for held_out, fold in enumerate(folds):
        # Training rows: a deep copy of every fold except the held-out one.
        other_folds = deepcopy(folds)
        other_folds.pop(held_out)
        train_set = sum(other_folds, [])

        # Test rows: copies of the held-out rows with the label hidden.
        test_set = []
        for row in fold:
            masked_row = np.copy(row)
            masked_row[-1] = None
            test_set.append(masked_row)

        predicted, weighted_predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
        weighted_scores.append(accuracy_metric(actual, weighted_predicted))

    return scores, weighted_scores

In [None]:
# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    """Run kNN classification for every row in ``test``.

    Returns ``(predictions, weighted_predictions)``, two lists aligned with
    ``test`` holding the majority-vote and the weighted-vote label.
    """
    results = [predict_classification(train, row, num_neighbors) for row in test]
    predictions = [plain for plain, _ in results]
    weighted_predictions = [weighted for _, weighted in results]

    return predictions, weighted_predictions

As the k value increases, the accuracy of the individual folds — and the mean accuracy as well — decreases. Because there is not much data to train on, a larger k pulls in neighbours that carry little useful information, and as a result the model cannot predict the samples with good accuracy.

# REGRESSION AND WEIGHTED KNN/KNN

Gaussian weighting function for weighted kNN. It takes a distance and a sigma value chosen by the user.
Sigma changes the shape of the Gaussian curve, which affects the output of the algorithm depending on the given dataset.
With our dataset and weighted kNN, sigma had to be fairly high; otherwise the weights would come out close to zero.
Sigma is therefore set to 13.

In [None]:
def gaussian(dist, sigma=13):   # weighted knn
    """Return the Gaussian density at ``dist`` for mean 0 and std ``sigma``."""
    normaliser = 1. / (sqrt(2. * pi) * sigma)
    exponent = -dist**2 / (2 * sigma**2)
    return normaliser * exp(exponent)

The predict function handles the kNN part. Its inputs are xTest, xTrain, yTrain, k, and the boolean `weighted`, which indicates whether the data should be processed with weighted kNN or plain kNN.
First, the inputs are converted to pandas DataFrames if they are NumPy arrays, and an empty predictions list is created.
Second, for each test row the distance to every training row is computed inside the loop and appended to the distances list together with the corresponding yTrain value; the list is then sorted by distance.
Last, if `weighted` is true, a Gaussian weight is computed from each of the k smallest distances, each neighbour's target value is multiplied by its weight, and the sum divided by the total weight is appended as the prediction. If `weighted` is false, the k nearest target values are summed and the sum divided by k, the number of nearest neighbours, is appended as the prediction. The output is the list of predictions.

In [None]:
def predict(xTest, xTrain, yTrain, k, weighted):   #weighted knn/ normal knn
    """Predict a target value for every row of ``xTest`` with kNN regression.

    Parameters:
        xTest: test features (NumPy array or DataFrame), one row per sample.
        xTrain: training features (NumPy array or DataFrame).
        yTrain: training targets, indexable by position and aligned with xTrain.
        k: number of nearest neighbours to use.
        weighted: if true, average the neighbours' targets with Gaussian
            weights of their distances; otherwise use the plain mean.

    Returns:
        A list with one prediction per test row.

    Raises:
        IndexError: if ``k`` exceeds the number of training samples.
    """
    if isinstance(xTest, np.ndarray):
        xTest = pd.DataFrame(xTest)

    if isinstance(xTrain, np.ndarray):
        xTrain = pd.DataFrame(xTrain)

    predictions = []

    for test_idx in range(len(xTest)):
        # Extract the test row once instead of once per training row.
        test_row = xTest.iloc[test_idx, :]

        # (target, distance) for every training sample, nearest first.
        distances = [
            (yTrain[train_idx], distance(xTrain.iloc[train_idx, :], test_row))
            for train_idx in range(len(yTrain))
        ]
        distances.sort(key=operator.itemgetter(1))

        # Index explicitly so that k > len(yTrain) still raises IndexError.
        nearest = [distances[rank] for rank in range(k)]

        if weighted:
            weighted_sum = 0
            total_weight = 0
            for target, dist in nearest:
                weight = gaussian(dist)
                weighted_sum += target * weight
                total_weight += weight
            if total_weight != 0:
                predictions.append(weighted_sum / total_weight)
                continue
            # Every weight underflowed to zero (distances far larger than
            # sigma): fall back to the plain mean instead of dividing by zero.

        target_sum = 0
        for target, _ in nearest:
            target_sum += target
        predictions.append(target_sum / k)

    return predictions

Distance function used by kNN; it returns the Euclidean distance between two given vectors.

In [None]:
def distance(pa, pb):
    """Return the Euclidean distance between two equally long vectors.

    Both inputs are converted with ``np.asarray`` so elements are always read
    by position. Indexing a pandas Series with ``series[i]`` is label-based
    and breaks when the labels are not 0..n-1.

    Raises:
        IndexError: if ``pb`` has fewer elements than ``pa``.
    """
    first = np.asarray(pa)
    second = np.asarray(pb)

    squared_sum = 0
    for pos in range(len(first)):
        squared_sum = squared_sum + np.square(first[pos] - second[pos])
    return squared_sum**0.5

The 5-fold cross-validation split is handled here.
The function takes the given dataset, splits it into 5 folds, and returns the folds as a list of arrays.

In [None]:
def crossValSplit(dataset, n_folds=5):
    """Randomly split a DataFrame into ``n_folds`` equally sized folds.

    Parameters:
        dataset: pandas DataFrame to split; it is not modified.
        n_folds: number of folds to create (default 5).

    Returns:
        A list of ``n_folds`` NumPy arrays, each holding
        ``int(len(dataset) / n_folds)`` rows drawn without replacement.
        Leftover rows are not assigned to any fold.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1')

    foldSize = int(dataset.shape[0] / n_folds)
    print(len(dataset))

    # Track the row positions still available rather than dropping rows from
    # the frame: the same rows are drawn for a given random state, no frame
    # is copied per draw, and duplicate index labels are handled correctly.
    remaining = list(range(dataset.shape[0]))

    split = []
    for _ in range(n_folds):
        fold = []
        while len(fold) < foldSize:
            position = remaining.pop(randrange(len(remaining)))
            fold.append(dataset.iloc[position].values.tolist())
        split.append(np.asarray(fold))
    return split

Calculate mean absolute error(mae) with the given actual array and the predicted array that is given by predict function of the folds.

In [None]:
# Calculate mean absolute error
def MAE(actual, predicted):
    """Return the mean absolute error between ``actual`` and ``predicted``.

    The lengths of both inputs are printed before the error is computed.
    """
    sample_count = len(actual)
    print('len(actual)' + str(sample_count))
    print('len(predicted)' + str(len(predicted)))

    total_error = 0
    for pos in range(sample_count):
        total_error += abs(actual[pos] - predicted[pos])

    return total_error / sample_count

Main function that handles the kNN regression.
It takes the given dataset, the number of folds, the k value of kNN, and a boolean that selects weighted or unweighted kNN.
First, the folds are produced by the crossValSplit function from the given dataset, and an empty scores list is created.
Second, inside the loop the folds are split into xTrain, yTrain, xTest and yTest using the folds array and the cv array: folds[i][:,0:8] corresponds to xTest, folds[i][:,8] to yTest, cv[:,0:8] to xTrain and cv[:,8] to yTrain. The value 8 comes from the dataset, which has 9 columns.
These arrays are sent to the predict function together with the boolean that selects weighted or unweighted kNN.
Last, the output of predict is passed to the MAE function to measure the error of the predicted values; the MAE of each fold is appended to the scores list, which is returned as the result.

In [None]:
def kFCVEvaluate(dataset, n_folds, k, weighted):
    '''
    Description:
        Driver function for k-Fold cross validation of kNN regression.

    Parameters:
        dataset: data passed to crossValSplit; columns 0-7 of each fold are
            used as features and column 8 as the target.
        n_folds: accepted for interface compatibility; it is not forwarded,
            and the number of folds evaluated is the number of folds
            crossValSplit returns.
        k: number of nearest neighbours used by predict.
        weighted: truthy for Gaussian-weighted kNN, falsy for plain kNN.

    Returns:
        A list with the mean absolute error of each fold.
    '''
    folds = crossValSplit(dataset)

    scores = []

    for i, test_fold in enumerate(folds):
        # Training data: every fold except the held-out one, stacked row-wise.
        train_folds = [fold for j, fold in enumerate(folds) if j != i]
        cv = np.concatenate(train_folds, axis=0)

        # Predict the held-out fold from the remaining folds.
        predicted = predict(test_fold[:, 0:8], cv[:, 0:8], cv[:, 8], k, bool(weighted))

        # Mean absolute error of this fold (the message labels it "accuracy").
        acc = MAE(test_fold[:, 8], predicted)
        scores.append(acc)
        print('fold'  + str(i) + 'done, accuracy:' + str(acc) )
    return scores

# Main

In [None]:
seed(1)
# evaluate algorithm
n_folds = 5

# ------------- classification -------------
# First pass uses the raw array, second pass the min-max normalized copy.
flag = False
for data in (dataset, normalized_dataset):
    print("##########################################################")
    print("Dataset with feature normalization" if flag
          else "Dataset without feature normalization")
    flag = True

    for num_neighbors in (1, 3, 5, 7, 9):
        scores, weighted_scores = evaluate_algorithm(
            data, k_nearest_neighbors, n_folds, num_neighbors
        )
        print("----------------------------------------------------------")
        print(num_neighbors, "neighbor(s)")
        print('Fold scores: %s' % scores)
        mean_score = sum(scores) / float(len(scores))
        print('Mean accuracy: %.3f%%' % mean_score)
        print('Fold (weighted) scores: %s' % weighted_scores)
        mean_weighted_score = sum(weighted_scores) / float(len(weighted_scores))
        print('Mean accuracy: %.3f%%' % mean_weighted_score)


# --------------- regression ---------------
for num_neighbors in (1, 3, 5, 7, 9):
    # plain knn + regression
    scores = kFCVEvaluate(normalized_dataset2, n_folds, num_neighbors, False)
    print(scores)
    print(num_neighbors, "neighbor(s)")
    print('Mean Absolute Error over 5 folds: %.3f%%' % (sum(scores)/float(len(scores))))

# ----------- weighted regression -----------
for num_neighbors in (1, 3, 5, 7, 9):
    # weighted knn + regression
    scores2 = kFCVEvaluate(normalized_dataset2, n_folds, num_neighbors, True)
    print(scores2)
    print(num_neighbors, "neighbor(s)")
    print('Mean Absolute Error over 5 folds: %.3f%%' % (sum(scores2)/float(len(scores2))))

||Fold 1|Fold 2|Fold 3|Fold 4|Fold 5|Fold avg|
|------|------|------|------|------|------|--------|
|1nn non-normalized classification |83.333  |57.142  | 64.285 | 76.190 | 80.952 | 72.381   |
|3nn non-normalized classification|78.571  |80.952  | 66.666 | 66.666 | 64.285 | 71.429  |
|5nn non-normalized classification|69.047  |78.571  | 69.047 | 64.285 | 59.523 | 68.095   |
|7nn non-normalized classification|64.285  |57.142  | 64.285 | 69.047 | 69.047 | 64.762   |
|9nn non-normalized classification|69.047  |54.761  | 52.380 | 64.285 | 66.666 | 61.429   |
|1nn normalized classification |61.904  |78.571  | 50.0 | 69.047 | 78.571 | 67.619   |
|3nn normalized classification|76.190  |59.523  | 78.571 | 57.142 | 61.904 | 66.667   |
|5nn normalized classification|61.904  |61.904  | 64.285 | 73.809 | 64.285 | 65.238   |
|7nn normalized classification|69.047  |57.142  | 80.952 | 61.904 | 73.809 | 68.571   |
|9nn normalized classification|69.047  |61.904  | 52.380 | 73.809 | 69.047 | 65.238   |
|w-1nn non-normalized classification |83.333  |57.142  | 64.285 | 76.190 | 80.952 | 72.381|
|w-3nn non-normalized classification|78.571  |80.952  | 69.047 | 66.666 | 66.666 | 72.381  |
|w-5nn non-normalized classification|69.047  |78.571  | 69.047 | 66.666 | 61.904 | 69.048  |
|w-7nn non-normalized classification|64.285  |64.285  | 64.285 | 66.666 | 71.428 | 66.190  |
|w-9nn non-normalized classification|73.809  |59.523  | 59.523 | 71.428 | 69.047 | 66.667  |
|w-1nn normalized classification |61.904  |78.571  | 50.0 | 69.047 | 78.571 | 67.619|
|w-3nn normalized classification|76.190  |59.523  | 78.571 | 57.142 | 61.904 | 66.667  |
|w-5nn normalized classification|66.666  |64.285  | 66.666 | 73.809 | 66.666 | 67.619  |
|w-7nn normalized classification|73.809  |59.523  | 80.952 | 66.666 | 71.428 | 70.476  |
|w-9nn normalized classification|71.428  |66.666  | 59.523 | 78.571 | 71.428 | 69.524  |
|1nn non-normalized  regression|6.189  |6.334  | 7.249 | 6.506 | 6.467 | 6.549   |
|3nn non-normalized regression|6.117  |6.330  | 7.005 | 6.574 | 6.850 | 6.576   |
|5nn non-normalized regression|5.824  |7.213  | 7.348 | 7.285 | 6.9308 | 6.921   |
|7nn non-normalized regression|6.055  |7.710  | 7.455 | 7.490 | 7.404 | 7.223   |
|9nn non-normalized regression|6.418  |7.719  | 7.514 | 7.827 | 7.739 | 7.444   |
|1nn normalized  regression|6.189  |6.334  | 7.249 | 6.506 | 6.467 | 6.549   |
|3nn normalized regression|6.117  |6.330  | 7.005 | 6.5741 | 6.850 | 6.576   |
|5nn normalized regression|5.824  |7.213  | 7.348 | 7.285 | 6.930 | 6.921   |
|7nn normalized regression|6.055  |7.710  | 7.455 | 7.490 | 7.404 | 7.223   |
|9nn normalized regression|6.418  |7.719  | 7.514 | 7.827 | 7.739 | 7.444   |
|w-1nn non-normalized  regression|6.534  |5.367  | 7.097 | 6.224 | 5.956 | 6.236   |
|w-3nn non-normalized regression|6.827  |5.769  | 6.058 | 5.981 | 5.773 | 6.082   |
|w-5nn non-normalized regression|5.751  |5.589  | 6.735 | 5.601 | 5.664 | 5.869   |
|w-7nn non-normalized regression|5.772  |5.719  | 6.736 | 5.657 | 5.702 | 5.918   |
|w-9nn non-normalized regression|5.759  |5.754  | 6.700 | 5.680 | 5.693 | 5.918   |
|w-1nn normalized  regression|6.189  |6.334  | 7.249 | 6.506 | 6.467 | 6.549   |
|w-3nn normalized regression|6.117  |6.330  | 7.005 | 6.574 | 6.850 | 6.576   |
|w-5nn normalized regression|5.824  |7.213  | 7.348 | 7.285 | 6.930 | 6.921   |
|w-7nn normalized regression|6.055  |7.710  | 7.455 | 7.490 | 7.404 | 7.223   |
|w-9nn normalized regression|6.418  |7.719  | 7.514 | 7.827 | 7.739 | 7.444   |

Followings are the results of the regression part with normalized/not normalized datasets, weighted/normal knn used and the k of nn used from 1 to 9.

First, I noticed the importance of the sigma value in weighted kNN, since the results were NaN if it was too low. I take this to mean that the Gaussian was too sharply peaked, so the weights became zero, which led to a multiply/divide-by-zero and produced the NaN output. I experimented with sigma values from 1 to 33 to see the difference and finally used 13, which is also the value used for the reported results.

Second, I saw that the mean absolute error increased with the normalized dataset, which may be related to overfitting, to not enough information being captured, or to residual noise. This shows that the data must be investigated more deeply to see whether anything can be done.

Last, increasing the number of nearest neighbours given to the program improved the accuracy on the folds of the non-normalized dataset, but the opposite occurred on the folds of the normalized dataset. This shows the effect of the weights and of the number of nearest neighbours for the given dataset. For this part, since the accuracy with smaller k was higher, we can say that the k value should be lower for normalized weighted kNN regression and higher for non-normalized weighted kNN regression.

For the unweighted kNN regression, normalization did not have an effect on the results in our tests. This might be a sign that the change seen in weighted regression on the normalized dataset is due to the change in the weights. Other than that, the results show that a lower k value works better for kNN with this program, so the k of kNN should be kept low.

Test set length is 206 as can be seen from the followings.
Test set length for classification is 42.

1NN-REGRESS: [6.18907766990291, 6.334223300970872, 7.24941747572815, 6.506747572815539, 6.467135922330095]
Mean Absolute Error over 5 folds: 6.549%  len(actual)206
len(predicted)206
3NN-REGRESS: [6.11742718446602, 6.330922330097083, 7.005388349514564, 6.5741100323624595, 6.850792880258901]
Mean Absolute Error over 5 folds: 6.576%
5NN-REGRESS:  [5.824592233009711, 7.213553398058253, 7.348611650485441, 7.2851747572815535, 6.930873786407765]
Mean Absolute Error over 5 folds: 6.921%
7NN-REGRESS: [6.055048543689317, 7.7104854368932, 7.455755894590844, 7.490277392510404, 7.404861303744797]
Mean Absolute Error over 5 folds: 7.223%
9NN-REGRESS:  [6.418322545846825, 7.719471413160735, 7.51416396979504, 7.827713052858686, 7.739611650485441]
Mean Absolute Error over 5 folds: 7.444%

1NN-REGRESS-NORMALIZED: [6.18907766990291, 6.334223300970872, 7.24941747572815, 6.506747572815539, 6.467135922330095]
Mean Absolute Error over 5 folds: 6.549%
3NN-REGRESS-NORMALIZED: [6.11742718446602, 6.330922330097083, 7.005388349514564, 6.5741100323624595, 6.850792880258901]
Mean Absolute Error over 5 folds: 6.576%
5NN-REGRESS-NORMALIZED: [5.824592233009711, 7.213553398058253, 7.348611650485441, 7.2851747572815535, 6.930873786407765]
Mean Absolute Error over 5 folds: 6.921%
7NN-REGRESS-NORMALIZED: [6.055048543689317, 7.7104854368932, 7.455755894590844, 7.490277392510404, 7.404861303744797]
Mean Absolute Error over 5 folds: 7.223%
9NN-REGRESS-NORMALIZED: [6.418322545846825, 7.719471413160735, 7.51416396979504, 7.827713052858686, 7.739611650485441]
Mean Absolute Error over 5 folds: 7.444%

W-1NN-REGRESS-SIGMA13: [6.534223300970872, 5.367718446601936, 7.097378640776697, 6.224320388349516, 5.956310679611649]
Mean Absolute Error over 5 folds: 6.236%  TESTLENlen(actual)206len(predicted)206
W-3NN-REGRESS-SIGMA13: [6.827666203320355, 5.769247399492046, 6.058359499635622, 5.981974632804876, 5.773720404917418]
Mean Absolute Error over 5 folds: 6.082%  TESTLENlen(actual)206len(predicted)206
W-5NN-REGRESS-SIGMA13: [5.751891957484146, 5.589408043621229, 6.735197398163901, 5.6018279820702706, 5.66446546796987]
Mean Absolute Error over 5 folds: 5.869%  TESTLENlen(actual)206len(predicted)206
W-7NN-REGRESS-SIGMA13: [5.772153735840744, 5.719477453333139, 6.73689808643413, 5.657603921706346, 5.702888511421789]
Mean Absolute Error over 5 folds: 5.918%  TESTLENlen(actual)206len(predicted)206
W-9NN-REGRESS-SIGMA13: [5.759090328991352, 5.754029501069371, 6.700971001729926, 5.68084465515428, 5.693954077208931]
Mean Absolute Error over 5 folds: 5.918%

W-1NN-REGRESS-SIGMA13-NORMALIZED: [6.18907766990291, 6.334223300970872, 7.24941747572815, 6.506747572815537, 6.467135922330095]
Mean Absolute Error over 5 folds: 6.549%  TESTLENlen(actual)206len(predicted)206
W-3NN-REGRESS-SIGMA13-NORMALIZED: [6.117400215376787, 6.330900400859135, 7.005365428185682, 6.574047101539171, 6.850742843943034]
Mean Absolute Error over 5 folds: 6.576%  TESTLENlen(actual)206len(predicted)206
W-5NN-REGRESS-SIGMA13-NORMALIZED: [5.824583538628261, 7.213478652955402, 7.348613554367653, 7.28506761641721, 6.930854265889408]
Mean Absolute Error over 5 folds: 6.921%  TESTLENlen(actual)206len(predicted)206
W-7NN-REGRESS-SIGMA13-NORMALIZED: [6.055019074556912, 7.710397711022147, 7.455749072424086, 7.490199244768782, 7.404802044829571]
Mean Absolute Error over 5 folds: 7.223%  TESTLENlen(actual)206len(predicted)206
W-9NN-REGRESS-SIGMA13-NORMALIZED: [6.418255722902209, 7.719439535702919, 7.514160622915275, 7.827615862623473, 7.739534914184686]
Mean Absolute Error over 5 folds: 7.444%  TESTLENlen(actual)206len(predicted)206