In [None]:
# K Nearest Neighbor Classification Algorithm from Scratch using Abalone case study

In [1]:
# load csv file
from csv import reader
def load_csv(filename):
    dataset=list()
    open_file=open(filename)
    read_file=reader(open_file)
    for row in read_file:
        if not row:
            continue
        dataset.append(row)
    return dataset

In [2]:
# Converting String column values to float values
def convert_str_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column])

In [3]:
# Convert categorical value into string integer value
def convert_str_to_int(dataset,column):
    class_value=[row[column] for row in dataset]
    unique=set(class_value)
    unique_value=dict()
    for i,value in enumerate(unique):
        unique_value[value]=i
    for row in dataset:
        row[column]=unique_value[row[column]]
    return unique_value

In [4]:
# Calculating min and max for each column
def value_minmax(dataset):
    minmax=list()
    for i in range(len(dataset[0])):
        column_values=[row[i] for row in dataset]
        min_value=min(column_values)
        max_value=max(column_values)
        minmax.append([min_value,max_value])
    return minmax

In [5]:
# Calculate Normalize Scale for each value
def normalize_scale(dataset,minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i]=(row[i]-minmax[i][0])/(minmax[i][1]-minmax[i][0])

In [6]:
# Evaluate model accuracy by split dataset into kfold cross validation
from random import randrange
def KFold(dataset,folds):
    fold_values=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/folds)
    for _ in range(folds):
        fold=list()
        while len(fold)<folds:
            index=randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        fold_values.append(fold)
    return fold_values

In [7]:
# Evaluate accuracy of model by using accuracy metrics
def classification_accuracy(actual,predicted):
    correct=0
    for i in range(len(actual)):
        if actual[i]==predicted[i]:
            correct+=1
    return correct/float(len(actual))*100.0

In [8]:
# Evaluate accuray of model by using evaluate technique
def evaluate_model(dataset,algorithm,folds,*args):
    folds=KFold(dataset,folds)
    predictions=list()
    for fold in folds:
        train_set=list(folds)
        train_set.remove(fold)
        train_set=sum(train_set,[])
        test_set=list()
        for row in fold:
            row_copy=list(row)
            row_copy[-1]=None
            test_set.append(row_copy)
        predicted=algorithm(train_set,test_set,*args)
        actual=[row[-1] for row in fold]
        accuracy=classification_accuracy(actual,predicted)
        predictions.append(accuracy)
    return predictions

In [9]:
# Building K-Nearest Neigbhor classification algorithm for Abalone case study
def k_nearest_neighbor(train,test,neighbors):
    predictions=list()
    for row in test:
       # print("row",row)
        predict=predicted_classification(train,row,neighbors)
       # print("predict",predict)
        predictions.append(predict)
       # print(predictions)
    return predictions

In [10]:
# Making predictions with neighbors
def predicted_classification(train,test_row,neighbors):
    # locate most similar neighbors
    neighbors=get_neighbors(train,test_row,neighbors)
   # print("neighbors",neighbors)
    output_values=[row[-1] for row in neighbors]
   # print("output_values",output_values)
    predictions=max(set(output_values),key=output_values.count)
   # print("predictions",predictions)
    return predictions

In [11]:
# Locate most similar neighbors
def get_neighbors(train,test_row,neighbors):
    distance=list()
    for train_row in train:
    #    print("train_row",train_row)
        # Calculating euclidean distance for 2 vectors
        dist=euclidean_distance(test_row,train_row)
     #   print("dist",dist)
        # Storing each row distance and row value in distance list variable
        distance.append((train_row,dist))
    #    print("distance",distance)
    # Sort distance list as a tuple
    distance.sort(key=lambda tup : tup[1])
    no_neighbors=list()
    for i in range(neighbors):
     #   print("i",i)
        no_neighbors.append(distance[i][0])
     #   print("no_neighbors",no_neighbors)
    return no_neighbors

In [12]:
# Calculating euclidean distance for 2 vectors
from math import sqrt
def euclidean_distance(train_row,test_row):
    distance=0.0
    for i in range(len(train_row)-1):
      #  print("i",i)
     #   print("range(len(train_row)-1)",range(len(train_row)-1))
        distance+=(train_row[i]-test_row[i])**2
     #   print("euclidean_distance",distance)
    return sqrt(distance)

In [13]:
# K Nearest Neighbor algorithm
from random import seed
seed(1)
filename='abalone.csv'
dataset=load_csv(filename)
convert_str_to_int(dataset,0)
for i in range(len(dataset[0])):
    convert_str_to_float(dataset,i)
minmax=value_minmax(dataset)
normalize_scale(dataset,minmax)
folds=5
neighbors=5
accuracy=evaluate_model(dataset,k_nearest_neighbor,folds,neighbors)
print("accuracy",accuracy)

('accuracy', [20.0, 40.0, 20.0, 20.0, 20.0])


In [14]:
#mean accuracy
print("mean accuracy",sum(accuracy)/float(len(accuracy)))

('mean accuracy', 24.0)
