In [13]:
import math
import pandas as pd
import random

In [14]:
def calc_euclidean_distance(row1, row2):
    if (len(row1) != len(row2)):
        raise Exception("Something is wrong with your data! Both rows are not the same size")
    distance = 0.0
    for i in range(len(row1)-1):
        # assume that last row is value we want to predict
        # use for loop so that we can scale for higher dimensions as well
        distance = distance + (row1[i] - row2[i]) ** 2
    return math.sqrt(distance)

In [15]:
def get_most_similar_neighbors(train, test, k):
    distances = []
    for row in train:
        curr_dist = calc_euclidean_distance(row, test)
        distances.append((row, curr_dist))
    distances.sort(key = lambda x : x[1]) #sorting by distance
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors

In [16]:
def make_prediction(train, test, k):
    neighbors = get_most_similar_neighbors(train, test, k)
    last_row = [curr[len(curr)-1] for curr in neighbors]
    model_prediction = max(set(last_row), key=last_row.count)
    return model_prediction

In [17]:
# Preprocessing
def clean_data(dataset):
    dataset['door'] = dataset['door'].replace(['5more'], '5')
    dataset['persons'] = dataset['persons'].replace(['more'], str(random.randint(0,5)))
    
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
        #print('[{}] ==> {}'.format(value, i)) PRINT MAPPINGS 
        #print(lookup[value])
    for row in dataset:
        row[column] = int(lookup[row[column]])
    return lookup

def str_column_to_float(dataset, column):
    for row in dataset:
        if (type(row[column]) == str):
            row[column] = int(row[column])

In [22]:
def KNN():
    ## NOTE TO SELF REPLACE ALL TRAINS WITH TESTS
    dataset = pd.read_csv('car.csv')
    train_dataset = pd.read_csv('car_train.csv')
    
    clean_data(dataset)
    clean_data(train_dataset)
    
    dataset = dataset.values.tolist()
    train_dataset = train_dataset.values.tolist()
    
    #print(dataset)
    #print(type(dataset))
    str_column_to_int(dataset, 0)
    str_column_to_int(dataset, 1)
    str_column_to_int(dataset, 3)
    str_column_to_int(dataset, 4)
    str_column_to_int(dataset, 5)
    str_column_to_int(dataset, len(dataset[0])-1)
    
    str_column_to_int(train_dataset, 0)
    str_column_to_int(train_dataset, 1)
    str_column_to_int(train_dataset, 3)
    str_column_to_int(train_dataset, 4)
    str_column_to_int(train_dataset, 5)
    str_column_to_int(train_dataset, len(dataset[0])-1)
    
    for i in range(len(dataset[0])):
        str_column_to_float(dataset, i)
        str_column_to_float(train_dataset, i)
        
    #print(dataset)
    K = 11
    x = 1
    #print(train_dataset[1])
    
    """
    [vgood] ==> 0
    [unacc] ==> 1
    [acc] ==> 2
    [good] ==> 3
    mappings for predictions
    """
    
    predictions = ["vgood", "unacc", "acc", "good"]
    right = 0
    wrong = 0
    for x in range(len(train_dataset)):
        curr_prediction = make_prediction(dataset, train_dataset[x], K)
        print("Prediction: ", predictions[curr_prediction], " , Actual: ", predictions[train_dataset[x][6]])
        if (predictions[curr_prediction] == predictions[train_dataset[x][6]]):
            right += 1
        else:
            wrong += 1
        x += 1
    print(right / (right + wrong))

KNN()

Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  acc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  ,

Prediction:  acc  , Actual:  good
Prediction:  acc  , Actual:  vgood
Prediction:  unacc  , Actual:  unacc
Prediction:  acc  , Actual:  good
Prediction:  acc  , Actual:  vgood
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  acc  , Actual:  good
Prediction:  unacc  , Actual:  unacc
Prediction:  acc  , Actual:  good
Prediction:  acc  , Actual:  vgood
Prediction:  unacc  , Actual:  unacc
Prediction:  acc  , Actual:  good
Prediction:  acc  , Actual:  vgood
Prediction:  unacc  , Actual:  unacc
Prediction:  unacc  , Actual:  acc
Prediction:  acc  , Actual:  good
Prediction:  unacc  , Actual:  unacc
Prediction:  acc  , Actual:  go

In [None]:
def test():
#     dataset = [[2.7810836,2.550537003,0],
#         [1.465489372,2.362125076,0],
#         [3.396561688,4.400293529,0],
#         [1.38807019,1.850220317,0],
#         [3.06407232,3.005305973,0],
#         [7.627531214,2.759262235,1],
#         [5.332441248,2.088626775,1],
#         [6.922596716,1.77106367,1],
#         [8.675418651,-0.242068655,1],
#         [7.673756466,3.508563011,1]]    

#     Test Function for euclidean dist
#     row0 = dataset[0]
#     for row in dataset:
#         distance = calc_euclidean_distance(row0, row)
#         print(distance)

#     Test Function for getting similar neighbors
#     neighbors = get_most_similar_neighbors(dataset, dataset[0], 3)
#     for neighbor in neighbors:
#         print(neighbor)

    #Test Function for making prediction 
    prediction = make_prediction(dataset, dataset[0], 3)
    print('Expected {} and Got {}.'.format(dataset[0][-1], prediction))

test()

In [45]:
abc = "jef"
if (type(abc) == str):
    print("2")

2


In [None]:
4 5
2 3
1 2
