In [1]:
import copy, math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import log,dot,exp,shape
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw table and separate the rows by their 'is_safe' label.
df = pd.read_csv('WaterQuality.csv')
safe = df.loc[df['is_safe'] == 1]
notsafe = df.loc[df['is_safe'] == 0]
# Balance the classes: draw as many not-safe rows as there are safe rows
# (fixed random_state so the draw is repeatable).
notsafe = notsafe.sample(n=len(safe), random_state=101)
df = pd.concat([safe, notsafe], axis=0)
# Min-max scale every column, then restore the label to an integer column.
# NOTE(review): the min/max are computed on the full balanced table, i.e.
# before the train/test split done in a later cell - confirm this is intended.
df = df.sub(df.min()).div(df.max() - df.min())
df['is_safe'] = df['is_safe'].astype(int)
df

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,0.329341,0.304931,0.038835,0.588843,0.053846,0.040416,0.922222,0.085427,0.033333,0.20,...,0.270,0.814177,0.391003,0.7,0.631905,0.850690,0.8,0.68,0.222222,1
1,0.463074,0.710164,0.009709,0.683884,0.015385,0.609700,0.755556,0.331658,0.600000,0.65,...,0.500,0.101772,0.667820,0.3,0.540007,0.402760,0.8,0.54,0.555556,1
3,0.271457,0.380409,0.038835,0.611570,0.007692,0.834873,0.033333,0.834171,0.720000,0.71,...,0.080,0.071392,0.446367,0.4,0.152662,0.215809,0.2,0.90,0.555556,1
4,0.183633,0.816505,0.029126,0.041322,0.046154,0.308314,0.766667,0.286432,0.406667,0.13,...,0.585,0.341266,0.384083,0.3,0.282893,0.302384,0.2,0.12,0.222222,1
5,0.187625,0.485743,0.029126,0.595041,0.023077,0.092379,0.477778,0.693467,0.073333,0.67,...,0.675,0.493671,0.653979,0.6,0.454804,0.680050,0.8,0.38,0.222222,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927,0.762475,0.575646,0.786408,0.770661,0.692308,0.282910,0.877778,0.864322,0.486667,0.81,...,0.775,0.533165,0.667820,0.1,0.787580,0.593476,0.7,0.98,1.000000,0
7386,0.005988,0.062731,0.009709,0.179752,0.384615,0.010393,0.066667,0.432161,0.073333,0.85,...,0.505,0.350380,0.377163,0.8,0.157516,0.062735,0.4,0.04,0.666667,0
7875,0.009980,0.003690,0.067961,0.214876,0.538462,0.087760,0.044444,0.015075,0.913333,0.00,...,0.805,0.148354,0.802768,0.9,0.044191,0.225847,0.4,0.10,0.111111,0
4500,0.009980,0.870178,0.058252,0.173554,0.461538,0.017321,0.033333,0.020101,0.193333,0.00,...,0.975,0.418734,0.204152,0.7,0.055239,0.393977,0.6,0.20,0.222222,0


In [3]:
np.random.seed(200)
# Shuffle once with a fixed seed, then cut 80/20 by row position.
# Positional .iloc slicing replaces np.split(df, ...): np.split on a DataFrame
# goes through DataFrame.swapaxes, which is deprecated in recent pandas and
# emits a FutureWarning. The resulting two frames are the same rows as before.
df_shuffled = df.sample(frac=1, random_state=42)
n_train = int(.8*len(df))
df_train, df_test = df_shuffled.iloc[:n_train], df_shuffled.iloc[n_train:]

In [4]:
# Split each frame into a feature matrix (every column except 'is_safe')
# and a target vector (the 'is_safe' column), both as NumPy arrays.
X_train = df_train.drop(columns=['is_safe']).values
X_test = df_test.drop(columns=['is_safe']).values
Y_train = df_train['is_safe'].values
Y_test = df_test['is_safe'].values

In [5]:
def manhattenDistance(X1, X2):
    """Return the Manhattan (L1) distance between two feature vectors.

    Parameters
    ----------
    X1, X2 : array-like of numbers
        The vectors to compare. Both are converted to float arrays and must
        have identical shapes.

    Returns
    -------
    float
        Sum of the absolute coordinate-wise differences (0.0 for empty input).

    Raises
    ------
    ValueError
        If the inputs have different shapes. The previous loop silently
        ignored the extra entries when X2 was longer than X1.
    """
    a = np.asarray(X1, dtype=float)
    b = np.asarray(X2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "manhattenDistance: shape mismatch %s vs %s" % (a.shape, b.shape)
        )
    # Vectorised; also avoids shadowing the built-in sum().
    return float(np.abs(a - b).sum())

In [23]:
def eucludienDistance(X1, X2):
    """Return the Euclidean (L2) distance between two feature vectors.

    Parameters
    ----------
    X1, X2 : array-like of numbers
        The vectors to compare. Both are converted to float arrays and must
        have identical shapes.

    Returns
    -------
    float
        Square root of the sum of squared coordinate-wise differences
        (0.0 for empty input).

    Raises
    ------
    ValueError
        If the inputs have different shapes. The previous loop silently
        ignored the extra entries when X2 was longer than X1.
    """
    a = np.asarray(X1, dtype=float)
    b = np.asarray(X2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "eucludienDistance: shape mismatch %s vs %s" % (a.shape, b.shape)
        )
    # Vectorised; also avoids shadowing the built-in sum().
    return math.sqrt(np.square(a - b).sum())

In [7]:
def infinityNorm(X1, X2):
    """Return the Chebyshev (L-infinity) distance between two feature vectors.

    Parameters
    ----------
    X1, X2 : array-like of numbers
        The vectors to compare. Both are converted to float arrays and must
        have identical shapes.

    Returns
    -------
    float
        Largest absolute coordinate-wise difference (0.0 for empty input).
        A NaN coordinate propagates to the result.

    Raises
    ------
    ValueError
        If the inputs have different shapes. The previous loop silently
        ignored the extra entries when X2 was longer than X1.
    """
    a = np.asarray(X1, dtype=float)
    b = np.asarray(X2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "infinityNorm: shape mismatch %s vs %s" % (a.shape, b.shape)
        )
    # initial=0.0 mirrors the old "start from 0" behaviour and keeps empty
    # input well-defined; also avoids shadowing the built-in max().
    return float(np.abs(a - b).max(initial=0.0))

In [14]:
def findNMins(nums, k = 1):
    """Return the first k items of nums after sorting it in ascending order.

    nums may be any iterable of mutually comparable items; it is copied, so
    the caller's object is left untouched.
    """
    ordered = list(nums)
    ordered.sort()
    return ordered[:k]

In [15]:
def mostFrequent(labels):
    """Return the label that occurs most often in labels.

    Parameters
    ----------
    labels : iterable of hashable
        Any iterable works (list, tuple, NumPy array, generator); the
        previous version required an object with a .count() method.

    Returns
    -------
    The most common label. When several labels share the top count, the one
    that appears first in labels wins, so the result no longer depends on
    set iteration order.

    Raises
    ------
    ValueError
        If labels is empty.
    """
    counts = Counter(labels)
    if not counts:
        raise ValueError("mostFrequent() requires at least one label")
    # most_common keeps first-seen order among equal counts.
    return counts.most_common(1)[0][0]

In [16]:
def predict(X, k, norm, train_X=None, train_Y=None):
    """Classify each row of X by majority vote among its k nearest training rows.

    Parameters
    ----------
    X : iterable of feature vectors
        The query points to classify.
    k : int
        Number of neighbours that vote; must satisfy 1 <= k <= len(train_X).
    norm : callable
        Distance function, called as norm(training_row, query_row).
    train_X, train_Y : sequences, optional
        Training features and their labels. When omitted, the notebook-level
        X_train / Y_train are used, which is what the original did.

    Returns
    -------
    list
        One predicted label per row of X, in order.

    Raises
    ------
    ValueError
        If train_X and train_Y differ in length, or k is out of range.

    Notes
    -----
    Neighbours are ranked by training index rather than stored in a dict
    keyed by distance. In the dict version, training rows at exactly the same
    distance overwrote one another, which dropped votes and raised IndexError
    once fewer than k distinct distances remained. Equal distances now keep
    training order, and a tied vote goes to the label of the nearest
    neighbour among the tied labels.
    """
    # Fall back to the notebook-level training split when none is supplied.
    if train_X is None:
        train_X = X_train
    if train_Y is None:
        train_Y = Y_train

    n_train = len(train_X)
    if len(train_Y) != n_train:
        raise ValueError(
            "predict: train_X has %d rows but train_Y has %d labels"
            % (n_train, len(train_Y))
        )
    if not 1 <= k <= n_train:
        raise ValueError(
            "predict: k must be between 1 and %d, got %r" % (n_train, k)
        )

    preds = []
    for query in X:
        distances = [norm(sample, query) for sample in train_X]
        # sorted() is stable, so rows at equal distance keep training order.
        nearest = sorted(range(n_train), key=distances.__getitem__)[:k]
        votes = Counter(train_Y[i] for i in nearest)
        preds.append(votes.most_common(1)[0][0])
    return preds

In [29]:
# 3-nearest-neighbour predictions on the test split using the Manhattan
# distance, scored as the fraction of labels predicted correctly.
preds = predict(X=X_test, k=3, norm=manhattenDistance)
acc = accuracy_score(y_true=Y_test, y_pred=preds)
acc

0.7780821917808219

In [30]:
# 3-nearest-neighbour predictions on the test split using the Euclidean
# distance, scored as the fraction of labels predicted correctly.
preds = predict(X=X_test, k=3, norm=eucludienDistance)
acc = accuracy_score(y_true=Y_test, y_pred=preds)
acc

0.7808219178082192

In [31]:
# 3-nearest-neighbour predictions on the test split using the infinity
# (maximum-coordinate) norm, scored as the fraction of labels predicted correctly.
preds = predict(X=X_test, k=3, norm=infinityNorm)
acc = accuracy_score(y_true=Y_test, y_pred=preds)
acc

0.7643835616438356