In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the labelled data from which the per-class mean vectors are computed.
# NOTE(review): absolute, machine-specific Windows path - the notebook cannot be
# re-run on another machine without editing this line; consider a configurable
# data directory.
dataset = pd.read_csv('C:/Users/andre/Desktop/AI/Research/clf-data.csv')

In [5]:
# Print column names, dtypes and non-null counts as a sanity check of the load.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
id       300 non-null int64
h        300 non-null float64
w        300 non-null float64
a        300 non-null float64
class    300 non-null int64
dtypes: float64(3), int64(2)
memory usage: 11.8 KB


In [6]:
# Total number of columns in the frame (second entry of its shape).
numColumns = dataset.shape[1]
numColumns

5

In [7]:
# Attribute (feature) count: every column except two. The later cells treat the
# first column (id) and the last column (class) as non-attribute columns.
numAttributes = numColumns - 2
numAttributes

3

In [8]:
# Number of rows (instances) in the frame (first entry of its shape).
numRows = dataset.shape[0]
numRows

300

In [9]:
# Number of distinct class labels; the label is read from the final column.
labelColumn = dataset.iloc[:, numColumns - 1]
numClasses = labelColumn.nunique()
numClasses

3

In [16]:
# Split the data by class label: one DataFrame per class, keyed by the label value.
# Each subset keeps only that class's rows, with the last and first columns removed.
firstColumn, lastColumn = dataset.columns[0], dataset.columns[-1]
active_classes = dataset.iloc[:, numColumns - 1].unique().tolist()
classDictionary = {}
for className in active_classes:
    belongsToClass = dataset.iloc[:, -1] == className
    classRows = dataset.loc[belongsToClass, :]
    classDictionary[className] = classRows.drop(lastColumn, axis=1).drop(firstColumn, axis=1)

classDictionary

{1:             h           w          a
 0   61.386299  156.581963  27.829029
 1   73.107843  152.266856  27.754194
 2   74.855714  142.023577  28.534912
 3   57.279102  139.932170  16.087353
 4   65.540263  155.856019  32.215404
 5   75.709641  140.901360  31.824431
 6   57.268319  152.429412  25.561117
 7   66.010025  145.025775  20.325381
 8   69.556493  160.570735  22.928238
 9   66.347068  143.554555  33.228277
 10  57.947815  131.828652  26.937796
 11  65.165198  149.871851  18.745625
 12  68.324181  141.248330  18.455567
 13  57.959878  156.916624  22.472236
 14  71.505264  152.512082  22.454778
 15  74.032866  174.999546  25.666502
 16  65.540105  147.385015  33.673906
 17  61.817281  138.855060  31.636271
 18  65.540371  139.251287  28.740126
 19  73.328549  152.914933  23.203535
 20  68.746320  136.019795  20.659755
 21  62.655386  163.841456  29.042262
 22  70.097841  144.251870  24.166927
 23  49.869806  148.929677  20.354046
 24  60.336925  163.798501  22.851522
 25  72.0

In [18]:
# Map every class label to the mean vector of its subset: each subset is turned
# into a NumPy array and averaged column-wise (axis 0).
meanDictionary = {
    label: np.mean(subset.to_numpy(), axis=0)
    for label, subset in classDictionary.items()
}

meanDictionary

{1: array([ 65.95367815, 149.41398927,  25.08399293]),
 2: array([ 62.42241549, 119.84386384,  24.5865996 ]),
 3: array([48.18092478, 80.31392899,  8.00126429])}

In [21]:
# Reading in csv test set
# NOTE(review): this is the same file path that was loaded into `dataset`, so the
# predictions are scored on the very data the class means were computed from.
testSet = pd.read_csv('C:/Users/andre/Desktop/AI/Research/clf-data.csv')

In [22]:
# Attribute matrix of the test set: every column except the first and the last,
# converted straight to a NumPy array.
attributes = testSet.iloc[:, 1:-1].to_numpy()
attributes

array([[ 61.3862987 , 156.5819631 ,  27.82902894],
       [ 73.10784274, 152.2668556 ,  27.75419412],
       [ 74.855714  , 142.0235767 ,  28.53491206],
       [ 57.27910161, 139.9321704 ,  16.08735256],
       [ 65.54026263, 155.8560188 ,  32.21540374],
       [ 75.70964122, 140.9013599 ,  31.82443146],
       [ 57.26831901, 152.4294117 ,  25.56111655],
       [ 66.01002537, 145.025775  ,  20.32538068],
       [ 69.55649329, 160.5707349 ,  22.92823779],
       [ 66.34706822, 143.5545551 ,  33.22827654],
       [ 57.94781467, 131.8286519 ,  26.9377956 ],
       [ 65.16519814, 149.871851  ,  18.7456253 ],
       [ 68.32418094, 141.2483302 ,  18.45556667],
       [ 57.95987779, 156.916624  ,  22.47223582],
       [ 71.50526433, 152.5120816 ,  22.45477794],
       [ 74.03286627, 174.9995455 ,  25.66650208],
       [ 65.54010477, 147.385015  ,  33.67390639],
       [ 61.81728115, 138.8550595 ,  31.63627129],
       [ 65.54037053, 139.2512871 ,  28.7401255 ],
       [ 73.32854865, 152.91493

In [23]:
def euclideanDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between ``vector1`` and ``vector2``."""
    # The distance is the norm of the element-wise difference.
    difference = vector1 - vector2
    return np.linalg.norm(difference)

In [24]:
def predictedClass(attributeArray, means=None):
    """Predict the class of one instance with a nearest-mean rule.

    Parameters
    ----------
    attributeArray : array
        Attribute vector of the instance to classify; it must support
        subtraction against the mean vectors.
    means : dict, optional
        Mapping of class label -> mean vector. Defaults to the notebook-level
        ``meanDictionary``, so existing one-argument calls behave as before.

    Returns
    -------
    The key whose mean vector has the smallest Euclidean distance to
    ``attributeArray``. On ties, the key that comes first in the mapping's
    iteration order wins.

    Raises
    ------
    ValueError
        If the mapping of class means is empty.
    """
    if means is None:
        means = meanDictionary
    if not means:
        raise ValueError("no class means available to classify against")

    # Each distance is evaluated exactly once per class. min() keeps the first
    # key among equal distances, which matches a strict "<" comparison scan.
    return min(means, key=lambda label: np.linalg.norm(attributeArray - means[label]))

In [25]:
# Run predictedClass on every instance (row) of the test attribute matrix and
# store the predictions in an array
y_pred = np.asarray([predictedClass(row) for row in attributes])
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [26]:
# Actual class labels of the test set: the last column, as a NumPy array
y_actual = testSet.iloc[:, -1].to_numpy()
y_actual

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], d

In [27]:
# Computing accuracy: the fraction of instances whose predicted label equals the
# actual label.
# Validate the inputs first: mismatched lengths would otherwise either raise an
# IndexError or silently ignore trailing labels, and an empty prediction array
# would end in a ZeroDivisionError.
if y_pred.shape != y_actual.shape:
    raise ValueError("y_pred and y_actual must have the same shape")
if y_pred.size == 0:
    raise ValueError("cannot compute accuracy of an empty prediction set")

# Element-wise comparison replaces the Python-level counting loop.
numCorrect = int(np.count_nonzero(y_pred == y_actual))

print("Accuracy is " + str(numCorrect/y_pred.size))

Accuracy is 0.9766666666666667
