In [173]:
#Predicting age of Abalone (kind of snails) from rings
import pandas as pd
url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data")
abalone = pd.read_csv(url, header=None)


In [174]:
#Assigning names to columns
abalone.columns = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings" ]
print(abalone.head())
print('------------------------------------------------------------')
print(abalone.tail())

  Sex  Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell weight  Rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7  
------------------------------------------------------------
     Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
4172   F   0.565     0.450   0.165        0.8870          0.3700   
4173   M   0.590     0.440   0.135        0.9660          0.4390   
4174   M   0.600     0.475   0.205        1.1760          0.5255   
4175   F   0.625     0.485   0

In [175]:
import numpy as np
from scipy.stats import mode
import pandas as pd
from collections import Counter
import scipy.stats
def find_mode(arr):
    counter = Counter(arr)
    mode_value = counter.most_common(1)[0][0]
    return int(mode_value)

abalone = abalone.drop("Sex", axis=1)

trainingSet=abalone[0:2923]
trainingSetY = trainingSet["Rings"]
trainingSetX = trainingSet.drop("Rings", axis=1)
trainingSetX = trainingSetX.values


validationSet=abalone[2924:3549]
validationSetY = validationSet["Rings"]
validationSetY.reset_index(drop=True, inplace=True)
validationSetX = validationSet.drop("Rings", axis=1)
validationSetX = validationSetX.values


testSet=abalone[3549:4176]
TestingSetY = testSet["Rings"]
TestingSetY.reset_index(drop=True, inplace=True)
TestingSetX = testSet.drop("Rings", axis=1)
TestingSetX = TestingSetX.values



new_data_point = TestingSetX[0]
correct=0
wrong=0
k=81

for index,testSample in enumerate(TestingSetX):
    new_data_point=testSample
    
    distances = np.linalg.norm(trainingSetX - new_data_point, axis=1)
    nearest_neighbor_ids = distances.argsort()[:k]
    nearest_neighbor_rings = trainingSetY[nearest_neighbor_ids]
    
    prediction = find_mode(nearest_neighbor_rings)
    
    if(prediction==TestingSetY[index]):
        correct+=1
       
    else:
        wrong=wrong+1
        
accuracy=(correct/625)

print(accuracy)



    

0.3344


In [None]:
#Finding details about the dataset
abalone.info()
abalone.describe()

In [10]:
#Dropping sex column as it is non-numeric and irrelavant
abalone = abalone.drop("Sex", axis=1)

In [11]:
#Range of the Ring Column
abalone["Rings"].min()
abalone["Rings"].max()

29

In [12]:
#Making input matrix X and output vector y. Note: .values convert pandas dataframe to NumPy array
X = abalone.drop("Rings", axis=1)
X = X.values
y = abalone["Rings"]
y = y.values

In [13]:
#Making Test sample
import numpy as np
new_data_point = np.array([0.569552, 0.446407, 0.154437, 1.016849, 0.439051, 0.222526, 0.291208])

In [14]:
#Find Euclidean distance between test sample and all training samples
distances = np.linalg.norm(X - new_data_point, axis=1)
#print(distances)

In [15]:
#Sorting and getting top k neighbors. Please note that k is assumed to be sqrt(n)
k = 100
nearest_neighbor_ids = distances.argsort()[:k]
nearest_neighbor_ids

array([4045, 1902, 1644, 1132, 1894, 3915, 3668,  399, 1485,  849,  483,
         75,  489, 3776, 3667, 3981, 3675, 1136, 1792, 3859, 2990,  182,
       4088, 1671, 3673, 3670, 2992, 4161, 3821, 2289, 4160,  412, 2841,
       2032, 3611, 3491, 1365, 2304, 3041,  497, 3415, 3194, 3048, 2523,
       1486, 2839,  982, 2671, 1649,  311, 2925, 1655, 3288, 1002,  431,
       3417, 2922, 3282,  339, 1166, 1901,  993, 1369, 1355,  852, 1342,
       2921, 3340,  721, 2773, 1676, 3881, 2780,  577, 2395, 2192, 4173,
       2363, 1353, 2595,  425, 1668, 1495,  992, 3146, 2130, 2105, 2836,
       3735,  461, 2781, 2908, 4006,  183, 3499, 2919,   30, 1604, 2910,
       1922], dtype=int64)

In [16]:
#Getting Rings values of nearest nieghbors
nearest_neighbor_rings = y[nearest_neighbor_ids]
nearest_neighbor_rings

array([ 9, 11, 10,  9, 11, 11, 10, 11,  8, 10, 19, 15, 14, 13, 11,  8, 10,
        7,  9,  9,  9, 10,  9,  9, 10,  8,  8, 11, 10, 10, 11, 11,  9, 12,
       10,  8, 10, 11,  9, 19,  9, 12,  8,  9,  9, 10,  9,  8, 10, 14, 10,
        8, 15,  9, 20,  9, 11, 13, 15,  9, 10, 10, 10, 11, 10, 10,  9, 17,
       13, 11, 10, 10, 10, 10, 14, 14, 10, 18, 11, 11, 13, 10,  8,  8, 14,
        9, 10,  9,  9, 12, 10,  9,  9, 10,  9, 10, 10, 10, 11,  8],
      dtype=int64)

In [62]:
#Finding most frequent class in neighbors
import scipy.stats
prediction = scipy.stats.mode(nearest_neighbor_rings)
print(type(prediction))

<class 'scipy.stats._stats_py.ModeResult'>
