In [1]:
#IMPORT INDEPENDENCIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
#SETTING THE COLUMN NAMES BECAUSE IT WAS NOT IN DATASET
columns = ['Sex', 'Length', 'Diameter', 'Height','Whole weight','Shucked weight', 'Viscera weight', 'Shell weight','Rings']

In [15]:
#LOADING THE DATA
dataset = pd.read_csv('/content/drive/MyDrive/Datasets/Datasets/abalone.csv', names= columns)


In [16]:
dataset.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [17]:
#CREATING A NEW FEATURE 'AGE' THAT WE WANT TO PREDICT

dataset['age'] = dataset['Rings'] + 1
dataset = dataset.drop(['Rings'], axis=1)

In [18]:
dataset.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8


In [19]:
#TURNING THE SEX COLUMN TO NUMERICAL VALUES
dataset.Sex.unique()
dataset.Sex =dataset.Sex.map({"M":2, "F":1, "I":3})

In [20]:
#ASSIGNING DEPENDENT AND INDEPENDENT VARIABLES

X = dataset.iloc[:,:-1].values
y = dataset.iloc[::,-1].values

In [21]:
#CHECKING UNIQUE VALUES IN OUR TARGET VARIABLE
dataset.age.unique()

array([16,  8, 10, 11,  9, 21, 17, 20, 15, 12, 13, 19, 14,  6,  5,  7, 22,
       18, 23,  2,  4, 27, 24, 30,  3, 28, 26, 25])

In [22]:
#SPLIT THE DATASET
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.10, random_state=0)

In [30]:
#LOADING THE DATA
#PLAYING WITH HYPERPARAMETRS THROUGH GRIDSEARCHCV TO GET BEST FIT
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor()
parameters = {
    'n_neighbors': [ 7, 8, 9, 10,15,20 ],
    'metric': ['euclidean', 'minkowski',]
}


grid_search = GridSearchCV(estimator=model, param_grid= parameters, cv=5)
grid_search.fit(X_train, y_train)


In [31]:
#SHOWING THE BEST PARAMETERS
print(grid_search.best_params_)

{'metric': 'euclidean', 'n_neighbors': 15}


In [32]:
y_pred = grid_search.predict(X_test)


In [33]:
grid_search.score(X_test,y_test)

0.5722707921126915

In [36]:
#CHECKING THE MODEL'S ACCURACY

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
accuracy = r2_score(y_test, y_pred)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)


print("R-squared score: {:.2f}".format(accuracy))
print("Mean squared error: {:.2f}".format(mse))
print("Mean absolute error: {:.2f}".format(mae))

R-squared score: 0.57
Mean squared error: 4.64
Mean absolute error: 1.47
