In [52]:
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [53]:
# Load the abalone measurements from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='abalone.csv')
# Preview the first ten rows as a quick sanity check of the columns.
df.head(n=10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [54]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [55]:
# Replace categorical columns with indicator columns named
# "<column>_<level>"; drop_first omits one level per encoded column.
X = pd.get_dummies(
    X,
    prefix_sep='_',
    drop_first=True,
)
X.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1,0


In [56]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so rows that later serve as test data contribute to the fitted min/max.
# Consider fitting on the training portion only (e.g. inside a Pipeline).
scaler = MinMaxScaler().fit(X)
# X becomes the scaler's array output from here on (no longer the DataFrame).
X = scaler.transform(X)

In [57]:
# Holdout split in file order: shuffle=False keeps the rows unshuffled, with
# 25% of them held out for testing. No random_state is passed because it has
# no effect when shuffling is disabled, and passing one wrongly suggests the
# split is randomised.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=False
)

### Naive Bayes using holdout, test size 0.25

In [58]:
# Gaussian Naive Bayes with default settings, trained and scored on the
# ordered holdout split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_prednb = gnb.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_prednb)

0.10334928229665072

### KNN using holdout, test size 0.25

In [59]:
# k-nearest-neighbours classifier with default settings, trained and scored
# on the ordered holdout split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_predknn = knn.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_predknn)

0.23923444976076555

### Decision tree using gini, holdout, test size as 0.25

In [60]:
# Decision tree (split criterion left at its default) with a fixed seed so
# repeated runs build the same tree; fit() returns the fitted estimator.
dtclf = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Predict the held-out rows; accuracy is computed in the next cell.
y_pred = dtclf.predict(X_test)

In [61]:
# Fraction of held-out rows the decision tree labelled correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.20478468899521532

In [62]:
# Random subsampling: 25% of the rows are held out for testing, with a fixed
# seed so the same partition is produced on every run.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### Using Naive Bayes, random subsampling, test size as 0.25

In [63]:
# Fresh Gaussian Naive Bayes model, trained and scored on the random 75/25
# split.
gnb = GaussianNB()
gnb.fit(X_train1, y_train1)
y_prednb1 = gnb.predict(X_test1)
metrics.accuracy_score(y_true=y_test1, y_pred=y_prednb1)

0.10622009569377991

### Using KNN, random subsampling, test size= 0.25

In [64]:
# Build a fresh classifier for this experiment instead of re-fitting the
# instance left over from the holdout section; this matches the sibling
# Naive Bayes / decision-tree cells and keeps the cell independent of
# earlier estimator state.
knn = KNeighborsClassifier()
knn.fit(X_train1, y_train1)
y_predknn1 = knn.predict(X_test1)
# Accuracy on the randomly held-out 25% of rows.
metrics.accuracy_score(y_test1, y_predknn1)

0.24210526315789474

### Using Decision tree gini and random subsampling, test size as 0.25

In [65]:
# Seeded decision tree trained on the random 75/25 split; fit() returns the
# fitted estimator, so construction and training are chained.
dtclf1 = DecisionTreeClassifier(random_state=1).fit(X_train1, y_train1)
# Predictions for the held-out rows; scored in the next cell.
y_pred1 = dtclf1.predict(X_test1)

In [66]:
# Decision-tree accuracy on the randomly held-out rows.
metrics.accuracy_score(y_true=y_test1, y_pred=y_pred1)

0.18660287081339713

### Using Naive Bayes and 4-fold cross validation (each fold holds out 0.25 of the data)

In [67]:
# Out-of-fold predictions from 4-fold cross-validation: every row is
# predicted by a model that did not see it during training.
# NOTE(review): per the scikit-learn docs an integer cv with a classifier
# selects an unshuffled stratified splitter rather than plain KFold --
# confirm that is the intended protocol.
y_prednb2 = cross_val_predict(estimator=gnb, X=X, y=y, cv=4)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_prednb2)

0.11491501077328226

### Using KNN and 4-fold cross validation (each fold holds out 0.25 of the data)


In [68]:
# Out-of-fold KNN predictions from 4-fold cross-validation.
y_predknn2 = cross_val_predict(estimator=knn, X=X, y=y, cv=4)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_predknn2)

0.22552070864256643

### Using Decision tree gini and 4-fold cross validation (each fold holds out 0.25 of the data)

In [69]:
# Seeded decision tree evaluated with 4-fold cross-validation.
dtclf2 = DecisionTreeClassifier(random_state=1)
y_pred2 = cross_val_predict(estimator=dtclf2, X=X, y=y, cv=4)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_pred2)

0.19080679913813742

In [70]:
# Per-fold scores for the same tree: one value for each of the 4 folds.
y_score2 = cross_val_score(estimator=dtclf2, X=X, y=y, cv=4)
y_score2

array([0.15799432, 0.19447092, 0.21442308, 0.19689622])

In [71]:
# Average of the per-fold scores (each fold weighted equally).
y_score2.mean()

0.19094613560882173

In [72]:
# Holdout split in file order with 33% of the rows held out for testing.
# shuffle=False keeps the rows unshuffled; random_state is omitted because
# it has no effect when shuffling is disabled and would wrongly suggest a
# randomised split.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.33, shuffle=False
)

### Naive Bayes using holdout, test size 0.33

In [73]:
# Gaussian Naive Bayes trained and scored on the ordered 67/33 holdout split.
gnb1 = GaussianNB()
gnb1.fit(X_train3, y_train3)
y_prednb3 = gnb1.predict(X_test3)
metrics.accuracy_score(y_true=y_test3, y_pred=y_prednb3)

0.1065989847715736

### KNN using holdout, test size= 0.33

In [74]:
# KNN with default settings, trained and scored on the ordered 67/33 holdout
# split.
knn1 = KNeighborsClassifier()
knn1.fit(X_train3, y_train3)
y_predknn3 = knn1.predict(X_test3)
metrics.accuracy_score(y_true=y_test3, y_pred=y_predknn3)

0.23712835387962292

### Decision tree using gini, holdout, test size 0.33

In [75]:
# Seeded decision tree trained on the ordered 67/33 holdout split; fit()
# returns the fitted estimator.
dtclf3 = DecisionTreeClassifier(random_state=1).fit(X_train3, y_train3)
# Predictions for the held-out rows; scored in the next cell.
y_pred3 = dtclf3.predict(X_test3)

In [76]:
# Decision-tree accuracy on the ordered 33% holdout.
metrics.accuracy_score(y_true=y_test3, y_pred=y_pred3)

0.1986947063089195

In [77]:
# Random subsampling: 33% of the rows are held out for testing, with a fixed
# seed so the same partition is produced on every run.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

### Naive Bayes using random subsampling, test size= 0.33

In [78]:
# Gaussian Naive Bayes trained and scored on the random 67/33 split.
gnb2 = GaussianNB()
gnb2.fit(X_train4, y_train4)
y_prednb4 = gnb2.predict(X_test4)
metrics.accuracy_score(y_true=y_test4, y_pred=y_prednb4)

0.11457577955039884

### KNN using random subsampling, test size= 0.33

In [79]:
# KNN with default settings, trained and scored on the random 67/33 split.
knn2 = KNeighborsClassifier()
knn2.fit(X_train4, y_train4)
y_predknn4 = knn2.predict(X_test4)
metrics.accuracy_score(y_true=y_test4, y_pred=y_predknn4)

0.2226250906453952

### Decision tree using gini, random subsampling, test size=0.33

In [80]:
# Seeded decision tree trained on the random 67/33 split; fit() returns the
# fitted estimator.
dtclf4 = DecisionTreeClassifier(random_state=1).fit(X_train4, y_train4)
# Predictions for the held-out rows; scored in the next cell.
y_pred4 = dtclf4.predict(X_test4)

In [81]:
# Decision-tree accuracy on the randomly held-out 33% of rows.
metrics.accuracy_score(y_true=y_test4, y_pred=y_pred4)

0.19434372733865118

### Naive Bayes using 3-fold cross validation (each fold holds out about 0.33 of the data)

In [82]:
# Out-of-fold Naive Bayes predictions from 3-fold cross-validation.
# NOTE(review): per the scikit-learn docs an integer cv with a classifier
# selects an unshuffled stratified splitter rather than plain KFold --
# confirm that matches the "kfold" protocol named in the heading.
y_prednb5 = cross_val_predict(estimator=gnb, X=X, y=y, cv=3)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_prednb5)

0.1120421355039502

### KNN using 3-fold cross validation (each fold holds out about 0.33 of the data)


In [83]:
# Out-of-fold KNN predictions from 3-fold cross-validation.
y_predknn5 = cross_val_predict(estimator=knn, X=X, y=y, cv=3)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_predknn5)

0.23509695954033996

### Decision tree using gini and 3-fold cross validation (each fold holds out about 0.33 of the data)

In [84]:
# Seeded decision tree evaluated with 3-fold cross-validation.
dtclf5 = DecisionTreeClassifier(random_state=1)
y_pred5 = cross_val_predict(estimator=dtclf5, X=X, y=y, cv=3)
# Accuracy pooled over all out-of-fold predictions.
metrics.accuracy_score(y_true=y, y_pred=y_pred5)

0.19918601867368926

In [85]:
# Per-fold scores for the same tree: one value for each of the 3 folds.
y_score5 = cross_val_score(estimator=dtclf5, X=X, y=y, cv=3)
y_score5

array([0.18376068, 0.21726619, 0.1966739 ])

In [86]:
# Average of the per-fold scores (each fold weighted equally).
y_score5.mean()

0.1992335893785667