# Feature selection with backward elimination

In [None]:
import pandas as pd  # for importing dataset
import numpy as np
# Bind pyplot, not the top-level matplotlib package, to the conventional
# `plt` alias: the bare package does not provide plot()/subplots().
import matplotlib.pyplot as plt


In [None]:
# Load the dataset and split it into the feature frame and the target column.
df = pd.read_csv('abalone.csv')
print("Shape:\n", df.shape)
print("")
# First 5 instances of the dataset
print("First 5 rows:\n", df.head())
print("")
# x: every column except the last one (the input attributes)
x = df.iloc[:, :-1]
print("X:\n", x.head())
print("")
# y: the last column (the class label). It is selected relative to the end so
# that it always complements the `:-1` slice above instead of relying on a
# fixed column index.
y = df.iloc[:, -1]
print("Y:\n", y.head())
print("")

In [None]:
# Turn the categorical sex column (position 0) into one-hot indicator
# columns; all remaining columns are passed through untouched.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
columnTransformer = ColumnTransformer(
    transformers=[('sex', OneHotEncoder(), [0])],
    remainder='passthrough',
)
x = columnTransformer.fit_transform(x)
pd.DataFrame(x)


In [None]:
# Avoid the dummy variable trap: discard the first column of the encoded matrix.
x = x[:, 1:]
# Show the whole matrix.
pd.DataFrame(x)

In [None]:
# Split the data: 30% of the rows are held out for testing, the other 70%
# are used for training. random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit a multiple linear regression on every available feature.
mlr = LinearRegression()
mlr.fit(x_train, y_train)

# Score the predictions on the held-out rows (printed as a percentage).
y_pred = mlr.predict(x_test)
print("R2 Score:", r2_score(y_test, y_pred) * 100)

# Reported result for this baseline: R2 score of 52.48%.
# The model uses all independent variables:
# x = D1, D2, length, diameter, height, whole weight, shucked weight, viscera weight, shell weight

In [None]:
# From all independent variables, find the ones that matter most.

# Backward elimination:

import statsmodels.api as sm
# Unlike LinearRegression, which fits the b in y = mx + b automatically,
# statsmodels' OLS does not add an intercept by itself, so a column of ones
# is prepended to serve as the bias term.
# The row count is read from x rather than hard-coded as 4177, so the cell
# keeps working if the dataset size changes.
X = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
pd.DataFrame(X)

In [None]:
# Start from every row and every column (intercept plus all features).
X_opt = X[:, :]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()
# Two features have to be removed, one at a time, because they fulfil the
# elimination condition P > 0.05.
# The highest P-value (0.8) belongs to x3, i.e. length, so it goes first.

In [None]:
# Length (x3) is left out because its P-value was above 0.05.
# Columns kept: x0,x1,x2,x4,x5,x6,x7,x8,x9
# = Intercept, D1, D2, diameter, height, whole weight, shucked weight, viscera weight, shell weight
X_opt = X[:, [0, 1, 2, 4, 5, 6, 7, 8, 9]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()
# Next candidate for removal: x2 (dummy variable 2), whose P-value is 0.49.

In [None]:
# D2 (x2) is left out as well because its P-value was above 0.05.
X_opt = X[:, [0, 1, 4, 5, 6, 7, 8, 9]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

In [None]:
# Columns left after backward elimination (indices into the OLS matrix):
# x0,x1,x4,x5,x6,x7,x8,x9
# = Intercept, D1, diameter, height, whole weight, shucked weight, viscera weight, shell weight

# Read the dataset again into a fresh frame for the reduced model.
df1 = pd.read_csv(filepath_or_buffer='abalone.csv')
df1.shape

In [None]:
# Preview the first five rows of the reloaded frame.
df1.head(n=5)

In [None]:
# Build a copy of the frame without the "length" column, then preview it.
df2 = df1.drop(columns=["length"])
df2.head(n=5)

In [None]:
target_feature = 'rings'

# Target vector: the column named by target_feature.
y = df2[target_feature]

# Input features: everything in df2 except the target column.
X = df2.drop(columns=target_feature)

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at position 0; pass every other column through.
columnTransformer = ColumnTransformer(
    transformers=[('sex', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = columnTransformer.fit_transform(X)
pd.DataFrame(X)

In [None]:
# Avoid the dummy variable trap: discard the first column of the encoded matrix.
X = X[:, 1:]
# Show the whole matrix.
pd.DataFrame(X)

In [None]:
# Keep column 0 and columns 2..7, i.e. leave out column 1
# (presumably the D2 dummy eliminated by the OLS step earlier -- verify).
X = X[:, [0, 2, 3, 4, 5, 6, 7]]
pd.DataFrame(X)


In [None]:
from sklearn.model_selection import train_test_split
# test_size=0.3: 70% of the rows go to training, 30% to testing.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit on the reduced feature set; fit() returns the estimator itself.
regr = LinearRegression().fit(x_train, y_train)
# R^2 on the held-out rows.
regr.score(x_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training split; report the mean score.
r2Score = cross_val_score(regr, x_train, y_train, cv=10)
print("R2 Score Avg:{:.2f}%".format(r2Score.mean() * 100))

# KNN

In [None]:
import pandas as pd  # for importing dataset
import numpy as np
# Bind pyplot, not the top-level matplotlib package, to the conventional
# `plt` alias: the bare package does not provide plot()/subplots().
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings from here on.
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset and show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='abalone.csv')
df.shape

In [None]:
# x holds the input attributes: every column of df except the last one.
x = df.iloc[:, :-1]
x.shape
x.head(n=5)

In [None]:
# y holds the class label: the last column of df. It is selected relative to
# the end so it always complements the `:-1` feature slice instead of
# depending on a fixed column index.
y = df.iloc[:, -1]
print(y)

In [None]:
# Preview the first five feature rows.
x.head(n=5)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at position 0; pass every other column through.
columnTransformer = ColumnTransformer(
    transformers=[('sex', OneHotEncoder(), [0])],
    remainder='passthrough',
)
x = columnTransformer.fit_transform(x)

# Avoid the dummy variable trap: discard the first encoded column.
x = x[:, 1:]
pd.DataFrame(x)

In [None]:
from sklearn.model_selection import train_test_split
# test_size=0.3: 70% of the rows go to training, 30% to testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)
# Shapes of the four resulting pieces.
x_train.shape, x_test.shape, y_train.shape, y_test.shape


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k is chosen as the square root of the dataset size: sqrt(4177) = 64.6,
# rounded up to 65.
classifier = KNeighborsClassifier(n_neighbors=65)
classifier.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict labels for the held-out rows and print the fraction predicted correctly.
y_pred = classifier.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training split; report the mean score.
accuracy = cross_val_score(classifier, x_train, y_train, cv=10)
print("Accuracy:{:.2f}%".format(accuracy.mean() * 100))

In [None]:
from sklearn.metrics import precision_score
y_pred=classifier.predict(x_test)
print("Precision:{:.2f}%".format(precision_score(y_test,y_pred,average="micro")*100))

In [None]:
from sklearn.metrics import recall_score
y_pred=classifier.predict(x_test)
print("Recall:{:.2f}%".format(recall_score(y_test,y_pred,average='micro')*100))

In [None]:
from sklearn.metrics import recall_score
y_pred=classifier.predict(x_test)
print("Sensitivity:{:.2f}%".format(recall_score(y_test,y_pred,average='micro')*100))


# ID3

In [None]:
import pandas as pd  # for importing dataset
import numpy as np
# Bind pyplot, not the top-level matplotlib package, to the conventional
# `plt` alias: the bare package does not provide plot()/subplots().
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings from here on.
warnings.filterwarnings('ignore')

# Read the dataset and show its (rows, columns) shape.
df = pd.read_csv('abalone.csv')
df.shape

In [None]:
# x holds the input attributes: every column of df except the last one.
x = df.iloc[:, :-1]
x.shape
x.head(n=5)

In [None]:
# y holds the class label: the last column of df. It is selected relative to
# the end so it always complements the `:-1` feature slice instead of
# depending on a fixed column index.
y = df.iloc[:, -1]
print(y)

In [None]:
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.compose import ColumnTransformer
#columnTransformer = ColumnTransformer([('sex', OneHotEncoder(), [0])], remainder='passthrough')
#x = columnTransformer.fit_transform(x)

#Avoiding the dummy variable trap:

#pd.DataFrame(x)

In [None]:
#x=x[:,1:]
#pd.DataFrame(x)

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder_x = LabelEncoder()
# x was sliced out of df, so take an explicit copy before modifying it. The
# write below then touches only x and does not trigger pandas'
# chained-assignment warning (which is globally silenced in this notebook).
x = x.copy()
# Replace the first column (sex) with its integer codes. Assigning by column
# label swaps in the whole encoded column, so it takes the dtype of the
# encoded values.
x[x.columns[0]] = labelencoder_x.fit_transform(x.iloc[:, 0])

In [None]:
from sklearn.model_selection import train_test_split
# test_size=0.3: 70% of the rows go to training, 30% to testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)
# Shapes of the four resulting pieces.
x_train.shape, x_test.shape, y_train.shape, y_test.shape


In [None]:
# Human-readable labels for the eight input columns, passed to the ID3 tree
# export. The last label is "shell weight" (previously misspelled "shall").
feature_names = ["sex", "length", "diameter", "height", "whole weight", "shucked weight", "viscera weight", "shell weight"]


In [None]:
import six
import sys
# Register the standalone `six` module under the name `sklearn.externals.six`
# before importing id3 (presumably because id3 still imports it from there).
sys.modules['sklearn.externals.six'] = six
from id3 import Id3Estimator
from id3 import export_graphviz
import matplotlib.pyplot as plt
from graphviz import render

# Train the ID3 estimator and dump its tree in Graphviz dot format.
estimator = Id3Estimator()
estimator.fit(x_train, y_train)
export_graphviz(estimator.tree_, 'tree.dot', feature_names)

# Predict the held-out rows and show the first prediction.
predictions = estimator.predict(x_test)
print(x_test)
print("Predicted class label for new Data point : ", predictions[0])

# Render the dot file that was just written. The path matches the one given
# to export_graphviz, so the image always comes from this run's tree rather
# than from a file at a machine-specific absolute location.
render('dot', 'png', 'tree.dot')

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree that splits on entropy; random_state fixes any tie-breaking.
clf_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=1,
)
clf_tree.fit(x_train, y_train)


In [None]:
from sklearn import tree
# Write the fitted tree in Graphviz dot format. The context manager closes
# the file even if the export raises.
# NOTE: the destination is an absolute, machine-specific path.
with open("E:/Softwares/OneDrive/Desktop/Machine Learning/Project/Dataset/Abalone/treeID3.dot", 'w') as dotfile:
    tree.export_graphviz(clf_tree, out_file=dotfile)


#from graphviz import render

#render('dot', 'png', 'E:/Softwares/OneDrive/Desktop/Machine Learning/Project/Dataset/Abalone/treeID3.dot')

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree on the training split.
accuracy = cross_val_score(clf_tree, x_train, y_train, cv=10)
print("Accuracy:{:.2f}%".format(accuracy.mean() * 100))

In [None]:
from sklearn import metrics
# Accuracy of `predictions` against the test labels, printed as a fraction.
# In this file `predictions` is the ID3 estimator's output on x_test.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predictions))

In [None]:
from sklearn.metrics import recall_score
y_pred=estimator.predict(x_test)
print("Sensitivity:{:.2f}%".format(recall_score(y_test,y_pred,average='micro')*100))

In [None]:
from sklearn.metrics import recall_score
y_pred=estimator.predict(x_test)
print("Recall:{:.2f}%".format(recall_score(y_test,y_pred,average='micro')*100))

In [None]:
from sklearn.metrics import recall_score
y_pred=estimator.predict(x_test)
print("Sensitivity:{:.2f}%".format(recall_score(y_test,y_pred,average='micro')*100))

In [None]:
#from sklearn import tree
#fig, ax = plt.subplots(figsize=(100,100))
#tree.plot_tree(clf_tree, fontsize=10)
#plt.show()