In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

### Loading dataset

In [2]:
# Fetch the bundled iris dataset, then separate the feature matrix from the class labels.
data=load_iris()
X,y=data.data,data.target

In [3]:
# Inspect the feature matrix: one row per sample, four measurements per row.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
# Inspect the label vector: integer class codes 0, 1 and 2, one per sample.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### About dataset

The above dataset contains information about three species (['setosa', 'versicolor', 'virginica']) of iris flower. Each data point contains four features ('sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)') of a single flower. We have to predict the species name using these four features.

### Split dataset into train and test set

In [25]:
# Hold out a quarter of the samples for testing; the fixed seed keeps the split repeatable.
train_X,test_X,train_y,test_y=train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

### Let's scale our dataset in order to get better performance

In [26]:
# Learn the scaling statistics from the training split only, then apply that same
# transform to both splits so no information from the test split leaks into the fit.
sc=StandardScaler()
train_X=sc.fit_transform(train_X)
test_X=sc.transform(test_X)

### Let's apply the Logistic Regression algorithm with default parameters

In [27]:
# Fit a logistic regression with default hyper-parameters on the scaled training split.
lr=LogisticRegression().fit(train_X,train_y)
# Bare name as the last expression so the cell still displays the fitted estimator.
lr

LogisticRegression()

make prediction on test set

In [28]:
# Predicted class labels for the held-out split, from the fitted logistic regression.
pred=lr.predict(X=test_X)

In [29]:
# Compare accuracy on the training split against the held-out split, then break the
# held-out predictions (`pred`) down per class.
print(
    "Accuracy of the algorithm on train set ",
    accuracy_score(y_true=train_y,y_pred=lr.predict(train_X)),
)
print(
    "Accuracy of the algorithm on test set",
    accuracy_score(y_true=test_y,y_pred=pred),
)
print("Classification report \n",classification_report(y_true=test_y,y_pred=pred),"\n")
print("Confusion matrix \n",confusion_matrix(y_true=test_y,y_pred=pred))

Accuracy of the algorithm on train set  0.9553571428571429
Accuracy of the algorithm on test set 0.9473684210526315
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.90      0.90      0.90        10
           2       0.93      0.93      0.93        14

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38
 

Confusion matrix 
 [[14  0  0]
 [ 0  9  1]
 [ 0  1 13]]


### Let's apply SVM algorithm on this dataset

In [30]:
# Fit a support vector classifier with default hyper-parameters on the scaled training split.
svm_clf=SVC().fit(train_X,train_y)
# Bare name as the last expression so the cell still displays the fitted estimator.
svm_clf

SVC()

make prediction on test set

In [31]:
# Predicted class labels for the held-out split, from the fitted SVM.
pred=svm_clf.predict(X=test_X)

In [32]:
# Compare accuracy on the training split against the held-out split, then break the
# held-out predictions (`pred`) down per class.
print(
    "Accuracy of the algorithm on train set ",
    accuracy_score(y_true=train_y,y_pred=svm_clf.predict(train_X)),
)
print(
    "Accuracy of the algorithm on test set",
    accuracy_score(y_true=test_y,y_pred=pred),
)
print("Classification report \n",classification_report(y_true=test_y,y_pred=pred),"\n")
print("Confusion matrix \n",confusion_matrix(y_true=test_y,y_pred=pred))

Accuracy of the algorithm on train set  0.9821428571428571
Accuracy of the algorithm on test set 0.9736842105263158
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.91      1.00      0.95        10
           2       1.00      0.93      0.96        14

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38
 

Confusion matrix 
 [[14  0  0]
 [ 0 10  0]
 [ 0  1 13]]


### Let's apply the K-Nearest Neighbors algorithm on this dataset

In [15]:
# Fit a k-nearest-neighbours classifier that votes over the 5 closest training samples.
knn=KNeighborsClassifier(n_neighbors=5).fit(train_X,train_y)
# Bare name as the last expression so the cell still displays the fitted estimator.
knn

KNeighborsClassifier()

make prediction on test set

In [16]:
# Predicted class labels for the held-out split, from the fitted KNN classifier.
pred=knn.predict(X=test_X)

In [17]:
# Compare accuracy on the training split against the held-out split, then break the
# held-out predictions (`pred`) down per class.
print(
    "Accuracy of the algorithm on train set ",
    accuracy_score(y_true=train_y,y_pred=knn.predict(train_X)),
)
print(
    "Accuracy of the algorithm on test set",
    accuracy_score(y_true=test_y,y_pred=pred),
)
print("Classification report \n",classification_report(y_true=test_y,y_pred=pred),"\n")
print("Confusion matrix \n",confusion_matrix(y_true=test_y,y_pred=pred))

Accuracy of the algorithm on train set  0.9732142857142857
Accuracy of the algorithm on test set 0.9736842105263158
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.91      1.00      0.95        10
           2       1.00      0.93      0.96        14

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38
 

Confusion matrix 
 [[14  0  0]
 [ 0 10  0]
 [ 0  1 13]]


### Let's apply neural network on this dataset

In [18]:
# Fit a multi-layer perceptron on the scaled training split.
# MLP training is stochastic (random weight initialisation and batch sampling), so the
# seed is pinned to make the reported scores reproducible after a kernel restart.
# max_iter=500 raises the iteration cap above the library default.
neural_clf=MLPClassifier(max_iter=500,random_state=100)
neural_clf.fit(train_X,train_y)

MLPClassifier(max_iter=500)

make prediction on test set

In [20]:
# Predicted class labels for the held-out split, from the fitted neural network.
pred=neural_clf.predict(X=test_X)

In [18]:
# Compare accuracy on the training split against the held-out split, then break the
# held-out predictions (`pred`) down per class.
print(
    "Accuracy of the algorithm on train set ",
    accuracy_score(y_true=train_y,y_pred=neural_clf.predict(train_X)),
)
print(
    "Accuracy of the algorithm on test set",
    accuracy_score(y_true=test_y,y_pred=pred),
)
print("Classification report \n",classification_report(y_true=test_y,y_pred=pred),"\n")
print("Confusion matrix \n",confusion_matrix(y_true=test_y,y_pred=pred))

Accuracy of the algorithm on train set  0.9821428571428571
Accuracy of the algorithm on test set 0.9473684210526315
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.90      0.90      0.90        10
           2       0.93      0.93      0.93        14

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38
 

Confusion matrix 
 [[14  0  0]
 [ 0  9  1]
 [ 0  1 13]]


### Conclusion

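After applying Logistic Regression, Support Vector Machine, K-Nearest Neighbors and neural network algorithms, we observe that KNN and SVM performed equally well on the test set (accuracy ≈ 0.974). SVM has a slightly higher score on the train set (≈ 0.982 vs ≈ 0.973 for KNN), which suggests it is overfitting a bit on the train set. Hence KNN with 5 nearest neighbors is the best choice for our use case. Note that the test set contains only 38 samples, so each of these differences amounts to a single prediction and should be treated as indicative rather than conclusive.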