# Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Iris Flower Species Prediction

## Data Collection

Features

In [None]:
# Fetch the bundled iris dataset and wrap its measurement matrix in a DataFrame
# whose columns carry the dataset's own feature names.
data = load_iris()
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Adding the Target Column to the DataFrame

In [None]:
# Translate each integer class code in `data.target` into its species name and
# store the result as a new 'species' column (this modifies `df` in place).
df['species'] = np.take(data.target_names, data.target)
# Show the available class labels for reference.
print(data.target_names)
df

['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Data Preprocessing

### Checking for missing values

In [None]:
# Count the missing entries in every column of the frame.
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

### Handling duplicates

In [None]:
# List every row that has an exact twin; keep=False flags all members of a
# duplicate group rather than only the repeats.
df.loc[df.duplicated(keep=False)]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
101,5.8,2.7,5.1,1.9,virginica
142,5.8,2.7,5.1,1.9,virginica


In [None]:
# Keep the first occurrence of each duplicated row and discard the repeats.
# The original index labels are retained (the index is not reset).
df = df.drop_duplicates(keep='first')

In [None]:
# Separate the label column from the features: keep 'species' as the target
# Series and leave `df` holding only the measurement columns.
# Note: this cell is not re-runnable on its own, because 'species' is gone from
# `df` once it has executed.
train_Y_df = df['species']
df = df.drop(columns='species')
train_Y_df

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 149, dtype: object

In [None]:
# `df` now holds only the feature columns (the 'species' column was removed
# above). `train_X_df` is a second name for the same object, not a copy.
train_X_df = df
train_X_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


# Implementation

## Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV

# Two-stage pipeline: feature scaling followed by a classifier.
# Both steps act as placeholders here; the grid search below supplies its own
# candidates for 'scaler' and 'classifier'.
pipe = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
        ('classifier', LogisticRegression(solver='sag', max_iter=2000)),
    ]
)

## Hyperparameter Tuning

In [None]:
# Two search spaces, one per model family. Each tries the same four scaling
# options and swaps a different estimator into the pipeline's 'classifier' step.
# NOTE(review): newer scikit-learn releases deprecate the 'none' penalty string
# and the multi_class option - confirm against the installed version.
param_distributions = [
    # Logistic regression: with / without an L2 penalty, for each multiclass strategy.
    {
        'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), 'passthrough'],
        'classifier': [LogisticRegression(solver='lbfgs', max_iter=15000)],
        'classifier__penalty': ['l2', 'none'],
        'classifier__multi_class': ['ovr', 'multinomial', 'auto'],
    },
    # k-nearest neighbours: k from 1 to 19 and distance power p from 1 to 4.
    {
        'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), 'passthrough'],
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': range(1, 20),
        'classifier__p': [1, 2, 3, 4],
    },
]

# Exhaustive search over both grids, scored by accuracy with 5-fold
# cross-validation; refit=True is requested so the best candidate is refit.
grid_search_cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_distributions,
    scoring='accuracy',
    cv=5,
    refit=True,
)
grid_search_cv.fit(train_X_df, train_Y_df)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        RobustScaler(copy=True,
                                                     quantile_range=(25.0,
                                                                     75.0),
                                                     with_centering=True,
                                                     with_scaling=True)),
                                       ('classifier',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
    

## Best Parameters

In [None]:
# Show the parameter combination selected by the grid search.
print(grid_search_cv.best_params_)

{'classifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=3,
                     weights='uniform'), 'classifier__n_neighbors': 6, 'classifier__p': 3, 'scaler': 'passthrough'}


## Validation Accuracy (mean cross-validated score)

In [38]:
# Show the score (accuracy, per the `scoring` argument) of the best candidate.
print(grid_search_cv.best_score_)

0.9866666666666667
