In [3]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import pickle
from sklearn import datasets
from sklearn.model_selection import train_test_split

#Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

#### Data Gathering

In [4]:
data = datasets.load_iris()
X = data.data
y = data.target

In [5]:
X_df = pd.DataFrame(X, columns=data.feature_names)
y_df = pd.DataFrame(data.target_names[y], columns=['Type'])

In [6]:
df = pd.concat([X_df, y_df], axis = 1)

#### Splitting into Training and Testing Data 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 7, stratify = y)

#### Checking for Loaded Model

In [8]:
# load the model from disk
#Uncomment for preloaded model 

# filename = 'pipe.sav'
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(result)

In [9]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [10]:
df[df.Type == "versicolor"][:5]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [11]:
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


#### Finding optimum path for the classification model

In [19]:
#Importing Scaling, Dimensionality Reduction functions and classifiers

from sklearn import decomposition, pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix


In [20]:
#making instance of all objects 
ss = StandardScaler()
lda = LinearDiscriminantAnalysis()
pca = decomposition.PCA()
kpca = decomposition.KernelPCA()
svd = decomposition.TruncatedSVD()
lr = LogisticRegression()
svm = SVC()
n_components = [2,3]

#### (a) With Scaling

In [21]:
#Making pipeline
pipe = pipeline.Pipeline(steps = [('scalar', ss),('reduction', pca), ('classify', lr)])

##### Defining params for Dimesionality Reduction

In [23]:
param_pca = {
    'reduction':[pca],
    'reduction__n_components': n_components,
    'reduction__svd_solver': ['randomized','auto', 'full']
}

param_lda = {
    'reduction':[lda], 
    'reduction__n_components':[2,3]
}

param_kpca = {
    'reduction':[kpca],
    'reduction__n_components': [2,3],
    'reduction__kernel': ['poly','rbf']
}


##### Defining Params for Classification Model

In [24]:
param_lr = {
    'classify':[lr],
    'classify__C':[0.1,1,10], 
}

param_svm = {
    'classify':[svm],
    'classify__C':[0.1,1,10,100]
}


In [25]:
dimen_params = [param_pca, param_lda, param_kpca]
class_params = [param_lr, param_svm]
params_list = [{**x, **y} for x in dimen_params for y in class_params]

In [26]:
grd = GridSearchCV(pipe, param_grid=params_list, cv = 5, verbose = 1)
grd.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scalar', StandardScaler()),
                                       ('reduction',
                                        PCA(n_components=3,
                                            svd_solver='randomized')),
                                       ('classify', LogisticRegression(C=10))]),
             param_grid=[{'classify': [LogisticRegression(C=10)],
                          'classify__C': [0.1, 1, 10],
                          'reduction': [PCA(n_components=3,
                                            svd_solver='randomized')],
                          'reduction__n_components': [2, 3],
                          'reduction__sv...
                          'reduction': [LinearDiscriminantAnalysis()],
                          'reduction__n_components': [2, 3]},
                         {'classify': [LogisticRegression(C=10)],
                          'classify__C': [0.1, 1, 10],
                          'reduction

In [27]:
grd.best_params_

{'classify': LogisticRegression(C=10),
 'classify__C': 10,
 'reduction': PCA(n_components=3, svd_solver='randomized'),
 'reduction__n_components': 3,
 'reduction__svd_solver': 'randomized'}

In [28]:
grd.best_score_

0.975

In [29]:
print(classification_report(y_test, grd.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [30]:
confusion_matrix(y_test,  grd.predict(X_test))

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10]], dtype=int64)

#### Making the Model

In [31]:
svm = SVC(C = 0.1)
lda = LinearDiscriminantAnalysis(n_components=2)
ss = StandardScaler()

In [32]:
#Making pipeline
pipe = pipeline.Pipeline(steps = [('scalar', ss),('reduction', lda), ('classify', svm)])

In [33]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scalar', StandardScaler()),
                ('reduction', LinearDiscriminantAnalysis(n_components=2)),
                ('classify', SVC(C=0.1))])

In [34]:
pipe.score(X_train, y_train)

0.975

In [35]:
# save the model to disk
filename = 'pipe.sav'
pickle.dump(pipe, open(filename, 'wb'))


### For Util.py

In [36]:
import pandas as pd
import numpy as np


In [37]:
data = {
    'sepal_length': '5.1',
    'sepal_width': '3.5',
    'petal_length': '1.4',
    'petal_width': '0.2'
}

In [38]:
np.array([[float(data['sepal_length']), float(data['sepal_width']), float(data['petal_length']), float(data['petal_width'])]]).shape

(1, 4)

In [39]:
import pickle

def predict(data):
    X = np.array([[
        float(data['sepal_length']),
        float(data['sepal_width']),
        float(data['petal_length']),
        float(data['petal_width'])
    ]])
    # load the model from disk

    filename = 'pipe.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred = loaded_model.predict(X)

    names = np.array(['setosa', 'versicolor', 'virginica'])

    return names[y_pred[0]]

In [40]:
x = predict(data)