In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import pickle
from sklearn import datasets
from sklearn.model_selection import train_test_split

#### Data Gathering

In [2]:
# Load the iris dataset bundled with scikit-learn, then pull out the
# feature matrix (X) and the integer class labels (y) in one step.
data = datasets.load_iris()
X, y = data.data, data.target

In [3]:
# Tabular views of the data: one column per measured feature, plus a
# single 'Type' column holding the species name looked up from the label.
X_df = pd.DataFrame(data=X, columns=data.feature_names)
y_df = pd.DataFrame({'Type': data.target_names[y]})

In [4]:
# Place features and species name side by side in one frame for inspection.
df = pd.concat(objs=[X_df, y_df], axis='columns')

#### Splitting into Training and Testing Data 

In [5]:
# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so that re-running the cell gives the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)

#### Checking for Loaded Model

In [18]:
# Load the previously pickled pipeline from disk and score it on the test split.
# NOTE: this cell needs 'pipe.sav' to exist already; this notebook only writes
# that file in the "Making the Model" section further down.
# NOTE(security): pickle.load can execute arbitrary code, so only load files
# that you created yourself.

filename = 'pipe.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

1.0


In [6]:
# First five rows of the combined frame.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# First five rows whose species is versicolor.
df.loc[df["Type"] == "versicolor"].head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [8]:
# Last five rows of the combined frame.
df.tail(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Type
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [8]:
# One hand-written measurement, in the same column order as the frame above
# (sepal length, sepal width, petal length, petal width).
# It gets its own name so the full feature matrix `X` is not overwritten;
# re-running the train/test split cell would otherwise see a single row.
X_sample = np.array([[5.1, 3.5, 1.4, 0.2]])
loaded_model.predict(X_sample)

array([0])

#### Finding optimum path for the classification model

In [10]:
#Importing 

from sklearn import decomposition, pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix


In [11]:
# Create one instance of every estimator the search below can plug into the pipeline.

# Preprocessing
ss = StandardScaler()

# Dimensionality-reduction candidates
# (svd is created here but is not referenced by any parameter grid in the cells shown)
pca = decomposition.PCA()
kpca = decomposition.KernelPCA()
svd = decomposition.TruncatedSVD()
lda = LinearDiscriminantAnalysis()

# Classifier candidates
lr = LogisticRegression()
svm = SVC()

# Component counts to try in the reduction step
n_components = [2, 3]

#### (a) With Scaling

In [62]:
# Three-stage pipeline: scale -> reduce dimensions -> classify.
# The 'reduction' and 'classify' steps are placeholders that the grid search swaps out.
pipe = pipeline.Pipeline(
    steps=[
        ('scalar', ss),
        ('reduction', pca),
        ('classify', lr),
    ]
)

##### Defining params for Dimensionality Reduction

In [63]:
# Search spaces for the 'reduction' pipeline step. Each dict swaps in one
# reducer (via the 'reduction' key) and lists the settings to try for it.

# PCA: vary the number of kept components and the SVD solver.
param_pca = {
    'reduction': [pca],
    'reduction__n_components': n_components,
    'reduction__svd_solver': ['randomized', 'auto', 'full']
}

# LDA: n_components may not exceed min(n_classes - 1, n_features), which is 2
# for the three-class iris data. Asking for 3 is either capped (old scikit-learn)
# or rejected with a ValueError (current scikit-learn), so only 1 and 2 are searched.
param_lda = {
    'reduction': [lda],
    'reduction__n_components': [1, 2]
}

# Kernel PCA: same component counts as PCA, with two non-linear kernels.
param_kpca = {
    'reduction': [kpca],
    'reduction__n_components': n_components,
    'reduction__kernel': ['poly', 'rbf']
}


##### Defining Params for Classification Model

In [64]:
# Search spaces for the 'classify' pipeline step: each one swaps in a
# classifier and lists the regularisation strengths (C) to try for it.
param_lr = dict(
    classify=[lr],
    classify__C=[0.1, 1, 10],
)

param_svm = dict(
    classify=[svm],
    classify__C=[0.1, 1, 10, 100],
)


In [65]:
# Pair every reduction grid with every classifier grid, so the search
# covers each (reducer, classifier) combination.
dimen_params = [param_pca, param_lda, param_kpca]
class_params = [param_lr, param_svm]
params_list = [
    {**reduction_grid, **classifier_grid}
    for reduction_grid in dimen_params
    for classifier_grid in class_params
]

In [66]:
# 5-fold cross-validated search over every grid in params_list,
# fitted on the training split only.
grd = GridSearchCV(
    estimator=pipe,
    param_grid=params_list,
    cv=5,
    verbose=1,
)
grd.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


















[Parallel(n_jobs=1)]: Done 420 out of 420 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scalar', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduction', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)), ('classify', LogisticRegression(C=10, class_weight=No...enalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'reduction': [PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)], 'reduction__n_components': [2, 3], 'reduction__svd_solver': ['randomized', 'auto', 'full'], 'classify': [LogisticRegression(C=10, class_weight=None, dual=... random_state=None,
  shrinking=True, tol=0.001, verbose=False)], 'classify__C': [0.1, 1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
    

In [67]:
# Parameter combination selected by the grid search.
grd.best_params_

{'classify': SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False),
 'classify__C': 0.1,
 'reduction': LinearDiscriminantAnalysis(n_components=2, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
 'reduction__n_components': 2}

In [68]:
# Cross-validated score of the selected combination (computed on the training split).
grd.best_score_

0.975

In [69]:
# Per-class precision / recall / F1 of the refit best model on the held-out test split.
test_report = classification_report(y_test, grd.predict(X_test))
print(test_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

   micro avg       1.00      1.00      1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [70]:
# Rows are true classes, columns are predicted classes (test split).
confusion_matrix(y_true=y_test, y_pred=grd.predict(X_test))

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10]], dtype=int64)

#### Making the Model

In [73]:
# Fresh estimators configured with the settings reported by grd.best_params_ above:
# standard scaling, LDA down to 2 components, then an SVC with C=0.1.
ss = StandardScaler()
lda = LinearDiscriminantAnalysis(n_components=2)
svm = SVC(C=0.1)

In [74]:
# Final pipeline: scale -> LDA -> SVC.
pipe = pipeline.Pipeline(
    steps=[
        ('scalar', ss),
        ('reduction', lda),
        ('classify', svm),
    ]
)

In [75]:
# Fit the final pipeline on the training split only.
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('scalar', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduction', LinearDiscriminantAnalysis(n_components=2, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)), ('classify', SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [77]:
# NOTE: this is accuracy on the training split, not on the held-out test data.
pipe.score(X_train, y_train)

0.975

In [80]:
# save the model to disk
# The context manager closes the file once the dump finishes, so the pickle
# is fully written before any later cell tries to read it back.
filename = 'pipe.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pipe, model_file)


### For Util.py

In [1]:
import pandas as pd
import numpy as np


In [20]:
# Example request payload: the four measurements, given as strings.
data = dict(
    sepal_length='5.1',
    sepal_width='3.5',
    petal_length='1.4',
    petal_width='0.2',
)

In [13]:
# Check that converting the payload yields a single row with four features.
np.array([[
    float(data[key])
    for key in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
]]).shape

(1, 4)

In [21]:
import pickle

def predict(data, filename='pipe.sav'):
    """Predict the iris species name for a single flower measurement.

    Parameters
    ----------
    data : mapping
        Must provide the keys 'sepal_length', 'sepal_width', 'petal_length'
        and 'petal_width'. Each value is passed through ``float()``, so
        numeric strings such as '5.1' are accepted.
    filename : str, optional
        Path of the pickled model to load. Defaults to 'pipe.sav'.

    Returns
    -------
    One of 'setosa', 'versicolor' or 'virginica' (as a NumPy string scalar).

    Raises
    ------
    KeyError
        If one of the four measurement keys is missing from ``data``.
    ValueError
        If a measurement string cannot be converted by ``float()``.
    FileNotFoundError
        If ``filename`` does not exist.
    """
    feature_keys = ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    # One row, four columns, in the order listed in feature_keys.
    X = np.array([[float(data[key]) for key in feature_keys]])

    # load the model from disk
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # `filename` at files you trust.
    with open(filename, 'rb') as model_file:  # closes the handle on exit
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict(X)

    # The model returns integer class indices; map the first one to its name.
    names = np.array(['setosa', 'versicolor', 'virginica'])

    return names[y_pred[0]]

In [22]:
# Run the helper on the example payload defined above.
x = predict(data)

In [23]:
# Show the predicted species name.
x

'setosa'

In [17]:
# Species names in label order. By this point `data` has been rebound to the
# example payload dict, which has no `target_names` attribute, so the names are
# read straight from the dataset loader instead.
datasets.load_iris().target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')