In [82]:
# Importing Models
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns


In [2]:
# Load the iris dataset bundled with scikit-learn
iris = load_iris()

In [3]:
# Split the dataset into the feature matrix (x) and the class labels (y)
x, y = iris.data, iris.target

In [37]:
# Instantiating ML algorithms


In [32]:
# Candidate classifiers, keyed by the label used when their scores are printed
models = {
    'SVM': SVC(),
    'logreg': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'Naive-Bayes Gauss': GaussianNB(),
    'Naive-Bayes Multi': MultinomialNB(),
    'Decision tree': DecisionTreeClassifier(),
    'Knn': KNeighborsClassifier(n_neighbors=20),
    'GradientBoost': GradientBoostingClassifier(),
    # AdaBoost with an explicitly supplied decision-tree base learner
    'AdaBoost': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
}

In [33]:
# Report the mean 13-fold cross-validated accuracy (as a percentage) per model
for name, estimator in models.items():
    fold_accuracies = cross_val_score(estimator, x, y, scoring='accuracy', cv=13)
    print('{} scored: {:.1f}'.format(name, fold_accuracies.mean() * 100))


AdaBoost scored: 95.5
Naive-Bayes Gauss scored: 95.5
GradientBoost scored: 96.2
Naive-Bayes Multi scored: 95.5
Decision tree scored: 96.2
logreg scored: 96.2
SVM scored: 97.4
RandomForestClassifier scored: 96.2
Knn scored: 97.4


In [11]:
# Taking SVM ahead and improving it

In [12]:
# Hyper-parameter grid for the SVC search: every kernel is combined with both
# decision-function shapes (4 x 2 = 8 candidate settings)
param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'decision_function_shape': ['ovr', 'ovo'],
}

# Default-parameter SVC used as the base estimator for the search
svm_model = SVC()

In [13]:
# Grid search over param_grid, scored by 10-fold cross-validated accuracy;
# n_jobs=-1 lets the search run in parallel
grid = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [14]:
# Run the SVC grid search on the full feature matrix and labels
grid.fit(x, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'decision_function_shape': ['ovr', 'ovo']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

### If param_grid contains parameters that are invalid for the model, the following error occurs. Always make sure the parameters and their values exist for the given model.


File "<string>", line unknown
SyntaxError: invalid or missing encoding declaration for C:\tree\\svm\\libsvm.pyd

In [15]:
# Show the SVC configured with the best-scoring parameter combination
grid.best_estimator_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Parameter values (from param_grid only) that gave the best score
grid.best_params_

{'decision_function_shape': 'ovr', 'kernel': 'rbf'}

In [17]:
# Best mean cross-validated accuracy found while searching the grid
grid.best_score_

0.98

In [18]:
# Mean and standard deviation of the CV accuracy for every parameter combination tried
grid.grid_scores_

[mean: 0.97333, std: 0.04422, params: {'kernel': 'linear', 'decision_function_shape': 'ovr'},
 mean: 0.98000, std: 0.03055, params: {'kernel': 'rbf', 'decision_function_shape': 'ovr'},
 mean: 0.97333, std: 0.04422, params: {'kernel': 'poly', 'decision_function_shape': 'ovr'},
 mean: 0.06000, std: 0.10088, params: {'kernel': 'sigmoid', 'decision_function_shape': 'ovr'},
 mean: 0.97333, std: 0.04422, params: {'kernel': 'linear', 'decision_function_shape': 'ovo'},
 mean: 0.98000, std: 0.03055, params: {'kernel': 'rbf', 'decision_function_shape': 'ovo'},
 mean: 0.97333, std: 0.04422, params: {'kernel': 'poly', 'decision_function_shape': 'ovo'},
 mean: 0.06000, std: 0.10088, params: {'kernel': 'sigmoid', 'decision_function_shape': 'ovo'}]

In [19]:
# Re-create the SVC with the settings reported by grid.best_params_.
# One-vs-rest ('ovr') is kept because this is a multi-class problem; in the
# grid results 'ovo' reached the same mean accuracy for the rbf kernel.
svmc = SVC(kernel='rbf', decision_function_shape='ovr')

In [57]:
# Re-score the tuned SVC: mean accuracy over 10 cross-validation folds
svm_fold_scores = cross_val_score(svmc, x, y, scoring='accuracy', cv=10)
new_scoresvm = svm_fold_scores.mean()
print(new_scoresvm)

0.98


In [58]:
# Percentage improvement of the tuned SVC over the untuned baseline.
# 0.974 is the accuracy the default SVC scored in the model-comparison loop.
# NOTE: that baseline was measured with cv=13 while new_scoresvm uses cv=10,
# so the comparison is only approximate.
baseline_svm = 0.974
improvement_svm = (new_scoresvm - baseline_svm) / baseline_svm * 100
# '{:.2f}' always prints two decimals; the previous '{:0.2}' meant two
# *significant digits* (e.g. 1.234 -> '1.2', 12.3 -> '1.2e+01').
print('Improvement by {:.2f}% '.format(improvement_svm))

Improvement by 0.62% 


## Accuracy of the optimized SVC model is 98%

In [41]:
# KNN baseline before tuning: default hyper-parameters,
# mean accuracy over 13 cross-validation folds
knn = KNeighborsClassifier()

cross_val_score(knn, x, y, scoring='accuracy', cv=13).mean()

0.96794871794871784

In [40]:
# Preview the candidate neighbour counts (5 through 25) used in the KNN grid below
list(range(5,26))

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]

In [45]:
# Search space for tuning KNN
param_gridKNN = {
    'n_neighbors': list(range(5, 26)),  # k = 5 .. 25
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
}
# NOTE(review): 'algorithm' and 'leaf_size' presumably only change how
# neighbours are looked up (speed/memory), not the predictions; if so they
# multiply the search cost by 20 without affecting accuracy. Confirm against
# the scikit-learn KNeighborsClassifier docs before trimming the grid.

# Grid search scored by 10-fold cross-validated accuracy, run in parallel
gridknn = GridSearchCV(
    estimator=knn,
    param_grid=param_gridKNN,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)


In [46]:
# Run the KNN grid search on the full feature matrix and labels
gridknn.fit(x, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': [10, 20, 30, 40, 50], 'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [47]:
# Show the KNN classifier configured with the best-scoring parameter combination
gridknn.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

In [48]:
# Parameter values (from param_gridKNN only) that gave the best score
gridknn.best_params_

{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 13, 'weights': 'uniform'}

In [49]:
# Best mean cross-validated accuracy found by the KNN grid search
gridknn.best_score_

0.98

In [50]:
# Rebuild the KNN classifier with the hyper-parameters shown by
# gridknn.best_estimator_ (the first four come from the grid search)
knn = KNeighborsClassifier(
    n_neighbors=13,
    weights='uniform',
    algorithm='auto',
    leaf_size=10,
    metric='minkowski',
    metric_params=None,
    p=2,
    n_jobs=1,
)

In [52]:
# Re-score the tuned KNN: mean accuracy over 10 cross-validation folds
knn_fold_scores = cross_val_score(knn, x, y, scoring='accuracy', cv=10)
new_scoreknn = knn_fold_scores.mean()
new_scoreknn

0.98000000000000009

In [54]:
# Percentage improvement of the tuned KNN over the baseline.
# 0.974 is the accuracy KNN (n_neighbors=20) scored in the model-comparison loop.
# NOTE: that baseline was measured with cv=13 while new_scoreknn uses cv=10,
# so the comparison is only approximate.
baseline_knn = 0.974
improvement_knn = (new_scoreknn - baseline_knn) / baseline_knn * 100
# '{:.2f}' always prints two decimals; the previous '{:0.2}' meant two
# *significant digits* (e.g. 1.234 -> '1.2', 12.3 -> '1.2e+01').
print('Improvement by {:.2f}% '.format(improvement_knn))

Improvement by 0.62% 


## Accuracy of the optimized Knn model is 98%

In [60]:
# GradientBoosting baseline before tuning: default hyper-parameters,
# mean accuracy over 13 cross-validation folds
gb = GradientBoostingClassifier()
cross_val_score(gb, x, y, scoring='accuracy', cv=13).mean()

0.96153846153846156

In [77]:
# Search space for GradientBoosting: 6 x 5 x 4 = 120 candidate settings
param_gridgb = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.001, 0.01, 0.1, 1],
}

# Grid search scored by 10-fold cross-validated accuracy, run in parallel
gridgb = GridSearchCV(
    estimator=gb,
    param_grid=param_gridgb,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)



In [78]:
# Run the GradientBoosting grid search on the full feature matrix and labels
gridgb.fit(x,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [50, 100, 150, 200, 250, 300], 'learning_rate': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [79]:
# Show the GradientBoosting classifier configured with the best-scoring parameter combination
gridgb.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=150, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [80]:
# Parameter values (from param_gridgb only) that gave the best score
gridgb.best_params_

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 150}

In [84]:
# Keep the best score found by the GradientBoosting grid search
# (the value is stored but not displayed by this cell)
new_scoregb = gridgb.best_score_

## Accuracy of the optimized GradientBoosting model is 96.6%