### Use GridSearch CV and select the best parameters for Support Vector Machines

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv("C:/Users/Ashish/Desktop/Python Tutorials/CSV files/Advertising_data.txt")

In [5]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


## Based on Age and Estimated salary, we will predict if the customer will purchase the product or Not.

In [6]:
# We will get rid of unwanted columns i.e User Id and Gender. and we will seperate the dependent and independent features

X = data.iloc[:, [2,3]]
y = data.iloc[:, [4]]

In [7]:
X.head()

Unnamed: 0,Age,EstimatedSalary
0,19.0,19000.0
1,35.0,20000.0
2,26.0,43000.0
3,27.0,57000.0
4,19.0,76000.0


In [8]:
y.head()

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0


In [9]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [10]:
# Feature Scaling (Age and salary are in different units and there is a huge different between them)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### fit_transform train data and transform test data

In [11]:
# Applying the Support Vector Machine on the data

from sklearn.svm import SVC
classifier = SVC(kernel = "linear", random_state = 9) 
classifier.fit(X_train, y_train)

  return f(**kwargs)


SVC(kernel='linear', random_state=9)

### We can also use a non linear line to seperate the 2 classes. We need to use kernel = "rbf"

In [12]:
# Predicting the Test Set Results
y_pred = classifier.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [14]:
print(cm)

[[52  2]
 [10 16]]


In [15]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [16]:
print(accuracy)

0.85


### In GridSearch CV, all the parameters need to be given in the form of List of Dictionaries

In [17]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

  return f(**kwargs)


In [18]:
accuracy = grid_search.best_score_

In [19]:
print(accuracy)

0.90625


In [20]:
grid_search.best_params_

{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}

In [21]:
classifier = SVC(kernel = 'rbf', gamma=0.7)
classifier.fit(X_train, y_train)

  return f(**kwargs)


SVC(gamma=0.7)

In [22]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [23]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [24]:
print(cm)

[[52  2]
 [ 3 23]]


In [25]:

from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)

In [26]:
print(accuracy)

0.9375


### After using the Best Parameters from GridSearch CV, we get accuracy as 93.75% !!

# RANDOMIZED SEARCH CV - FOR RANDOM FOREST

In [27]:
df = pd.read_csv("C:/Users/Ashish/Desktop/Python Tutorials/CSV files/Social_Network_Ads.csv")

In [28]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [59]:
# We will get rid of unwanted columns i.e User Id and Gender. and we will seperate the dependent and independent features

X = data.iloc[:, [2,3]]
y = data.iloc[:, 4]

In [60]:
# Splitting the data into train and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size = 0.25, random_state = 90)

In [61]:
#Scaling the independent features

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [62]:
# Using Random Forest with initial parameters

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 10, criterion = "entropy", random_state = 3)
rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=3)

In [63]:
# Making prediction

y_pred = rf.predict(X_test)

In [64]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [65]:
print(score)

0.93


In [66]:
# using Randomized Search CV

from sklearn.model_selection import RandomizedSearchCV

In [67]:
from scipy.stats import randint

In [68]:
est = RandomForestClassifier()
rf_p_dist={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':randint(1,3),
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf':randint(1,4),
              }

In [69]:
random_search = RandomizedSearchCV(est, param_distributions= rf_p_dist, n_iter= 10, scoring = "roc_auc", n_jobs = -1, cv = 10, verbose=3)

In [70]:
# Create a timer to note the time taken to perform Randomized Search CV for XGb Classifier

def timer(start_time = None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print ("\n Time Taken: %i hours %i minutes %s seconds." %(thour, tmin, round(tsec,2)))

In [71]:
from datetime import datetime

start_time = timer(None) # timer start from this point
random_search.fit(X,y)
timer(start_time) # timer ends at this point

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.3s



 Time Taken: 0 hours 0 minutes 23.1 seconds.


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.9s finished


In [72]:
random_search.best_estimator_

RandomForestClassifier(max_depth=5, max_features=2, min_samples_leaf=3,
                       n_estimators=10)

In [73]:
random_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 2,
 'min_samples_leaf': 3,
 'n_estimators': 10}

In [74]:
random_search.best_score_

0.9507860805860805

In [75]:
# Creating a Model with Best Parameters

classifier = RandomForestClassifier(criterion='entropy', max_depth=5, max_features=2, min_samples_leaf=2, n_jobs=-1)

In [76]:
# Cross validation

from sklearn.model_selection import cross_val_score
score = cross_val_score(classifier, X, y, cv = 10)

In [77]:
score

array([1.   , 0.75 , 0.925, 0.975, 1.   , 0.85 , 0.825, 0.775, 0.875,
       0.975])

In [78]:
score.mean()

0.8949999999999999

## Conclusion:

### If the number of Hyperparameters are less then go for GridsearchCV. If the number of Hyperparameters are more then go for Randomized Search CV.