<a href="https://colab.research.google.com/github/abel-keya/machine_learning-_supervised_learning_with_python/blob/master/Random_Search_Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python Programming: Random Search

## Example

In [0]:
# Loading libraries
import numpy as np
import pandas as pd
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
import seaborn as sb;

In [8]:
## Example 1
# ---
# Perform hyperparameter tuning then predict the quality of wine using Random Search. 
# ---
# Dataset url = http://bit.ly/TuningDataset
# ---
random=pd.read_csv("http://bit.ly/TuningDataset",sep=';') 
random

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [0]:
# Importing the required libraries
# ---
#
import pandas as pd
import numpy as np

In [0]:
# Importing our Dataset
# ---
#
dataset = pd.read_csv("http://bit.ly/TuningDataset", sep=';')

In [11]:
# Previewing our Dataset
# ---
#
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [0]:
# Performing Data Preprocessing
# ---
# 
X = dataset.iloc[:, 0:11].values
y = dataset.iloc[:, 11].values

In [16]:
X.shape,y.shape

((1599, 11), (1599,))

In [0]:
# Performing Data Preprocessing
# ---
# 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [0]:
# Scaling our Data
# ---
# 
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [0]:
# Training and Cross Validation
# ---
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=300, random_state=0)

In [0]:
# Training and Cross Validation
# ---
# Next, to implement cross validation, the cross_val_score method 
# of the sklearn.model_selection library can be used. 
# The cross_val_score returns the accuracy for all the folds. 
# Values for 4 parameters are required to be passed to the cross_val_score class. 
# The first parameter is estimator which basically specifies 
# the algorithm that you want to use for cross validation. 
# The second and third parameters, X and y, contain the X_train and y_train data i.e. features and labels. 
# Finally the number of folds is passed to the cv parameter as shown in the following code
# ---
# 
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)

In [22]:
# Printing the accuracies returned for five folds 
# by the cross_val_score method by calling print on all_accuracies
# ---
#
print(all_accuracies.mean())

0.6747419804741981


In [0]:
# Step 1: Hyperparameters: Getting Started with Random Search
# ---
# Random search differs from grid search in that we longer 
# provide a discrete set of values to explore for each hyperparameter; rather, 
# we provide a statistical distribution for each hyperparameter 
# from which values may be randomly sampled.
# We'll define a sampling distribution for each hyperparameter.
# specify parameters and distributions to sample from
from scipy.stats import randint as sp_randint
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [0]:
# Step 2: Instantiating RandomizedSearchCV object 
# ---
# 
from sklearn.model_selection import RandomizedSearchCV 
random_sr = RandomizedSearchCV(classifier, param_dist, cv = 5) 

In [25]:
# Step 3: Calling the fit method
# ---
#
random_sr.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [26]:
# Step 4: Checking the parameters that return the highest accuracy
# ---
#
best_parameters = random_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_split': 3}


In [27]:
# Finding the obtained accuracy
# --
# 
best_result = random_sr.best_score_
print(best_result)

# Compare this with the 

0.6847524407252441


## <font color="green">Challenges</font>

In [39]:
## Challenge 1
# ---
# Question: Implement hyperparameter tuning using random search upon creating a model to classify 
# incomes of persons given the following dataset.
# ---
# Dataset url = http://bit.ly/HyperParameterTuningDataset
# ---
income=pd.read_csv("income.csv")
income

Unnamed: 0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [33]:
income.nunique()

39                 73
 State-gov          9
 Bachelors         16
13                 16
 Never-married      7
 Adm-clerical      15
 Not-in-family      6
 White              5
 Male               2
2174              119
0                  92
40                 94
 United-States     42
 <=50K              2
dtype: int64

In [34]:
income.dtypes

39                 int64
 State-gov        object
 Bachelors        object
13                 int64
 Never-married    object
 Adm-clerical     object
 Not-in-family    object
 White            object
 Male             object
2174               int64
0                  int64
40                 int64
 United-States    object
 <=50K            object
dtype: object

In [35]:
income.columns

Index(['39', ' State-gov', ' Bachelors', '13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', '2174', '0', '40',
       ' United-States', ' <=50K'],
      dtype='object')

In [40]:
income=income.drop(['39', '13','2174', '0', '40'],axis=1)
income

Unnamed: 0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
...,...,...,...,...,...,...,...,...,...
32555,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32556,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32557,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32558,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [44]:
#create dummy variables for' Male',' <=50K'
gender = pd.get_dummies(income['Male'],drop_first=True)
#previewing the top 
gender.head()

KeyError: ignored

In [41]:
income.columns

Index([' State-gov', ' Bachelors', ' Never-married', ' Adm-clerical',
       ' Not-in-family', ' White', ' Male', ' United-States', ' <=50K'],
      dtype='object')

In [0]:
#'customerID','Contract','PaymentMethod'
# Importing label encoder
# 
from sklearn.preprocessing import LabelEncoder
categorical_features = [' State-gov', ' Bachelors', ' Never-married', ' Adm-clerical',' Not-in-family', ' White',  ' United-States']
le = LabelEncoder()
 
# Converting the variables to numerical
#
for i in range(3):
    new = le.fit_transform(churn_data3[categorical_features[i]])
    churn_data3[categorical_features[i]] = new
churn_data3.head()

In [0]:
## Challenge 2
# ---
# Perform hyperparameter tuning by applying Random search to the challenges that you worked on during Week 8.
# ---  