# Randomized Search CV

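* Benefits: faster than Grid Search CV because it evaluates a fixed number of randomly sampled parameter settings instead of every combination; appropriate to use when there are many hyperparameters to tune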

In [44]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import warnings
warnings.simplefilter("ignore", category=PendingDeprecationWarning)

In [9]:
# Seed passed as random_state to train_test_split and to the baseline
# RandomForestClassifier cells so that those steps are repeatable.
SEED = 42

In [30]:
# Load the iris dataset bundled with scikit-learn; later cells read its
# "data" (features) and "target" (labels) entries.
iris = load_iris()

In [31]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the
# partition identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    iris["data"],
    iris["target"],
    random_state=SEED,
    test_size=0.2,
)
# Cell output: the shapes of the four arrays, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

#### Train Base Model

In [20]:
# Baseline: a random forest with library-default hyperparameters, seeded for
# repeatability and trained on all available cores (n_jobs=-1).
rf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
# Left as the last expression so the notebook displays the fitted estimator.
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [22]:
# 10-fold cross-validated accuracy of the baseline forest on the training split.
scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=10)
# Cell output: (mean accuracy, standard deviation across folds).
scores.mean(), scores.std()

(0.9248834498834497, 0.1146157810557605)

#### Scale Features

In [25]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no test-set
# statistics leak into the scaler.
# NOTE(review): the recorded CV scores before and after this step are
# identical, which suggests scaling has no effect on the forest here - confirm
# whether this step is needed.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [34]:
# Rebuild the same seeded baseline forest and train it on the current
# X_train (standardized, if the scaling cell above has been run).
rf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
rf.fit(X_train, y_train)

# 10-fold cross-validated accuracy; the cell displays (mean, std).
scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=10)
scores.mean(), scores.std()



(0.9248834498834497, 0.1146157810557605)

#### Optimize Hyperparameters

In [41]:
# Base estimator handed to the search. It is seeded like the baseline cells
# so that every candidate forest is deterministic for a given parameter set.
model = RandomForestClassifier(n_jobs=-1, random_state=SEED)

# Search space. Lists are sampled uniformly; scipy's randint(low, high) draws
# integers from the half-open interval [low, high), i.e. `high` is excluded.
params = {
    "max_depth": [3, 5, 10, None],
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": randint(1, 3),      # yields 1 or 2
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False],
    "min_samples_leaf": randint(1, 4),  # yields 1, 2 or 3
}

# Evaluate 50 randomly drawn parameter settings, each with 4-fold CV.
# random_state fixes which settings are drawn, so best_params_ and
# best_score_ are repeatable from run to run.
rf_optimal = RandomizedSearchCV(model,
                            param_distributions=params,
                            n_jobs=-1,
                            n_iter=50,
                            cv=4,
                            random_state=SEED)

In [42]:
# Run the randomized search on the full training split; this is what
# populates best_params_ / best_score_ on rf_optimal.
rf_optimal.fit(X_train, y_train)
# Passing the search object itself to cross_val_score clones it and repeats
# the whole search inside each of the 10 outer folds (nested CV), so this
# line is much more expensive than the single fit above.
scores = cross_val_score(rf_optimal, X_train, y_train, scoring="accuracy", cv=10)
# Cell output: (mean accuracy, standard deviation across the outer folds).
scores.mean(), scores.std()



(0.9332167832167831, 0.10414918675773642)

In [43]:
# Display the best parameter setting found by the search and its mean
# cross-validated score (from the search's own cv=4 folds).
rf_optimal.best_params_, rf_optimal.best_score_

({'bootstrap': True,
  'criterion': 'gini',
  'max_depth': 3,
  'max_features': 2,
  'min_samples_leaf': 2,
  'n_estimators': 200},
 0.95)