In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint

# Single seed shared by the train/test split and the randomized search, so that
# Restart Kernel -> Run All reproduces the same split and the same sampled candidates
SEED = 42

# Load a sample dataset
data = load_iris()
X = data.data
y = data.target

# Split the dataset into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Define the model
model = SVC()

# Define the hyperparameters and the distributions to sample from.
# NOTE: scipy.stats.uniform(loc, scale) is uniform on [loc, loc + scale]; the second
# argument is the WIDTH of the interval, not its upper bound.
# NOTE: SVC ignores 'gamma' for the linear kernel and 'degree' for every kernel except
# 'poly', so some sampled candidates differ only in parameters that have no effect.
param_distributions = {
    'C': uniform(0.1, 100),        # Uniform distribution on [0.1, 100.1]
    'gamma': uniform(0.001, 1),    # Uniform distribution on [0.001, 1.001]
    'kernel': ['linear', 'rbf', 'poly'],  # Fixed list of options, sampled uniformly
    'degree': randint(2, 5)        # Random integer in {2, 3, 4} (upper bound is exclusive)
}

# Set up the Randomized Search: 50 sampled candidates, each scored by
# 5-fold cross-validated accuracy, with n_jobs=-1 to use all available processors
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=SEED,
)

# Perform the Randomized Search
random_search.fit(X_train, y_train)

# Find the best parameters and the mean cross-validation score they achieved
best_params_rs = random_search.best_params_
best_score_rs = random_search.best_score_

print(f"RandomizedSearchCV Best Parameters: {best_params_rs}")
print(f"RandomizedSearchCV Best Cross-Validation Score: {best_score_rs:.4f}")

# Evaluate the best estimator on the held-out test set, which the search never saw
best_model_rs = random_search.best_estimator_
y_pred_rs = best_model_rs.predict(X_test)
test_score_rs = accuracy_score(y_test, y_pred_rs)

print(f"RandomizedSearchCV Test Set Accuracy: {test_score_rs:.4f}")


RandomizedSearchCV Best Parameters: {'C': 4.7450412719997725, 'degree': 4, 'gamma': 0.6813075385877797, 'kernel': 'linear'}
RandomizedSearchCV Best Cross-Validation Score: 0.9667
RandomizedSearchCV Test Set Accuracy: 0.9667


In [2]:
# Collect the per-candidate cross-validation results into a DataFrame
results_df = pd.DataFrame(random_search.cv_results_)

# Columns worth inspecting: scoring time, the sampled hyperparameters,
# and the cross-validation score summary with its rank
summary_columns = [
    'mean_score_time',
    'param_C', 'param_degree', 'param_gamma', 'param_kernel',
    'mean_test_score', 'std_test_score', 'rank_test_score',
]

# Show the selected columns in the order the candidates appear in cv_results_.
# Tip: chain .sort_values(by='rank_test_score') to list the best candidates first.
display(results_df[summary_columns])

Unnamed: 0,mean_score_time,param_C,param_degree,param_gamma,param_kernel,mean_test_score,std_test_score,rank_test_score
0,0.007139,37.554012,2,0.184435,linear,0.95,0.061237,6
1,0.003006,59.785016,3,0.156995,poly,0.941667,0.056519,24
2,0.004574,46.024889,2,0.602115,poly,0.95,0.061237,6
3,0.002604,2.158449,3,0.722999,rbf,0.95,0.061237,6
4,0.005164,21.333911,2,0.618482,rbf,0.941667,0.056519,40
5,0.003866,52.575643,2,0.292229,poly,0.958333,0.06455,2
6,0.010254,40.086097,4,0.45707,poly,0.941667,0.062361,24
7,0.009406,61.938601,4,0.984231,linear,0.95,0.061237,6
8,0.002659,4.745041,4,0.681308,linear,0.966667,0.066667,1
9,0.004152,6.605159,2,0.966632,rbf,0.941667,0.056519,40


In [3]:
# List every column available in the cross-validation results table
results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_degree', 'param_gamma', 'param_kernel', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [4]:
# (rows, columns) of the results table
results_df.shape

(50, 17)



### Explanation:

1. **param_distributions:**
   - Instead of specifying a fixed grid of hyperparameters like in `GridSearchCV`, we define a distribution or list from which random values will be sampled.
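   - `uniform`: This represents a continuous uniform distribution. SciPy parameterizes it as `uniform(loc, scale)`, which covers the interval `[loc, loc + scale]`. For example, `uniform(0.1, 100)` samples a random float between 0.1 and 100.1 (use `uniform(0.1, 99.9)` for an upper bound of exactly 100).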
   - `randint`: This represents a discrete uniform distribution whose upper bound is exclusive. For example, `randint(2, 5)` samples an integer from 2, 3, or 4.

2. **n_iter:**
   - This parameter specifies the number of parameter settings that are sampled. For example, `n_iter=50` means that 50 random combinations will be evaluated.

3. **Random State:**
   - `random_state=42` ensures that the random sampling is reproducible.

4. **best_params_ and best_score_:**
   - `best_params_` gives the best set of hyperparameters found by the search.
   - `best_score_` is the mean cross-validation score (here, accuracy averaged over the 5 folds) achieved by those hyperparameters.

5. **Test Set Evaluation:**
   - After finding the best model using `RandomizedSearchCV`, we evaluate its performance on the held-out test set to estimate how well it generalizes to unseen data.

### When to Use GridSearchCV vs. RandomizedSearchCV:

- **GridSearchCV**: Use when you have a relatively small hyperparameter space and want to test all combinations exhaustively.
- **RandomizedSearchCV**: Use when the hyperparameter space is large, and you want to explore a broad range of values with a fixed computational budget.

### Advantages of RandomizedSearchCV:

- **Efficiency**: Faster when searching large hyperparameter spaces.
- **Flexibility**: Allows using distributions for hyperparameters, enabling the exploration of a wider range of values.

In practice, `RandomizedSearchCV` can often find good or even optimal hyperparameter settings with much less computational effort than `GridSearchCV`.