## 超參數調參 武功秘笈

- Gridsearch
- RandomGridSearch


https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification


In [2]:
import pandas as pd
import numpy as np


from sklearn import metrics
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


## load dataset

In [3]:
df = pd.read_csv("./mobile_price_data/train.csv")
X = df.drop("price_range", axis=1).values
y = df["price_range"].values


# Gridsearch

- 所有的調參過程都會在 k fold 中進行完畢

In [3]:
classifier = RandomForestClassifier(n_jobs=-1)

# Define the grid of parameters
param_grid = {
    "n_estimators": np.arange(100, 1000, 200),
    "criterion": ["gini", "entropy"],
    "max_depth": np.arange(1, 20, 2)
}

# Randomized search for hyperparameter optimization
model = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring="accuracy",
    verbose=0,
    n_jobs=1,
    cv=5
)

model.fit(X, y)

print("Best score: ", model.best_score_)
print("\n")
print("Best estimator: ", model.best_estimator_)
print("\n")
print("Best parameters: ", model.best_estimator_.get_params())
print("\n")
print("Best results: ", model.cv_results_)



Best score:  0.8905000000000001


Best estimator:  RandomForestClassifier(criterion='entropy', max_depth=13, n_estimators=300,
                       n_jobs=-1)


Best parameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 13, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Best results:  {'mean_fit_time': array([0.07428293, 0.19860992, 0.32717023, 0.44811435, 0.5755271 ,
       0.07203279, 0.20195169, 0.32212343, 0.45990591, 0.57876601,
       0.07289543, 0.20255094, 0.32778492, 0.45078311, 0.58286085,
       0.07675257, 0.20137773, 0.32923684, 0.458673  , 0.59851489,
       0.07774601, 0.20438209, 0.3338706 , 0.46279449, 0.59445734,
       0.07812047, 0.20391335, 0.3

## RandomizedSearchCV + pipeline

In [4]:
classifier = Pipeline(
    [ 
        ("scaling", StandardScaler()), # scaling the data
        ("pca", PCA()), # PCA
        ("rf", RandomForestClassifier(n_jobs=-1)) # Random Forest
     ]
)

In [5]:
# Define the grid of parameters
param_grid = {
    "pca__n_components": np.arange(5, 10),
    "rf__n_estimators": np.arange(100, 1500, 200),
    "rf__criterion": ["gini", "entropy"],
    "rf__max_depth": np.arange(1, 20 , 2)
}

# Randomized search for hyperparameter optimization
model = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=5, # Number of iterations to try
    scoring="accuracy",
    verbose=5,
    n_jobs=1,
    cv=5
)

# Fit the model
model.fit(X, y)



Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END pca__n_components=9, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=300;, score=0.398 total time=   0.2s
[CV 2/5] END pca__n_components=9, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=300;, score=0.460 total time=   0.2s
[CV 3/5] END pca__n_components=9, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=300;, score=0.472 total time=   0.2s
[CV 4/5] END pca__n_components=9, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=300;, score=0.515 total time=   0.2s
[CV 5/5] END pca__n_components=9, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=300;, score=0.410 total time=   0.2s
[CV 1/5] END pca__n_components=5, rf__criterion=entropy, rf__max_depth=3, rf__n_estimators=900;, score=0.352 total time=   0.6s
[CV 2/5] END pca__n_components=5, rf__criterion=entropy, rf__max_depth=3, rf__n_estimators=900;, score=0.335 total time=   0.6s
[CV 3/5] END pca__n_components=5, rf__criterion=entrop

In [6]:
print("Best score: ", model.best_score_)
print("\n")
print("Best estimator: ", model.best_estimator_)
print("\n")
print("Best parameters: ", model.best_estimator_.get_params())
print("\n")
print("Best results: ", model.cv_results_)

Best score:  0.462


Best estimator:  Pipeline(steps=[('scaling', StandardScaler()), ('pca', PCA(n_components=9)),
                ('rf',
                 RandomForestClassifier(max_depth=5, n_estimators=1100,
                                        n_jobs=-1))])


Best parameters:  {'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=9)), ('rf', RandomForestClassifier(max_depth=5, n_estimators=1100, n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=9), 'rf': RandomForestClassifier(max_depth=5, n_estimators=1100, n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 9, 'pca__n_oversamples': 10, 'pca__power_iteration_normalizer': 'auto', 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'g

## Making the Confusion Matrix

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = model.predict(X)
cm = confusion_matrix(y, y_pred)
print(cm)
accuracy_score(y, y_pred)

[[380  43  31  46]
 [155 182  53 110]
 [106  50 195 149]
 [ 27  43  35 395]]


0.576