In [11]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils.fixes import loguniform

In [12]:
# Load the scikit-learn handwritten-digits dataset as a (features, labels) pair.
X, y = datasets.load_digits(return_X_y=True)
# Show both shapes as the cell output; the recorded run has 1797 samples
# with 64 features each.
X.shape, y.shape

((1797, 64), (1797,))

In [13]:
# Hold out 20% of the samples as a test set that is never used for model
# selection; the fixed seed keeps the split identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Two-step pipeline: KMeans is used as a non-linear feature transformer (its
# transform step maps each sample to its distances from the cluster centres),
# and the resulting features feed an RBF-kernel SVC.
# n_clusters=10 and C=1.0 are only starting values; the hyperparameter search
# overrides them through the "kmeans__n_clusters" and "SVC__C" keys.
# random_state pins the KMeans centroid initialisation (same seed as the
# train/test split) so the derived features, and therefore every score, are
# reproducible on a fresh kernel.
model = Pipeline([
    ("kmeans", KMeans(n_clusters=10, random_state=0)),
    ("SVC", SVC(C=1.0, kernel="rbf"))
])

In [24]:
# Sampling distributions for the randomized search, keyed as
# "<pipeline step>__<parameter>":
#   - number of KMeans clusters: integer drawn from randint(low=50, high=100)
#   - SVC regularisation strength C: log-uniform between 0.1 and 100
# NOTE(review): loguniform is imported from sklearn.utils.fixes, a
# compatibility module; scipy.stats.loguniform looks like the public
# equivalent — confirm against the installed versions.
params = dict(
    kmeans__n_clusters=randint(low=50, high=100),
    SVC__C=loguniform(a=0.1, b=100),
)

In [25]:
# Randomized hyperparameter search: draw 10 settings from `params` and score
# each one with 5-fold cross-validation.
# random_state seeds the parameter sampling so the same 10 candidates are
# drawn on every run, keeping the search reproducible.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=10,
    cv=5,
    random_state=0,
)

In [26]:
# Run the randomized search on the training split only; the test split is
# not touched here. The object returned by fit() is kept as `CV` and is
# inspected in the cells that follow.
CV = search.fit(X_train, y_train)

In [27]:
# Hyperparameter setting that obtained the best cross-validation score.
CV.best_params_

{'SVC__C': 47.05109438125532, 'kmeans__n_clusters': 61}

In [28]:
# Mean cross-validated score of the best setting (measured on the training
# split's folds, not on the held-out test set).
CV.best_score_

0.9881702477739063

In [29]:
# Final check: score the pipeline selected by the search on the held-out
# test split, which played no part in hyperparameter selection.
# The search does not set `refit`, so this presumably relies on the default
# refit behaviour to make best_estimator_ available — confirm if that
# argument is ever changed.
selected_model = CV.best_estimator_
selected_model.score(X_test, y_test)

0.9777777777777777