# CLustering for Preprocessing

In [23]:
#importing the libraries
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

In [21]:
# Load the data
X_digits, y_digits = load_digits(return_X_y=True)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

# Train a logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model
model.fit(X_train, y_train)

# Evaluate the model
model.score(X_test, y_test)

0.9755555555555555

## Pipeline

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Create a pipeline
pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=50)),
    ("logistic", LogisticRegression(random_state=42, max_iter=1000))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Evaluate the model
pipeline.score(X_test, y_test)

0.98

## Cross Validation

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(kmeans__n_clusters=range(2, 10))
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)


grid_clf.best_params_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ...............................kmeans__n_clusters=2; total time=   0.9s
[CV] END ...............................kmeans__n_clusters=2; total time=   0.6s
[CV] END ...............................kmeans__n_clusters=2; total time=   0.6s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.9s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.7s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.6s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.5s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.6s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.7s
[CV] END ...............................kmeans__n_clusters=5; total time=   0.9s
[CV] END ...............................kmeans__n_clusters=5; total time=   0.7s
[CV] END ...............................kmeans__n

{'kmeans__n_clusters': 9}

In [32]:
grid_clf.score(X_test, y_test)

0.9266666666666666