# Load Data

In [1]:
from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)

# Split into Train and Test

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)

# Train Model - Logistical Regression

In [3]:
from sklearn.linear_model import LogisticRegression

#OVR = one versus the rest is best since we have a binary problem compared to multiclass. 
log_reg = LogisticRegression(random_state=42, verbose=0, multi_class="ovr",solver="lbfgs")
log_reg.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Evaluate Accuracy

In [4]:
log_reg.score(X_test, y_test)

0.9644444444444444

# Now Compares this to a model that has k-means clustering

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

log_pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=50, n_jobs=-1, random_state=42)),
    ('log_reg', LogisticRegression(random_state=42,solver="lbfgs", verbose=0, multi_class="ovr", max_iter=5000))
])

log_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=50, n_init=10, n_jobs=-1,
                        precompute_distances='auto', random_state=42,
                        tol=0.0001, verbose=0)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=5000,
                                    multi_class='ovr', n_jobs=None,
                                    penalty='l2', random_state=42,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [6]:
log_pipeline.score(X_test, y_test)

0.98

##### 96.4% to 98% ! Reduced the error rate from 3.6% to 2%. Just by using PCA, without optimising the of k. So let's try that with grid search
(FYI, another way to think about it is, that we reduced the error rate by 45%.

# Fine Tune Model

In [7]:
from sklearn.model_selection import GridSearchCV

#Search for all values for the best k value - from 2 to 100 first. 
#double underscore implies that we only want this parameter to be set with that specific class.
params_grid = dict(kmeans__n_clusters= range(50, 105))

grid_clf = GridSearchCV(log_pipeline, params_grid, cv=3, verbose=2, return_train_score= 'warn')

grid_clf.fit(X_train, y_train)


Fitting 3 folds for each of 55 candidates, totalling 165 fits
[CV] kmeans__n_clusters=50 ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................ kmeans__n_clusters=50, total=   8.7s
[CV] kmeans__n_clusters=50 ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.6s remaining:    0.0s


[CV] ............................ kmeans__n_clusters=50, total=   9.5s
[CV] kmeans__n_clusters=50 ...........................................
[CV] ............................ kmeans__n_clusters=50, total=   9.4s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   7.3s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   8.4s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   9.2s
[CV] kmeans__n_clusters=52 ...........................................
[CV] ............................ kmeans__n_clusters=52, total=   9.0s
[CV] kmeans__n_clusters=52 ...........................................
[CV] ............................ kmeans__n_clusters=52, total=   7.7s
[CV] kmeans__n_clusters=52 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=69, total=  10.2s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   8.6s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   9.3s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   9.6s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=   8.7s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=  10.8s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=  10.5s
[CV] kmeans__n_clusters=72 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=89, total=  10.6s
[CV] kmeans__n_clusters=89 ...........................................
[CV] ............................ kmeans__n_clusters=89, total=  10.8s
[CV] kmeans__n_clusters=89 ...........................................
[CV] ............................ kmeans__n_clusters=89, total=   9.7s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=   9.4s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=  10.6s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=   8.1s
[CV] kmeans__n_clusters=91 ...........................................
[CV] ............................ kmeans__n_clusters=91, total=  10.2s
[CV] kmeans__n_clusters=91 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed: 25.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kmeans',
                                        KMeans(algorithm='auto', copy_x=True,
                                               init='k-means++', max_iter=300,
                                               n_clusters=50, n_init=10,
                                               n_jobs=-1,
                                               precompute_distances='auto',
                                               random_state=42, tol=0.0001,
                                               verbose=0)),
                                       ('log_reg',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                        

In [8]:
grid_clf.best_params_

{'kmeans__n_clusters': 57}

In [9]:
grid_clf.score(X_test, y_test)

0.98

### By using GridSearchCV we were not able to increase the score but 98% is good score to have. The next thing we can try to experiement with are Support Vector Machine Classifiers. But I am happy with 98% overall. 