## Example of ML on Images: Classifying Handwritten Digits

This extends the logistic regression example, and we'll abbreviate some of the code/cells at the beginning.

In [None]:
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.model_selection
import sklearn.metrics
import sklearn.preprocessing

In [None]:
# Load the handwritten-digits dataset and separate features from labels.
d = sklearn.datasets.load_digits()
x, y = d.data, d.target

# Hold out 20% of the samples for testing; stratifying on y keeps the
# digit-class proportions the same in both splits.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = sklearn.preprocessing.StandardScaler().fit(x_train)
x_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Logistic Regression

In [None]:
import sklearn.linear_model
# Logistic regression classifier; no arguments are passed, so every
# hyperparameter keeps scikit-learn's default value.
lr_classifier = sklearn.linear_model.LogisticRegression()

In [None]:
# Train on the standardized training features.
lr_classifier.fit(X=x_scaled, y=y_train)

In [None]:
# Predict on the test features scaled with the training-set scaler.
y_pred = lr_classifier.predict(X=x_test_scaled)

In [None]:
# Fraction of test digits the logistic regression labelled correctly.
print(f"Accuracy: {sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred):.2%}")

In [None]:
# Confusion matrix for the logistic regression predictions:
# rows are the true digits, columns are the predicted digits.
cm = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so its result is reproducible from run to run; 42 matches the
# seed already used for the train/test split and the grid search in this notebook.
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# The tree is trained on the unscaled training features.
tree_clf.fit(X=x_train, y=y_train)

In [None]:
# Class labels the fitted tree learned from y_train.
tree_clf.classes_

In [None]:
# Predict on the unscaled test features (the tree was fitted on unscaled data).
y_pred = tree_clf.predict(X=x_test)

In [None]:
# Confusion matrix for the decision tree predictions:
# rows are the true digits, columns are the predicted digits.
cm = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Fraction of test digits the decision tree labelled correctly.
print(f"Accuracy: {sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred):.2%}")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so its accuracy and feature importances are reproducible;
# 42 matches the seed used for the train/test split and the grid search below.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# The forest is trained on the unscaled training features.
rf_clf.fit(X=x_train, y=y_train)

In [None]:
# Class labels the fitted forest learned from y_train.
rf_clf.classes_

In [None]:
# Predict on the unscaled test features (the forest was fitted on unscaled data).
y_pred = rf_clf.predict(X=x_test)

In [None]:
# Confusion matrix for the random forest predictions:
# rows are the true digits, columns are the predicted digits.
cm = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Test accuracy of the three models, one per line. Each line names its model so
# the numbers can be told apart. Logistic regression is scored on the scaled
# test set it was trained for; the tree models use the unscaled features.
print(f"Logistic regression accuracy: {sklearn.metrics.accuracy_score(y_test, lr_classifier.predict(x_test_scaled)):.2%}")
print(f"Decision tree accuracy: {sklearn.metrics.accuracy_score(y_test, tree_clf.predict(x_test)):.2%}")
print(f"Random forest accuracy: {sklearn.metrics.accuracy_score(y_test, rf_clf.predict(x_test)):.2%}")

Can we improve the Random Forest accuracy?

Actually, what parameters does it currently have?

In [None]:
# List the forest's current hyperparameter settings before choosing which to tune.
rf_clf.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over 3 x 3 x 3 = 27 hyperparameter combinations for a
# seeded random forest, cross-validated on the training split only.
cv_grid = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
    param_grid=dict(
        max_depth=[None, 10, 20],
        n_estimators=[50, 100, 200],
        max_leaf_nodes=[None, 5, 10],
    ),
)
cv_grid.fit(X=x_train, y=y_train)

# Display the winning combination.
cv_grid.best_params_

In [None]:
# Predict on the test set through the fitted grid search object.
y_pred = cv_grid.predict(X=x_test)

In [None]:
# Confusion matrix for the grid-searched forest's predictions:
# rows are the true digits, columns are the predicted digits.
cm = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Fraction of test digits the grid-searched forest labelled correctly.
print(f"Accuracy: {sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred):.2%}")

In [None]:
# Per-feature importances of rf_clf, the forest fitted before the grid search.
# NOTE(review): if the tuned model's importances are wanted instead, read them
# from cv_grid.best_estimator_ - confirm which model is intended here.
rf_clf.feature_importances_

In [None]:
# Arrange the 64 per-feature importances as an 8x8 grid, so each cell lines up
# with one pixel position of the digit images, and draw it as a heat map.
plt.imshow(
    rf_clf.feature_importances_.reshape((8, 8)),
    cmap='binary',
)