In [7]:
# Import the dataset loader, model-selection helpers, classifiers, and NumPy
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

In [6]:
# Fetch the digits dataset, then split it into the feature matrix X and label vector y
digits = load_digits()
X, y = digits.data, digits.target

In [8]:
# Exploratory data analysis: dataset dimensions and number of samples per class label
num_rows, num_features = X.shape
class_counts = np.bincount(y)

In [9]:
# Report the dataset dimensions, then the per-class sample counts (one item per line)
print(
    'The number of rows in the dataset is {:d}'.format(num_rows),
    'The number of features in the dataset is {:d}'.format(num_features),
    'Class distribution in the dataset:',
    class_counts,
    sep='\n',
)

The number of rows in the dataset is 1797
The number of features in the dataset is 64
Class distribution in the dataset:
[178 182 177 183 181 182 181 179 174 180]


The class counts are nearly equal (between 174 and 183 samples per digit), so this is a <b>balanced dataset</b>.

In [12]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# 5-fold cross-validation of a one-vs-rest logistic regression on the training split
# NOTE(review): recent scikit-learn releases reportedly deprecate `multi_class` —
# confirm against the installed version before upgrading.
lr_clf = LogisticRegression(
    multi_class='ovr',
    solver='lbfgs',
    max_iter=1000,
)
lr_cv_scores = cross_val_score(estimator=lr_clf, X=X_train, y=y_train, cv=5)

In [14]:
# Show the per-fold logistic-regression scores and their mean to three decimals
lr_mean_score = lr_cv_scores.mean()
print('Accuracy scores for the 5 folds: ', lr_cv_scores)
print('Mean cross-validation score: {:.3f}'.format(lr_mean_score))

Accuracy scores for the 5 folds:  [0.96527778 0.95486111 0.94773519 0.95121951 0.91637631]
Mean cross-validation score: 0.947


In [15]:
# Cross-validation with a random forest
# The forest is seeded so the fold scores are reproducible when the notebook is
# re-run; the seed matches the one used for the train/test split.
rf_clf = RandomForestClassifier(n_estimators=24, random_state=42)
rf_cv_scores = cross_val_score(rf_clf, X_train, y_train, cv=5)

In [16]:
# Show the per-fold random-forest scores and their mean to three decimals
rf_mean_score = rf_cv_scores.mean()
print('Accuracy scores for the 5 folds: ', rf_cv_scores)
print('Mean cross-validation score: {:.3f}'.format(rf_mean_score))

Accuracy scores for the 5 folds:  [0.97222222 0.95486111 0.94773519 0.97560976 0.96515679]
Mean cross-validation score: 0.963
