In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('mnist_small.csv', sep=',', header=None)

In [3]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=42, stratify=y)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [6]:
svm_clf = SVC(kernel='poly', gamma=0.1, C=0.1)
lr_clf = SGDClassifier(loss='log')
rf_clf = RandomForestClassifier()

In [7]:
res = cross_val_score(lr_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"Average Accuracy: \t {np.mean(res):.4}")
print(f"Accuracy SD: \t\t {np.std(res):.4}")

Average Accuracy: 	 0.8982
Accuracy SD: 		 0.004238


In [8]:
y_train_pred = cross_val_predict(lr_clf, X_train, y_train, cv=5, n_jobs=-1)

In [9]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1400
           1       0.96      0.95      0.96      1400
           2       0.89      0.86      0.87      1400
           3       0.90      0.85      0.87      1400
           4       0.93      0.91      0.92      1400
           5       0.88      0.85      0.86      1400
           6       0.94      0.93      0.94      1400
           7       0.93      0.90      0.92      1400
           8       0.76      0.92      0.83      1400
           9       0.87      0.87      0.87      1400

    accuracy                           0.90     14000
   macro avg       0.90      0.90      0.90     14000
weighted avg       0.90      0.90      0.90     14000



In [10]:
res = cross_val_score(rf_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"Average Accuracy: \t {np.mean(res):.4}")
print(f"Accuracy SD: \t\t {np.std(res):.4}")

Average Accuracy: 	 0.9106
Accuracy SD: 		 0.002539


In [11]:
y_train_pred = cross_val_predict(rf_clf, X_train, y_train, cv=5, n_jobs=-1)

In [12]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1400
           1       0.94      0.98      0.96      1400
           2       0.87      0.90      0.88      1400
           3       0.88      0.88      0.88      1400
           4       0.89      0.93      0.91      1400
           5       0.91      0.88      0.90      1400
           6       0.95      0.94      0.95      1400
           7       0.95      0.91      0.93      1400
           8       0.91      0.84      0.87      1400
           9       0.89      0.88      0.88      1400

    accuracy                           0.91     14000
   macro avg       0.91      0.91      0.91     14000
weighted avg       0.91      0.91      0.91     14000



In [13]:
res = cross_val_score(svm_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"Average Accuracy: \t {np.mean(res):.4}")
print(f"Accuracy SD: \t\t {np.std(res):.4}")

Average Accuracy: 	 0.9607
Accuracy SD: 		 0.002045


In [14]:
y_train_pred = cross_val_predict(svm_clf, X_train, y_train, cv=5, n_jobs=-1)

In [15]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1400
           1       0.99      0.99      0.99      1400
           2       0.95      0.94      0.95      1400
           3       0.96      0.94      0.95      1400
           4       0.95      0.97      0.96      1400
           5       0.97      0.96      0.96      1400
           6       0.98      0.97      0.97      1400
           7       0.97      0.95      0.96      1400
           8       0.92      0.96      0.94      1400
           9       0.94      0.94      0.94      1400

    accuracy                           0.96     14000
   macro avg       0.96      0.96      0.96     14000
weighted avg       0.96      0.96      0.96     14000



In [16]:
print(confusion_matrix(y_train, y_train_pred))

[[1379    0    3    1    4    1    3    1    7    1]
 [   0 1380    8    2    2    0    0    2    5    1]
 [   6    5 1323   11    5    2    6    8   28    6]
 [   1    1   16 1312    2   19    3    9   28    9]
 [   0    2    5    0 1359    1    4    0    5   24]
 [   2    1    4   19    3 1338   12    0   15    6]
 [   6    2    4    0    7    8 1363    1    8    1]
 [   2    4    9    1   19    0    0 1335    5   25]
 [   2    4    8   10    4   15    4    3 1340   10]
 [   4    2   11   10   21    2    1   16   12 1321]]


In [19]:
svm_clf = SVC(kernel='poly', gamma=0.1, C=0.1)
lr_clf = SGDClassifier(loss='log')
rf_clf = RandomForestClassifier()

In [20]:
voting_clf = VotingClassifier(
                estimators=[('lr', lr_clf), ('rf', rf_clf), ('svc', svm_clf)],
                voting='hard'
)

In [21]:
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              SGDClassifier(alpha=0.0001, average=False,
                                            class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal', loss='log',
                                            max_iter=1000, n_iter_no_change=5,
                                            n_jobs=None, penalty='l2',
                                            power_t=0.5, random_state=None,
                                            shuffle=True, tol=0.001,
                                            validation_fraction=0.1, verbose=0,
                                            w...
                                                     n_jobs=None,
                                             

In [22]:
res = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"Average Accuracy: \t {np.mean(res):.4}")
print(f"Accuracy SD: \t\t {np.std(res):.4}")

Average Accuracy: 	 0.9447
Accuracy SD: 		 0.002363


In [23]:
y_train_pred = cross_val_predict(voting_clf, X_train, y_train, cv=5, n_jobs=-1)

In [24]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1400
           1       0.96      0.98      0.97      1400
           2       0.91      0.94      0.93      1400
           3       0.93      0.91      0.92      1400
           4       0.93      0.96      0.94      1400
           5       0.95      0.93      0.94      1400
           6       0.97      0.96      0.97      1400
           7       0.97      0.93      0.95      1400
           8       0.92      0.93      0.93      1400
           9       0.94      0.91      0.93      1400

    accuracy                           0.94     14000
   macro avg       0.94      0.94      0.94     14000
weighted avg       0.94      0.94      0.94     14000



In [25]:
print(confusion_matrix(y_train, y_train_pred))

[[1377    0    1    0    4    2    4    1   10    1]
 [   0 1375    8    2    1    3    1    1    6    3]
 [  10   12 1315    8   10    2    6    7   26    4]
 [   3    7   38 1278    2   25    2   13   27    5]
 [   3    6   11    4 1338    0    6    2    5   25]
 [  15    4   10   33    5 1302   11    0   13    7]
 [  10    4    7    3    8   12 1347    1    8    0]
 [   5    8   22    6   21    0    0 1308    5   25]
 [  10    9   12   16   13   13    5    3 1308   11]
 [  13    5   15   28   30    5    1   12   13 1278]]


In [26]:
y_test_pred = voting_clf.predict(X_test)

In [27]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       600
           1       0.97      0.98      0.97       600
           2       0.92      0.96      0.94       600
           3       0.93      0.91      0.92       600
           4       0.94      0.95      0.95       600
           5       0.96      0.94      0.95       600
           6       0.98      0.95      0.97       600
           7       0.97      0.95      0.96       600
           8       0.94      0.94      0.94       600
           9       0.95      0.93      0.94       600

    accuracy                           0.95      6000
   macro avg       0.95      0.95      0.95      6000
weighted avg       0.95      0.95      0.95      6000



In [28]:
print(confusion_matrix(y_test, y_test_pred))

[[597   0   1   0   0   0   1   0   1   0]
 [  0 589   5   2   2   1   0   0   1   0]
 [  6   1 579   3   2   0   2   1   5   1]
 [  4   1  17 548   1   6   0   6  12   5]
 [  1   3   5   1 571   1   4   0   4  10]
 [  3   1   5  21   5 561   0   1   2   1]
 [  5   2   5   0   5   6 571   1   5   0]
 [  1   4   6   2   6   1   0 572   0   8]
 [  4   5   6   4   3  10   3   1 561   3]
 [  4   3   2   9  13   1   0   6   3 559]]
