# Exercise 2

Topic of the exercise is classification and using PCA for dimensinality reduction. First preprocess data, and then fit three models to it:

* LogisticRegression
* LogisticRegression with data whose dimensionality is reduced with PCA
* Random Forest


In [1]:
from nose.tools import *
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier

### Fetch the MNIST data set

MNIST data set consists of images of handwritten digits and corresponding labels (indicating the true number represented by the digit).

In [2]:
from sklearn.datasets import fetch_openml
x, y = fetch_openml('mnist_784', version=1, return_X_y=True)

### Scale data and split into training and test

In [3]:

scaler = StandardScaler().fit(x)
scaled_x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.4)


## Try out both LogisticRegression and RandomForest models

### LogisticRegression (1p)

In [4]:


lrmodel = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', n_jobs=4)
lrmodel.fit(x_train, y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=4, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

### Confusion matrix and findings for LogisticRegression (1p)

In [5]:
# Print accuracies and confusion matrices for training and test sets. What is your conclusion about
# the model? Are there some digits that are harder to classify than others?
print("training accuracy", lrmodel.score(x_train, y_train))
print(confusion_matrix(lrmodel.predict(x_train), y_train))
print('---------------------------------------------------------------')
print("validation accuracy", lrmodel.score(x_test, y_test))
print(confusion_matrix(lrmodel.predict(x_test), y_test))
#

training accuracy 0.9231428571428572
[[4085    1   36   16    8   39   25   14   30   28]
 [   2 4530   43   30   27   33   19   37  130   21]
 [   9   22 3824   78   15   12   30   55   38   10]
 [  12   14   52 3769    7  109    1   15   89   55]
 [   7    5   61   10 3833   41   36   39   32  127]
 [  23   20    8  130    4 3339   47    5  110   25]
 [  30    7   48   14   32   74 3969    4   33    3]
 [  10    8   62   47   14   20    5 4101   24  126]
 [  21   52   85   63   22   81   18    6 3515   21]
 [   3    6   15   35  122   32    1  137   55 3807]]
---------------------------------------------------------------
validation accuracy 0.9171428571428571
[[2620    0   24   17    6   39   20   10   25   12]
 [   1 3135   35   20   11   21   10   25   96   16]
 [   9   11 2441   65   20   15   15   22   28   11]
 [   6    6   42 2611    4   78    1   12   74   38]
 [   7    3   33    5 2560   33   21   37   20   91]
 [  16   13   13  100    3 2196   25    4   79   11]
 [  17    3

### LogisticRegression with PCA (1p)

Reduce the dimensionality of both training and test sets to 30 and train a logistic regression model on the transformed data set.

In [9]:
# Get the PCA data.
pca_model = PCA(n_components=30).fit(x)
pca_data = pca_model.transform(x)


# Split the data.
x_train, x_test, y_train, y_test = train_test_split(
    pca_data,
    y,
    test_size=0.4
)

# Fit the logistic regression model.
lr_model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', n_jobs=4).fit(x_train, y_train)

# Get predictions and their confusion matrix.
print("training accuracy", lr_model.score(x_train, y_train))
print(confusion_matrix(lr_model.predict(x_train), y_train))
print('---------------------------------------------------------------')
print("validation accuracy", lr_model.score(x_test, y_test))
print(confusion_matrix(lr_model.predict(x_test), y_test))



training accuracy 0.8828095238095238
[[3985    1   50   38   11  104   45   18   57   42]
 [   3 4554   68   26   31   46   18   52  169   31]
 [  18   25 3526   91   27   40   47   82   77   40]
 [  13   16   83 3663    1  181    3   12  165   61]
 [  11    4   89   10 3709   89   59   61   44  196]
 [  59   19   17  205    7 2939   77   10  128   59]
 [  43    6  110   31   54  110 3835    1   56    2]
 [  13   13   95   61   20   36    4 3953   45  185]
 [  21   68   99   87   26  119   21   10 3327   34]
 [   8    2   49   71  200   59    2  133   92 3587]]
---------------------------------------------------------------
validation accuracy 0.8836071428571428
[[2620    0   25   24    8   77   28   22   33   25]
 [   0 3061   50   26   22   28   15   39  113   16]
 [  12   20 2414   73   14   24   27   43   46   25]
 [   8    5   44 2400    6  126    3    4  114   30]
 [   5    1   57    6 2485   46   40   48   29  168]
 [  51   14   16  152    7 2086   43    5  104   33]
 [  14    4

### RandomForest (1p)

In [10]:
rfmodel = RandomForestClassifier()#
rfmodel.fit(x_train, y_train)#

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Confusion matrix and findings for RandomForest (1p)

In [11]:
# Print accuracies and confusion matrices for training and test sets. What is your conclusion about
# the model? Are there some digits that are harder to classify than others?

#
print("training accuracy", rfmodel.score(x_train, y_train))
print(confusion_matrix(rfmodel.predict(x_train), y_train))
print('---------------------------------------------------------------')
print("validation accuracy", rfmodel.score(x_test, y_test))
print(confusion_matrix(rfmodel.predict(x_test), y_test))

training accuracy 1.0
[[4174    0    0    0    0    0    0    0    0    0]
 [   0 4708    0    0    0    0    0    0    0    0]
 [   0    0 4186    0    0    0    0    0    0    0]
 [   0    0    0 4283    0    0    0    0    0    0]
 [   0    0    0    0 4086    0    0    0    0    0]
 [   0    0    0    0    0 3723    0    0    0    0]
 [   0    0    0    0    0    0 4111    0    0    0]
 [   0    0    0    0    0    0    0 4332    0    0]
 [   0    0    0    0    0    0    0    0 4160    0]
 [   0    0    0    0    0    0    0    0    0 4237]]
---------------------------------------------------------------
validation accuracy 0.9505357142857143
[[2693    0   18    4    2   13   18    1    7    7]
 [   0 3110    5    2   12    1    6   17   15    6]
 [   4   25 2659   42    6    9    6   27   24   12]
 [   3    7   20 2630    2   49    1    3   60   35]
 [   3    2   15    0 2598   15    6   19   13   81]
 [   6    2    7   52    0 2444   19    3   48   16]
 [  12    8   10   14   20