# Challenge 2
This notebok is for challenge 2: MNIST database with Logistic regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import time 
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Importing dataset

In [2]:
%matplotlib inline
from sklearn.datasets import fetch_mldata

# Change data_home to wherever to where you want to download your data
#data_set = fetch_mldata('MNIST original')
df_orig = pd.read_csv('mnist_train.csv')

### Checking dataset properties

#Print to show there are 1797 images (8 by 8 images for a dimensionality of 64)
print("Image Data Shape" , data_set.data.shape)

#Print to show there are 1797 labels (integers from 0-9)
print("Label Data Shape", data_set.target.shape)

In [3]:
print(df_orig.shape)
df_orig.head(5)

(60000, 786)


Unnamed: 0,Id,Category,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#df= df_orig
df = df_orig.loc[:, (df_orig != 0).any(axis=0)]
print(df.shape)

(60000, 719)


#### Print training and testing data shape

In [5]:
#Split train-test into 60k-10k
from sklearn.model_selection import train_test_split


#select columns for data/target
#data = df.loc[:,df.columns!='Category']
#target = df.loc[:,'Category']

data = df.drop(['Id', 'Category'], axis=1)
target = df['Category']

X_train = data
y_train = target

#X=image, y=label
#X_train, X_test, y_train, y_test = train_test_split(
#    data, target, train_size=1.0, random_state=0)


In [6]:
print(X_train.shape)
print(y_train.shape)

(60000, 717)
(60000,)


print(X_test.shape)
#print(y_test.shape)

In [7]:
pipe = Pipeline([
    ('pca', PCA()),
    ('clf', LogisticRegression())
])

In [17]:
param_grid = [
    {
        'pca__n_components': [5,10,20],
        'clf__solver': ['lbfgs','liblinear', 'saga'],
        'clf__C': [0.1, 1],
        'clf__tol' : [0.0001, 0.01], 
        'clf__multi_class' : ['ovr']
    }]

In [18]:
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=5, scoring='accuracy')

In [19]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5, score=0.6390221955608878, total=   3.5s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s


[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5, score=0.6432321616080804, total=   3.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.0s remaining:    0.0s


[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=5, score=0.6480472070810621, total=   2.8s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.1s remaining:    0.0s


[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10, score=0.7743451309738052, total=   3.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.0s remaining:    0.0s


[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10, score=0.7732886644332216, total=   3.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=10, score=0.7829174376156424, total=   4.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=20 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=20, score=0.8586782643471306, total=   4.1s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=20 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=20, score=0.8553927696384819, total=   3.8s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, clf__tol=0.0001, pca__n_components=20 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=lbfgs, c



[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5496400719856028, total=  23.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5536276813840691, total=  20.2s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5518827824173625, total=  22.3s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.6839132173565287, total=  25.8s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.6882844142207111, total=  26.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.6936040406060909, total=  26.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.775994801039792, total=  32.2s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.7773888694434722, total=  32.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.7939190878631794, total=  32.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.5423915216956608, total=   3.6s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.5463773188659433, total=   4.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.5469820473070961, total=   4.0s
[CV] clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=10 
[CV]  clf__C=0.1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_componen

[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.0001, pca__n_components=20, score=0.8632794919237886, total=  46.0s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5, score=0.6075284943011398, total=   5.7s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5, score=0.6225311265563278, total=   5.7s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=5, score=0.6169925488823323, total=   5.6s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf__tol=0.01, pca__n_components=10 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=liblinear, clf



[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5497400519896021, total=  20.3s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5544277213860693, total=  20.3s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=5, score=0.5522828424263639, total=  20.5s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.6837132573485303, total=  25.0s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.687884394219711, total=  26.9s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=10, score=0.6937540631094664, total=  26.2s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.7758948210357929, total=  36.8s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.7768388419420971, total=  32.2s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20 




[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.0001, pca__n_components=20, score=0.7942191328699305, total=  34.4s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.5414917016596681, total=   5.1s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.5480774038701935, total=   4.3s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=5, score=0.545781867280092, total=   4.2s
[CV] clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=10 
[CV]  clf__C=1, clf__multi_class=ovr, clf__solver=saga, clf__tol=0.01, pca__n_components=10, score=0.6773

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 25.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'pca__n_components': [5, 10, 20], 'clf__solver': ['lbfgs', 'liblinear', 'saga'], 'clf__C': [0.1, 1], 'clf__tol': [0.0001, 0.01], 'clf__multi_class': ['ovr']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=5)

In [20]:
grid.best_params_

{'clf__C': 0.1,
 'clf__multi_class': 'ovr',
 'clf__solver': 'lbfgs',
 'clf__tol': 0.01,
 'pca__n_components': 20}

In [21]:
results = pd.DataFrame(grid.cv_results_)
results.sort_values(by='rank_test_score', inplace=True)
results.head(5)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__multi_class,param_clf__solver,param_clf__tol,param_pca__n_components,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
5,3.825523,0.235763,0.122972,0.005828,0.1,ovr,lbfgs,0.01,20,"{'clf__C': 0.1, 'clf__multi_class': 'ovr', 'cl...",...,0.855643,0.86403,0.8595,0.003457,1,0.861586,0.861553,0.858836,0.860658,0.001289
20,4.597471,0.493966,0.137408,0.015498,1.0,ovr,lbfgs,0.0001,20,"{'clf__C': 1, 'clf__multi_class': 'ovr', 'clf_...",...,0.855493,0.86368,0.8594,0.003352,2,0.861686,0.861478,0.858561,0.860575,0.001427
23,3.613656,0.032447,0.120192,0.00102,1.0,ovr,lbfgs,0.01,20,"{'clf__C': 1, 'clf__multi_class': 'ovr', 'clf_...",...,0.855343,0.86383,0.859267,0.003494,3,0.861711,0.861403,0.858536,0.86055,0.00143
2,3.754451,0.152698,0.117935,0.000741,0.1,ovr,lbfgs,0.0001,20,"{'clf__C': 0.1, 'clf__multi_class': 'ovr', 'cl...",...,0.855393,0.86343,0.859167,0.003299,4,0.861786,0.861578,0.858786,0.860717,0.001368
26,43.538128,1.676947,0.121336,0.0043,1.0,ovr,liblinear,0.0001,20,"{'clf__C': 1, 'clf__multi_class': 'ovr', 'clf_...",...,0.854743,0.863279,0.858817,0.003496,5,0.861236,0.860903,0.857736,0.859958,0.001578


### Using best params to get final results

In [37]:
df_test = pd.read_csv('mnist_test.csv')
df_train = df_orig
print(df_train.shape)
print(df_test.shape)

(60000, 786)
(10000, 785)


In [38]:
drop_col = df_train.columns[(df_train == 0).all()]
print(drop_col)

Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel16',
       'pixel17', 'pixel18', 'pixel19', 'pixel20', 'pixel21', 'pixel22',
       'pixel23', 'pixel24', 'pixel25', 'pixel26', 'pixel27', 'pixel28',
       'pixel29', 'pixel30', 'pixel31', 'pixel52', 'pixel53', 'pixel54',
       'pixel55', 'pixel56', 'pixel57', 'pixel82', 'pixel83', 'pixel84',
       'pixel85', 'pixel111', 'pixel112', 'pixel140', 'pixel141', 'pixel168',
       'pixel476', 'pixel560', 'pixel644', 'pixel645', 'pixel671', 'pixel672',
       'pixel673', 'pixel699', 'pixel700', 'pixel701', 'pixel727', 'pixel728',
       'pixel729', 'pixel730', 'pixel754', 'pixel755', 'pixel756', 'pixel757',
       'pixel758', 'pixel759', 'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object')


In [39]:
df_train = df_train.drop(drop_col, axis=1)
print(df.shape)

(60000, 719)


In [40]:
df_test=df_test.drop(drop_col, axis=1)
print(df_test.shape)

(10000, 718)


In [42]:
x_tr = df_train.drop(['Id', 'Category'], axis=1)
y_tr = df_train['Category']

In [43]:
pca = PCA(n_components=20)
pca.fit(x_tr)
pca.transform(x_tr)

array([[ 123.93258865, -312.67426199,  -24.514052  , ...,  308.84652501,
         277.48374522,  165.17193004],
       [1011.71837587, -294.857038  ,  596.33956135, ...,   40.2223858 ,
          51.88051463, -101.00211549],
       [ -51.84960807,  392.17315278, -188.5097499 , ..., -121.28226922,
         -10.23393119, -435.10942482],
       ...,
       [-178.05344964,  160.07821197, -257.61308111, ...,  -38.1161328 ,
         103.4258833 ,  236.79376043],
       [ 130.60607206,   -5.59193595,  513.85867394, ..., -143.58985721,
         -76.78571976,  194.29466655],
       [-173.43595243,  -24.71880224,  556.01889478, ...,  270.34671324,
          96.92665424, -209.48191966]])

In [44]:
clf = LogisticRegression(solver='lbfgs', max_iter=200, multi_class='ovr', tol=0.01, C=0.1)

In [45]:
clf.fit(x_tr, y_tr)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.01,
          verbose=0, warm_start=False)

In [46]:
x_test = df_test.drop(['Id'], axis=1)
pca.transform(x_test)
y_pred = clf.predict(x_test)

In [47]:
df_test['Category']=y_pred

In [48]:
df_test.head(5)

Unnamed: 0,Id,pixel12,pixel13,pixel14,pixel15,pixel32,pixel33,pixel34,pixel35,pixel36,...,pixel771,pixel772,pixel773,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,Category
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [49]:
df_final = df_test[['Id','Category']]
df_final.to_csv('2challenge_logreg.csv', index=False)

In [50]:
df_final.head(5)

Unnamed: 0,Id,Category
0,1,7
1,2,2
2,3,1
3,4,0
4,5,4
