The MNIST database (Modified National Institute of Standards and Technology database) is a large database of handwritten digits that is commonly used for training various image processing systems.[1][2] The database is also widely used for training and testing in the field of machine learning.[3][4] It was created by "re-mixing" the samples from NIST's original datasets. The creators felt that since NIST's training dataset was taken from American Census Bureau employees, while the testing dataset was taken from American high school students, it was not well-suited for machine learning experiments.[5] Furthermore, the black and white images from NIST were normalized to fit into a 28x28 pixel bounding box and anti-aliased, which introduced grayscale levels.[5]

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from sklearn.cluster import KMeans


In [2]:
mnist=pd.read_csv('train.csv')
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
mnist.shape

(42000, 785)

In [4]:
mnist.describe().transpose().head(10)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,42000.0,4.456643,2.88773,0.0,2.0,4.0,7.0,9.0
pixel0,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel1,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel2,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel3,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel4,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel5,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel6,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel7,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pixel8,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
mnist[mnist.isnull()==False].sum().sort_values(ascending=False)

pixel407    5872698
pixel435    5841084
pixel408    5771584
pixel434    5691909
pixel211    5690663
pixel210    5610814
pixel602    5585233
pixel212    5581643
pixel436    5543961
pixel601    5494137
pixel380    5487604
pixel406    5465886
pixel409    5403465
pixel381    5355200
pixel463    5352989
pixel575    5322685
pixel603    5306371
pixel209    5302757
pixel629    5293300
pixel462    5292505
pixel213    5291818
pixel574    5273475
pixel183    5245572
pixel379    5229252
pixel237    5203189
pixel238    5175656
pixel433    5169994
pixel628    5151482
pixel600    5148709
pixel240    5133202
             ...   
pixel420          0
pixel392          0
pixel111          0
pixel112          0
pixel782          0
pixel196          0
pixel168          0
pixel141          0
pixel645          0
pixel671          0
pixel672          0
pixel673          0
pixel756          0
pixel755          0
pixel754          0
pixel731          0
pixel730          0
pixel52           0
pixel53           0


In [6]:
mnist.nunique()

label        10
pixel0        1
pixel1        1
pixel2        1
pixel3        1
pixel4        1
pixel5        1
pixel6        1
pixel7        1
pixel8        1
pixel9        1
pixel10       1
pixel11       1
pixel12       3
pixel13       3
pixel14       2
pixel15       2
pixel16       1
pixel17       1
pixel18       1
pixel19       1
pixel20       1
pixel21       1
pixel22       1
pixel23       1
pixel24       1
pixel25       1
pixel26       1
pixel27       1
pixel28       1
           ... 
pixel754      1
pixel755      1
pixel756      1
pixel757      1
pixel758      1
pixel759      1
pixel760      1
pixel761      3
pixel762     15
pixel763     24
pixel764     38
pixel765     42
pixel766     75
pixel767     91
pixel768     99
pixel769    114
pixel770    126
pixel771    121
pixel772    106
pixel773     81
pixel774     57
pixel775     36
pixel776     19
pixel777     10
pixel778      6
pixel779      3
pixel780      1
pixel781      1
pixel782      1
pixel783      1
Length: 785, dtype: int6

In [None]:
#Elbow method to find the optimum number of clusters
k=range(1,10)
inertias=[]
for i in k:
    mod=KMeans(n_clusters=i,random_state=42)
    mod.fit(mnist)
    inertias.append(mod.inertia_)
plt.plot(k,inertias,'-s')
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.grid()
plt.show()  

In [7]:
k_mod=KMeans(n_clusters=8,random_state=122)

In [12]:
labels=k_mod.fit_predict(mnist)
labels

array([7, 2, 7, ..., 3, 5, 1])

In [9]:
from sklearn.metrics import accuracy_score

In [11]:
accuracy_score(mnist.label,labels)*100

1.4785714285714286

# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [14]:
x=mnist.drop('label',axis=1)
y=mnist.label

In [17]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=123)

In [18]:
lg_mod=LogisticRegression()
lg_mod.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
pred=lg_mod.predict(xtest)

In [28]:
lr_r=accuracy_score(ytest,pred)*100
lr_r

90.03174603174602

In [23]:
pd.crosstab(ytest,pred)

col_0,0,1,2,3,4,5,6,7,8,9
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1168,0,10,2,4,14,13,1,15,2
1,0,1347,17,9,3,7,2,2,22,3
2,13,34,1058,33,16,11,19,20,36,5
3,5,9,42,1109,4,40,3,13,27,16
4,5,10,9,4,1121,3,4,5,12,38
5,17,13,12,61,6,1006,20,4,38,12
6,8,9,15,1,5,27,1197,1,7,1
7,7,19,13,5,11,8,0,1180,8,47
8,9,33,11,40,7,27,15,7,1067,21
9,8,4,3,23,40,14,1,41,15,1091


# DEcision tree

In [31]:
# Decision tree
from     sklearn.tree      import    DecisionTreeClassifier
# Model Comparison's for 4 models
DT_model=DecisionTreeClassifier(criterion='entropy',random_state=123)
DT_model.fit(xtrain,ytrain)




DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [32]:
pred1=DT_model.predict(xtest)

In [33]:
dt_r=accuracy_score(ytest,pred1)*100
dt_r

86.07142857142858

# Random forest
from     sklearn.ensemble import RandomForestClassifier

In [34]:
from     sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [35]:
Rf_model=RandomForestClassifier(random_state=123)
Rf_model.fit(xtrain,ytrain)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [36]:
pred2=Rf_model.predict(xtest)

In [39]:
Rf_r=accuracy_score(ytest,pred2)*100
Rf_r

94.01587301587301