**Problem Statement:** Implement a basic character recognition system for handwritten digits using logistic regression. The data set provided contains instances of each of the 10 digits (0 through 9). Your goal is to train and test 10 different logistic regression models, one per digit.

Data Source: https://drive.google.com/file/d/1uMCvIV-KPzFQBeOCVvFWqvISUUrySLzH/view

Each image is encoded as a row of 784 integer values between 0 and 255 indicating the brightness of each pixel. The label associated with each image is encoded as an integer value between 0 and 9. The file contains 785 columns: the first column corresponds to the digit labels (0-9) and the remaining 784 columns correspond to the pixel values of the 28x28=784 pixels of the image.



In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from numpy import exp

In [None]:
# Load the training and test splits from CSV files in the working directory.
# NOTE(review): later cells address the label column by the names '5' and '7',
# which suggests these files have no header row and the first record of each
# file is being consumed as column names -- confirm. Passing header=None would
# keep that record but renames every column, so downstream cells would need
# to change with it.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("mnist_train.csv", "mnist_test.csv")
)

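As per Prof. William M.K. Trochim, dummy variables can be used to represent the subgroups of a dataset. A dummy variable makes it possible to use a single regression equation to represent multiple groups. Here, the digit label is expanded into one binary (dummy) column per digit, so that each column can serve as the target of its own logistic regression model.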

Source: www.shorturl.at/pGMPW

In [None]:
# One-hot encode the digit labels: one indicator column per distinct label value.
# The label column is selected by position (it is the first column of each
# file) instead of by the names '5' / '7'. Those names appear to be the label of
# each file's first record picked up as a header, so they would change with the
# data or with how the CSVs are loaded.
train_dummy = pd.get_dummies(train.iloc[:, 0])
test_dummy = pd.get_dummies(test.iloc[:, 0])

In [None]:
# Put the label indicator columns in front of the pixel columns, aligning rows
# on the index. The raw label column (position 0) is left out of the right side.
train_pixels = train.iloc[:, 1:]
test_pixels = test.iloc[:, 1:]
data_train = train_dummy.merge(
    train_pixels, how='left', left_index=True, right_index=True
)
data_test = test_dummy.merge(
    test_pixels, how='left', left_index=True, right_index=True
)

In [None]:
import warnings

# Silence all warnings for the rest of the session.
# NOTE(review): this also hides any solver warnings raised while fitting below.
warnings.filterwarnings('ignore')

# One-vs-rest digit recognition: fit one binary logistic regression per digit
# and collect each model's probability that a test image shows that digit.
# data_train / data_test are expected to start with one indicator column per
# digit (positions 0-9), followed by the pixel columns.
N_DIGITS = 10

# The feature matrices are the same for every digit, so slice them once.
# Pixel columns start right after the N_DIGITS indicator columns; the earlier
# slice started at 11 and silently dropped the first pixel column.
x_train = data_train.iloc[:, N_DIGITS:]
x_test = data_test.iloc[:, N_DIGITS:]

final_predictions = []
for i in range(N_DIGITS):
    # Binary targets: indicator column i marks the images labelled as digit i.
    y_train = data_train.iloc[:, i]
    # Not used for fitting; kept so a per-digit score can be checked ad hoc
    # with model.score(x_test, y_test).
    y_test = data_test.iloc[:, i]

    model = LogisticRegression(penalty='l2', max_iter=200, multi_class="ovr", random_state=0)
    model.fit(x_train, y_train)

    # predict_proba returns one column per class; column 1 is the probability
    # of the positive class, i.e. "this image is digit i".
    predictions = model.predict_proba(x_test)
    final_predictions.append(list(predictions[:, 1]))

In [None]:
# Display floats with two decimals, then arrange the collected probabilities as
# a table with one row per test image and one column per digit model.
pd.set_option('display.float_format', '{:.2f}'.format)
x_test_probabilities = pd.DataFrame(final_predictions).transpose()
x_test_probabilities

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.00,0.00,1.00,0.00,0.00,0.08,0.03,0.00,0.00,0.00
1,0.00,0.99,0.04,0.01,0.00,0.01,0.03,0.00,0.02,0.00
2,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.01,0.00,0.98,0.00,0.00,0.01,0.06,0.10
4,0.00,1.00,0.02,0.02,0.00,0.00,0.00,0.02,0.03,0.02
...,...,...,...,...,...,...,...,...,...,...
9994,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.04,0.00
9995,0.00,0.00,0.00,0.99,0.00,0.00,0.00,0.00,0.00,0.00
9996,0.00,0.00,0.00,0.00,0.89,0.00,0.00,0.00,0.01,0.03
9997,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.07,0.00


In [None]:
# Calculating the softmax of a vector
def softmax(vector):
    """Return the softmax of *vector* as a NumPy float array.

    The exponentials are normalised by their total over all elements, so the
    result sums to 1. An empty input yields an empty array.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing to ``inf``
    (and the division from producing NaN) when the inputs are large.
    """
    scores = np.asarray(vector, dtype=float)
    if scores.size == 0:
        return scores
    value = exp(scores - scores.max())
    return value / value.sum()
 
# Normalise each test image's ten per-digit scores with softmax so that every
# row sums to 1, then collect the rows into a DataFrame.
sm_results = [softmax(row_scores) for row_scores in x_test_probabilities.values]

results = pd.DataFrame(sm_results)

In [None]:
def max_probability(x):
    """Return the position of the largest entry in ``x.values``.

    When several entries tie for the maximum, the first position wins.
    """
    scores = list(x.values)
    return max(range(len(scores)), key=scores.__getitem__)

In [None]:
# Showcasing the final predictions: for every test image, pick the digit whose
# model gave the highest score.
# A 'final_prediction' column left over from an earlier run of this cell is
# excluded first. Otherwise, on re-run, the old predictions (integers 0-9,
# at least as large as any probability) would take part in the arg-max.
digit_scores = results.drop(columns='final_prediction', errors='ignore')
results['final_prediction'] = digit_scores.apply(max_probability, axis=1)
results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,final_prediction
0,0.08,0.08,0.23,0.08,0.08,0.09,0.09,0.08,0.08,0.08,2
1,0.08,0.23,0.09,0.09,0.08,0.09,0.09,0.08,0.09,0.08,1
2,0.23,0.09,0.09,0.09,0.09,0.09,0.09,0.09,0.09,0.09,0
3,0.08,0.08,0.09,0.08,0.22,0.08,0.08,0.08,0.09,0.09,4
4,0.08,0.23,0.09,0.09,0.08,0.08,0.08,0.09,0.09,0.09,1
...,...,...,...,...,...,...,...,...,...,...,...
9994,0.09,0.09,0.23,0.09,0.09,0.09,0.09,0.09,0.09,0.09,2
9995,0.09,0.09,0.09,0.23,0.09,0.09,0.09,0.09,0.09,0.09,3
9996,0.09,0.09,0.09,0.09,0.21,0.09,0.09,0.09,0.09,0.09,4
9997,0.08,0.08,0.08,0.08,0.08,0.23,0.08,0.08,0.09,0.08,5


In [None]:
# Calculating the accuracy score: the fraction of test images whose predicted
# digit equals the true label.
# Arguments follow scikit-learn's accuracy_score(y_true, y_pred) order; the
# true labels are the first column of `test`.
accuracy_score(test.iloc[:, 0], results['final_prediction'])

0.9181918191819182

In [None]:
# Creating the confusion matrix to analyze the results.
# Arguments follow scikit-learn's confusion_matrix(y_true, y_pred) order, so
# row i counts the images whose true digit is i and column j counts the images
# predicted as digit j.
# NOTE: output saved from an earlier run with the arguments swapped is the
# transpose of what this call returns.
confusion_matrix(test.iloc[:, 0], results['final_prediction'])

array([[ 960,    0,    6,    3,    1,   10,    8,    3,    8,   10],
       [   0, 1118,    9,    1,    2,    2,    3,    6,   12,    6],
       [   1,    3,  907,   17,    3,    0,    9,   22,    6,    1],
       [   2,    1,   19,  919,    4,   40,    1,    5,   23,   16],
       [   0,    0,    8,    2,  906,   11,    4,    7,   11,   31],
       [   2,    1,    5,   21,    0,  756,   18,    2,   25,    3],
       [   7,    4,   10,    5,   12,   17,  909,    1,    7,    0],
       [   2,    1,   12,   11,    3,    9,    0,  945,   11,   26],
       [   4,    6,   53,   23,   11,   37,    6,    3,  861,   16],
       [   2,    1,    3,    8,   40,   10,    0,   33,   10,  900]])