In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.simplefilter(action='ignore')

In [3]:
# Load the MNIST CSV from the notebook's working directory into a DataFrame.
data = pd.read_csv("mnist_test.csv")

In [4]:
# Display the loaded frame; the rendering below is truncated to the first and last rows.
data

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Hold out 20% of the rows for evaluation. Features are every column except
# "label"; the fixed random_state keeps the partition reproducible across runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data.drop(columns="label"),
    data["label"],
    test_size=0.2,
    random_state=42,
)

train_test_split is a function from the sklearn.model_selection module that allows you to split your dataset into training and testing subsets.

Here's what each of the arguments does:

data.drop("label", axis=1): this selects all columns from the data DataFrame except for the "label" column, which contains the target variable we want to predict.

data["label"]: this selects the "label" column, which contains the target variable we want to predict.

test_size=0.2: this specifies that we want to split our data into a training set (80% of the data) and a test set (20% of the data).

random_state=42: this sets the random seed for the split, which ensures that the split is reproducible (i.e., running the code multiple times will always result in the same split).

The function returns four variables:

train_data: a DataFrame containing the training data (i.e., the input features for the training set)

test_data: a DataFrame containing the test data (i.e., the input features for the test set)

train_labels: a Series containing the target labels for the training set

test_labels: a Series containing the target labels for the test set

In summary, the cell above splits the loaded MNIST data into training and testing subsets. The input features (i.e., the pixel values for each image) are stored in train_data and test_data, and the target labels (i.e., the digit that each image represents) are stored in train_labels and test_labels.

In [13]:
# Materialise each split as a plain NumPy array (features first, then labels).
X_train = np.array(train_data)
X_test = np.array(test_data)
Y_train = np.array(train_labels)
Y_test = np.array(test_labels)

In [14]:
# Sanity-check the split: feature matrices are 2-D, label arrays are 1-D at this point.
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(8000, 784) (8000,) (2000, 784) (2000,)


In [15]:
def sigmoid(X):
    """Element-wise logistic function 1 / (1 + exp(-X)).

    Evaluated in a numerically stable way: only exp(-|X|) is ever computed,
    so large-magnitude inputs cannot overflow. The naive form overflows in
    exp(-X) for large negative X.

    Parameters
    ----------
    X : scalar or array-like of numbers

    Returns
    -------
    A NumPy scalar for scalar input, otherwise an ndarray with the same
    shape as X and values in [0, 1].
    """
    X = np.asarray(X)
    # decay lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(X))
    # For X >= 0: 1 / (1 + e^-X).  For X < 0: e^X / (1 + e^X) (same value).
    out = np.where(X >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Hand scalars back as scalars, like the plain arithmetic expression would.
    return out[()] if out.ndim == 0 else out

In [16]:
def hypothesis(X, theta):
    """Model output: the sigmoid of the linear score X · theta.

    X is the design matrix and theta the parameter vector; the result has
    the shape of np.dot(X, theta).
    """
    linear_score = np.dot(X, theta)
    return sigmoid(linear_score)

In [17]:
def cost_function(X, Y, theta):
    """Mean binary cross-entropy of the model on (X, Y).

    Computes -mean(Y*log(h) + (1 - Y)*log(1 - h)) with h = hypothesis(X, theta).
    The formula is the binary cross-entropy, so Y is expected to hold 0/1
    targets shaped like h.

    Predictions are clipped away from exactly 0 and 1 before taking logs, so
    a saturated sigmoid yields a large finite cost instead of inf/nan.

    Returns
    -------
    A scalar cost value.
    """
    h_theta = np.asarray(hypothesis(X, theta))
    # Smallest step representable next to 1.0 for this dtype; keeps both logs finite.
    eps = np.finfo(h_theta.dtype).eps
    h_theta = np.clip(h_theta, eps, 1.0 - eps)

    log_likelihood = Y*np.log(h_theta) + (1 - Y)*np.log(1 - h_theta)
    return -np.mean(log_likelihood)

In [18]:
def gradient(X, Y, theta):
    """Average update direction X^T (Y - h) / m for the current theta.

    Note the sign: the residual is (Y - h), not (h - Y), so this is the
    direction that *increases* the log-likelihood (the negative of the
    cross-entropy gradient). gradient_descent therefore adds it to theta.
    """
    residual = Y - hypothesis(X, theta)
    n_samples = X.shape[0]
    return np.dot(X.T, residual) / n_samples

In [19]:

def gradient_descent(X, Y, learning_rate = 0.001, max_steps = 500):
    """Fit theta by full-batch gradient steps, starting from zeros.

    Parameters
    ----------
    X : 2-D array, one row per sample (include a bias column if wanted).
    Y : targets. A 1-D array is promoted to a column vector so that
        `Y - hypothesis(X, theta)` stays (m, 1) instead of silently
        broadcasting to (m, m). 2-D input is used as given.
    learning_rate : step size applied to each update.
    max_steps : number of updates to perform.

    Returns
    -------
    (theta, cost_epoch) : the final parameters and a list with the cost
        measured *before* each update (length == max_steps).
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    _, n_features = X.shape
    theta = np.zeros((n_features, 1))
    cost_epoch = []

    for _ in range(max_steps):
        # Record the cost for the parameters this step starts from.
        cost_epoch.append(cost_function(X, Y, theta))
        # gradient() already returns the ascent direction (Y - h), hence "+".
        theta = theta + learning_rate*gradient(X, Y, theta)

    return (theta, cost_epoch)

In [20]:
# Prepend a column of ones (the intercept term) and make the labels a column vector.
# Both arrays are rebuilt from the split (train_data / train_labels) rather than
# from the current X_train / Y_train, so re-running this cell never stacks a
# second bias column.
ones = np.ones((len(train_data), 1))
X_train = np.hstack((ones, np.array(train_data)))
print(X_train[:4, :4], X_train.shape)
Y_train = np.array(train_labels).reshape((-1, 1))



[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] (8000, 785)


In [None]:
# Fit the model: 1000 full-batch updates with step size 0.001.
# NOTE(review): the `label` column shown earlier holds digits (7, 2, 1, 0, 4, ...),
# while the sigmoid hypothesis and cross-entropy cost used here model a single
# 0/1 target. Y_train probably needs to be binarised first (e.g. one digit
# vs. the rest) — confirm the intended task before trusting the results.
theta, cost_epoch = gradient_descent(X_train, Y_train, learning_rate=0.001, max_steps = 1000)

In [None]:
# Show the last ten recorded cost values.
cost_epoch[-10:]

In [None]:
# Training curve: one cost value per gradient step. Labelled so the figure
# stands on its own; the trailing ';' suppresses the object repr in the output.
fig, ax = plt.subplots()
ax.plot(cost_epoch)
ax.set(xlabel="Gradient step", ylabel="Cost", title="Training cost per step");

In [None]:
# Prepend the intercept column to the test features and build the test labels
# as a column vector.
# - Rebuilt from test_data / test_labels so re-running the cell cannot add a
#   second bias column.
# - The labels are stored under `y_test` because the cells below read that
#   name; the old `np.array(y_test)` referenced it before it was ever defined.
ones = np.ones((len(test_data), 1))
X_test = np.hstack((ones, np.array(test_data)))
print(X_test[:4, :4], X_test.shape)
y_test = np.array(test_labels).reshape((-1, 1))

In [None]:
def predict(X, theta):
    """Hard 0/1 predictions: 1 where hypothesis(X, theta) > 0.5, else 0.

    Returns an integer array with the same shape as the hypothesis output.
    """
    is_positive = hypothesis(X, theta) > 0.5
    return is_positive.astype('int')

In [None]:
# Hard 0/1 predictions for both splits using the fitted theta.
train_preds = predict(X_train, theta)
test_preds = predict(X_test, theta)

In [None]:
# Print each test prediction next to its true label, one pair per line.
for row, predicted in enumerate(test_preds):
    print(predicted, y_test[row])

In [None]:
def accuracy(preds, labels):
    """Print the fraction of predictions that equal their labels.

    Both inputs are flattened before comparison, so a column vector of
    predictions (m, 1) can be scored against 1-D labels (m,). Comparing those
    shapes directly would broadcast to an (m, m) matrix and report a
    meaningless ratio.

    Parameters
    ----------
    preds : array-like of predicted classes.
    labels : array-like of true classes; cast to int before comparing.

    Raises
    ------
    ValueError
        If preds and labels do not contain the same number of elements.
    """
    preds = np.asarray(preds).ravel()
    labels = np.asarray(labels).astype('int').ravel()
    if preds.shape != labels.shape:
        raise ValueError(
            f"preds has {preds.size} elements but labels has {labels.size}"
        )
    print(np.sum(preds == labels)/labels.shape[0])

In [None]:
# Report accuracy on the held-out test split.
accuracy(test_preds, y_test)

In [None]:
# Report accuracy on the training split. The training labels live in `Y_train`
# (reshaped to a column alongside X_train); `y_train` is never defined in this
# notebook and raised NameError.
accuracy(train_preds, Y_train)