In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
import sys

# Location of the DNN-RE `src` package that later cells import from.
# Set the DNN_RE_SRC environment variable to point at a different checkout;
# when it is unset the original hard-coded location is used.
dnn_re_src = os.environ.get(
    "DNN_RE_SRC",
    "C:/Users/sumaiyah/OneDrive - University Of Cambridge/Project/DNN-RE/src",
)

# Append only once so re-running this cell does not keep growing sys.path
if dnn_re_src not in sys.path:
    sys.path.append(dnn_re_src)

In [3]:
# Notebook-wide configuration
dataset_name = "MNIST"      # used to build the dataset folder path
target_col_name = "digit"   # DataFrame column that holds the label

# MNIST Dataset
MNIST handwritten digit recognition dataset  

**Multi-classification:**
- **Input:** Each digit is represented as a 28x28 image, flattened to 784 pixels 
- **Output:** 0-9, The digit

**Binary classification:**
- **Input:** Each digit is represented as a 28x28 image, flattened to 784 pixels 
- **Output:** 0 if digit=1 else 1

In [4]:
# Get data from tensorflow and preprocess
import tensorflow as tf

# Load data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Only the training split is used to build the dataset; the test split is
# loaded but not used in this cell.
X = np.array(x_train, dtype='float32')
y = y_train

# Normalise the pixel intensities by dividing by the maximum value (255).
# The images are single-channel, so these are intensities rather than RGB codes.
X /= 255

# Flatten each image into one row vector (28 * 28 = 784 pixels for MNIST)
X = X.reshape(X.shape[0], -1)
image_vector_size = X.shape[1]

# Put the processed data into a DataFrame: pixel columns are labelled
# 1..image_vector_size and the label is stored under target_col_name so that
# later cells can locate it by name.
data = pd.DataFrame(data=X, columns=list(range(1, image_vector_size + 1)))
data[target_col_name] = y
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,digit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


## Pre-process Data

In [5]:
# Digit that becomes class 0 in the binary task; every other digit becomes class 1
target_digit = 1

In [8]:
# Input features are every column except the target column
feature_col_names = [*data.columns]
feature_col_names.remove(target_col_name)

In [9]:
# Encode target data as a balanced binary classification task
# class_0 = target digit
# class_1 = all other digits (randomly down-sampled to the size of class_0)

# Fixed seed so the down-sampling and the final shuffle give the same dataset
# on every Restart & Run All
RANDOM_SEED = 42

# NOTE: this cell rebinds `data`. Re-running it without first re-running the
# loading cell would operate on labels that are already 0/1.
is_target = data[target_col_name] == target_digit

# Work on new objects rather than `data` itself (do not do this in place)
class_0 = data[is_target].copy()
other_digits = data[~is_target]
class_1 = (
    other_digits
    .sample(n=min(len(class_0), len(other_digits)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
class_0.loc[:, target_col_name] = 0
class_1.loc[:, target_col_name] = 1

# Shuffle all the data and combine into 1 dataframe
data = (
    pd.concat((class_0, class_1))
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,digit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [11]:
# Separate the target column from the input features
y = data[target_col_name].values
X = data.drop(target_col_name, axis=1).values

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Scale input features, i.e. rescale every attribute to the 0-1 range so that
# larger-valued attributes do not carry more significance in the network
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [13]:
# Rebuild the DataFrame from the scaled features, then add the labels
# as the final column
data = pd.DataFrame(data=X, columns=feature_col_names)
data[target_col_name] = y
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,digit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [14]:
# The target must be the final column of the frame
last_column = data.columns[-1]
assert last_column == target_col_name, 'Target column must be last column in DataFrame'

# Save Clean Data

In [15]:
# Create the folder structure for this dataset (the helper prints each
# directory it creates)
from model.generation.helpers import init_dataset_dir

path_to_data_folder = '../'
init_dataset_dir.run(
    path_to_data_folder=path_to_data_folder,
    dataset_name=dataset_name,
)

Directory  ../MNIST  Created 
Directory  ../MNIST/split_indices  Created 
Directory  ../MNIST/information  Created 
Directory  ../MNIST/models  Created 
Directory  ../MNIST/labels  Created 


In [16]:
# Folder for this dataset, built from the same base folder that was passed to
# init_dataset_dir.run so the two locations cannot drift apart
data_path = path_to_data_folder + dataset_name + '/'

In [17]:
# Write the cleaned dataset to disk without the row index
csv_path = data_path + 'data.csv'
data.to_csv(csv_path, index=False)