In [1]:
import pandas as pd
import string

In [2]:
import os
import sys

# Make the DNN-RE `src` directory importable (a later cell imports
# `model.generation.helpers` from it).
# Set the DNN_RE_SRC environment variable to point at your own checkout; the
# original hard-coded location is kept as the fallback so existing setups
# continue to work unchanged.
DNN_RE_SRC_PATH = os.environ.get(
    "DNN_RE_SRC",
    "C:/Users/sumaiyah/OneDrive - University Of Cambridge/Project/DNN-RE/src",
)

# Only append once, so re-running this cell does not pile up duplicate entries.
if DNN_RE_SRC_PATH not in sys.path:
    sys.path.append(DNN_RE_SRC_PATH)

In [3]:
# Dataset identity: the name used for the output folder, and the label column.
dataset_name, target_col_name = 'LetterRecognition', 'letter'

# Relative path of the raw CSV that this notebook preprocesses.
RAW_DATA_PATH = 'raw_data/LetterRecognition.csv'

# Letter Recognition Dataset

[src](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition)

**Multi-class classification:**
- **Input:** Each letter image is described by 16 numerical attributes derived from its original pixel graphic. Each attribute captures a specific characteristic of the image and takes an integer value from 0 to 15.
- **Output:** A-Z, The letter being described

**Binary classification:**
- **Input:** Each letter image is described by 16 numerical attributes derived from its original pixel graphic. Each attribute captures a specific characteristic of the image and takes an integer value from 0 to 15.
- **Output:** 0 for A, 1 for any other letter

## Preprocess Data

In [4]:
# Letter that is encoded as class 0 in the binary task; every other letter becomes class 1.
target_letter = 'A'

In [5]:
# Read the raw CSV into a DataFrame and preview the first rows.
raw_data = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)
raw_data.head(n=5)

Unnamed: 0,letter,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [6]:
# Input features = every column of the raw frame except the target column.
feature_col_names = raw_data.columns.tolist()
# Drop the first occurrence of the target; raises ValueError if it is missing.
del feature_col_names[feature_col_names.index(target_col_name)]

In [7]:
# Encode the target column as a binary label:
#   class 0 = target letter
#   class 1 = every other letter
#
# The label column is rebuilt by explicit assignment instead of calling
# `.replace(..., inplace=True)` on `raw_data[target_col_name]`: an in-place
# method on a selected column is chained assignment, which emits a warning in
# pandas 2.x and does not update the parent frame when Copy-on-Write is enabled.
# `raw_data` itself is left untouched, so this cell can safely be re-run.

# Sanity check: every label must be a single uppercase letter, otherwise the
# binary encoding below would silently lump unexpected values into class 1.
assert raw_data[target_col_name].isin(list(string.ascii_uppercase)).all(), \
    'Target column contains values other than the letters A-Z'

is_target_letter = raw_data[target_col_name] == target_letter
data = raw_data.assign(**{target_col_name: (~is_target_letter).astype('int64')})

In [8]:
# Balance the classes: the target letter (class 0) is much rarer than
# "any other letter" (class 1), so class 1 is randomly down-sampled to the
# size of class 0.
# A fixed seed makes both the down-sample and the final shuffle reproducible,
# so the saved dataset is identical after Restart Kernel -> Run All.
RANDOM_SEED = 42

class_0 = data[data[target_col_name] == 0]
class_1_all = data[data[target_col_name] == 1]

# Draw at most len(class_0) rows; `min` keeps this valid even if class 1
# happens to be the smaller class.
class_1 = (
    class_1_all
    .sample(n=min(len(class_0), len(class_1_all)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Combine both classes into one DataFrame and shuffle the rows.
data = (
    pd.concat((class_0, class_1))
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

data[target_col_name].value_counts()

1    789
0    789
Name: letter, dtype: int64

In [9]:
# Separate the input features from the target column, both as NumPy arrays.
features_frame = data.drop(target_col_name, axis="columns")
X = features_frame.to_numpy()
y = data[target_col_name].to_numpy()

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every input feature to the 0-1 range so that attributes with larger
# raw values do not carry more significance in the network.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [11]:
# Rebuild a labelled DataFrame from the scaled feature matrix, appending the
# target as the final column.
data = pd.DataFrame(data=X, columns=feature_col_names).assign(**{target_col_name: y})
data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,letter
0,0.166667,0.066667,0.333333,0.142857,0.066667,0.5,0.133333,0.133333,0.133333,0.4,0.142857,0.461538,0.142857,0.384615,0.133333,0.5,0
1,0.333333,0.733333,0.583333,0.571429,0.333333,0.857143,0.133333,0.133333,0.133333,0.666667,0.142857,0.538462,0.142857,0.384615,0.2,0.571429,0
2,0.166667,0.4,0.416667,0.285714,0.2,0.428571,0.333333,0.066667,0.2,0.333333,0.142857,0.384615,0.142857,0.384615,0.2,0.285714,0
3,0.333333,0.533333,0.5,0.428571,0.4,0.5,0.466667,0.266667,0.266667,0.4,0.5,0.615385,0.5,0.615385,0.333333,0.5,1
4,0.416667,0.466667,0.583333,0.357143,0.266667,0.642857,0.333333,0.333333,0.4,0.533333,0.428571,0.307692,0.142857,0.461538,0.266667,0.428571,1


In [12]:
# Sanity check on column order before saving: the target must come last.
last_column = data.columns[-1]
assert last_column == target_col_name, 'Target column must be last column in DataFrame'

# Save Clean Data

In [13]:
# Set up the folder that will hold this dataset, using the project helper.
# (In the recorded run it reported creating `../LetterRecognition/` together
# with `neural_network_initialisation` and `cross_validation` sub-directories.)
from model.generation.helpers import init_dataset_dir

path_to_data_folder = '../'
init_dataset_dir.run(
    path_to_data_folder=path_to_data_folder,
    dataset_name=dataset_name,
)

Directory  ../LetterRecognition/  Created 
Directory  ../LetterRecognition/neural_network_initialisation  Created 
Directory  ../LetterRecognition/cross_validation  Created 


In [14]:
# Folder for this dataset's files, one level above the notebook's directory.
data_path = f'../{dataset_name}/'

In [15]:
# Write the preprocessed data to CSV, omitting the DataFrame index.
output_csv_path = data_path + 'data.csv'
data.to_csv(output_csv_path, index=False)