In [1]:
import pandas as pd

In [2]:
import os
import sys

# Location of the DNN-RE `src` directory that holds the project packages.
# Set the DNN_RE_SRC environment variable to point at a local checkout; the
# original machine-specific path is kept as the fallback so existing setups
# keep working unchanged.
DNN_RE_SRC = os.environ.get("DNN_RE_SRC") or (
    "C:/Users/sumaiyah/OneDrive - University Of Cambridge/Project/DNN-RE/src"
)

# Avoid stacking duplicate entries on sys.path when this cell is re-run.
if DNN_RE_SRC not in sys.path:
    sys.path.append(DNN_RE_SRC)

In [3]:
# Notebook configuration: where the raw CSV lives, which column holds the
# class label, and the name used for the generated dataset folder.
RAW_DATA_PATH = "raw_data/wdbc.csv"

target_col_name = "diagnosis"
dataset_name = "BreastCancer"

# Breast Cancer Wisconsin (Diagnostic) Data Set
[src](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)  
- **Input:** 30 real-valued input features describing digitized images of breast mass cell nuclei
- **Output:** Classification, *M*: malignant, *B*: benign. 212 *M* and 357 *B*

## Preprocess Data

In [4]:
# Read the raw CSV into a DataFrame and preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)
raw_data.head(5)

Unnamed: 0,diagnosis,radius_1,texture_1,perimeter_1,area_1,smoothness_1,compactness_1,concavity_1,concavepoints_1,symmerty_1,...,radius_3,texture_3,perimeter_3,area_3,smoothness_3,compactness_3,concavity_3,concavepoints_3,symmerty_3,fractaldimension_3
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Input feature names: every column of the raw data except the target column.
feature_col_names = raw_data.columns.tolist()
# list.index raises ValueError when the target column is missing, like list.remove.
del feature_col_names[feature_col_names.index(target_col_name)]

In [6]:
# Encode the target labels as integers:
#   class_0 = malignant = 'M'
#   class_1 = benign    = 'B'
# The encoded column is assigned back to the DataFrame rather than calling
# `raw_data[col].replace(..., inplace=True)`: that chained form mutates a
# temporary Series, so with pandas Copy-on-Write `raw_data` is left unchanged.
# Re-running this cell is safe because already-encoded 0/1 values pass through.
raw_data[target_col_name] = (
    raw_data[target_col_name]
    .replace({'M': 0, 'B': 1})
    # Explicit integer dtype; raises if any label was left unmapped or is missing.
    .astype('int64')
)

In [7]:
# Split the raw data into a feature matrix (all non-target columns) and a target vector.
y = raw_data[target_col_name].to_numpy()
X = raw_data.drop(target_col_name, axis=1).to_numpy()

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every input feature to the 0-1 range so that attributes with larger
# raw magnitudes do not carry more significance in the network.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [9]:
# Rebuild a DataFrame from the scaled features and add the encoded target
# as the final column, then preview the first five rows.
data = pd.DataFrame(data=X, columns=feature_col_names).assign(**{target_col_name: y})
data.head(5)

Unnamed: 0,radius_1,texture_1,perimeter_1,area_1,smoothness_1,compactness_1,concavity_1,concavepoints_1,symmerty_1,fractaldimension_1,...,texture_3,perimeter_3,area_3,smoothness_3,compactness_3,concavity_3,concavepoints_3,symmerty_3,fractaldimension_3,diagnosis
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0


In [10]:
# Sanity check on column order before the data is written out.
assert data.columns.tolist()[-1] == target_col_name, (
    'Target column must be last column in DataFrame'
)

# Save Clean Data

In [11]:
# Create the folder structure for this dataset via the project helper.
from model.generation.helpers import init_dataset_dir

# Parent directory under which the dataset folder is created.
path_to_data_folder = '../'
init_dataset_dir.run(
    dataset_name=dataset_name,
    path_to_data_folder=path_to_data_folder,
)

Directory  ../BreastCancer/  Created 
Directory  ../BreastCancer/neural_network_initialisation  Created 
Directory  ../BreastCancer/cross_validation  Created 


In [12]:
# Folder for this dataset's files. Built from `path_to_data_folder` (the same
# root passed to init_dataset_dir.run) rather than a repeated '../' literal,
# so the save location cannot drift from the directory that was created.
data_path = path_to_data_folder + dataset_name + '/'

In [13]:
# Write the preprocessed data to data.csv in the dataset folder, without the row index.
data.to_csv(path_or_buf=data_path + 'data.csv', index=False)