In [1]:
import pandas as pd

In [2]:
import sys

# Make the DNN-RE sources importable (presumably where the `model` package
# imported further down lives — TODO confirm).
# NOTE(review): this is a relative path into a local Desktop checkout, so it
# only resolves when the notebook is run from its own directory on that machine.
DNN_RE_SRC_PATH = "../../../Desktop/DNN-RE/src"

# Guard against duplicates so re-running the cell does not keep growing sys.path.
if DNN_RE_SRC_PATH not in sys.path:
    sys.path.append(DNN_RE_SRC_PATH)

In [3]:
# Dataset identifier; also used to name the output folder for the cleaned data.
dataset_name = 'MB-GE-ER'
# Name of the label column in the raw CSV.
target_col_name = 'ER_Expr'

# Location of the raw CSV. Derived from dataset_name so the file name and the
# dataset identifier cannot drift apart.
RAW_DATA_PATH = 'raw_data/' + dataset_name + '.csv'

# METABRIC Data Set
- **Input:** 1000 gene-expression features, min-max scaled by this notebook to values between 0 and 1
- **Output:** Classification, *+*: ER+, *-*: ER-. 1506 *+* and 474 *-*

## Preprocess Data

In [4]:
# Load the raw table from disk and preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)
raw_data.head(n=5)

Unnamed: 0,GE_GRB7,GE_LSM1,GE_CLNS1A,GE_STARD3,GE_PPFIA1,GE_ERBB2,GE_INTS4,GE_ORMDL3,GE_PSMB3,GE_MTERFD1,...,GE_ZNHIT3,GE_FOXM1,GE_EXOC3,GE_ARMC7,GE_KCTD2,GE_PDCD10,GE_RHOD,GE_YBX1,GE_TRIM24,ER_Expr
0,6.487146,11.031136,10.079148,6.031175,9.487621,9.729606,8.253549,6.511952,10.423016,8.419615,...,7.922863,5.677913,7.426484,6.194931,5.962671,9.093493,9.407661,9.52659,7.306828,+
1,6.479166,10.87385,10.322188,5.84657,9.05311,9.725825,7.873861,6.627166,11.203083,8.515613,...,8.724618,6.054136,7.013241,6.641685,5.784853,9.33002,8.623256,9.545137,8.62484,+
2,7.874277,11.381911,9.550389,6.212065,8.822194,10.334979,7.312676,7.164516,11.910947,9.153356,...,9.420298,5.774602,7.374062,6.782999,6.65594,9.856422,9.663621,9.942894,8.278145,+
3,6.351296,10.433402,9.73649,5.885852,8.981401,9.739996,7.72891,6.66315,11.050693,8.540313,...,7.37899,6.735173,7.453835,7.132947,6.392015,8.899136,8.57982,9.795769,7.705012,+
4,6.354046,11.074266,9.968607,5.876592,8.861144,9.276507,7.643929,6.793206,11.425601,8.650009,...,9.53895,5.665971,6.726317,6.362663,6.229215,10.877762,8.127024,9.346105,7.549302,+


In [5]:
# Input features = every column of the raw table except the target column.
feature_col_names = raw_data.columns.tolist()
feature_col_names.remove(target_col_name)  # raises ValueError if the target column is absent

In [6]:
# Encode the target column as integers:
#   class_0 = ER- = '-'  -> 0
#   class_1 = ER+ = '+'  -> 1
#
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `raw_data[target_col_name]`. That form is a
# chained in-place update on an intermediate Series: pandas deprecates it and,
# under Copy-on-Write, it no longer modifies `raw_data` at all.
#
# `astype(int)` guarantees an integer column regardless of pandas' downcasting
# behaviour, and fails loudly if any label other than '+'/'-' (or an
# already-encoded 0/1) is present. Re-running the cell is a no-op.
raw_data[target_col_name] = (
    raw_data[target_col_name].replace({'+': 1, '-': 0}).astype(int)
)

In [7]:
# Separate the table into the label vector y and the input-feature matrix X.
y = raw_data.loc[:, target_col_name].values
X = raw_data.drop(target_col_name, axis='columns').values

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every input feature to the 0-1 range so that features with
# larger raw magnitudes do not carry more significance in the network.
# NOTE(review): the scaler is fitted on all rows; if a train/test split is made
# downstream, confirm that fitting on the full dataset is acceptable.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [10]:
# Reassemble the scaled features and the encoded target into one DataFrame,
# with the target appended as the last column.
# Note: nothing is shuffled here; rows keep the order of the raw table.
data = pd.DataFrame(X, columns=feature_col_names).assign(**{target_col_name: y})
data.head(6)

Unnamed: 0,GE_GRB7,GE_LSM1,GE_CLNS1A,GE_STARD3,GE_PPFIA1,GE_ERBB2,GE_INTS4,GE_ORMDL3,GE_PSMB3,GE_MTERFD1,...,GE_ZNHIT3,GE_FOXM1,GE_EXOC3,GE_ARMC7,GE_KCTD2,GE_PDCD10,GE_RHOD,GE_YBX1,GE_TRIM24,ER_Expr
0,0.207166,0.403983,0.42907,0.121615,0.281057,0.405837,0.244429,0.116286,0.085777,0.352553,...,0.454167,0.149432,0.325991,0.159269,0.072478,0.613769,0.702061,0.445289,0.337619,1
1,0.206144,0.371783,0.469734,0.0907,0.190297,0.40538,0.16998,0.133813,0.262245,0.378006,...,0.591977,0.222958,0.199906,0.301413,0.02636,0.657079,0.564644,0.448852,0.748465,1
2,0.384852,0.475795,0.3406,0.151908,0.142063,0.47903,0.059943,0.215561,0.42238,0.547095,...,0.711554,0.168328,0.309997,0.346375,0.25228,0.753467,0.746902,0.525268,0.640394,1
3,0.189764,0.281614,0.371738,0.097278,0.175318,0.407093,0.141558,0.139288,0.227771,0.384555,...,0.360683,0.356054,0.334337,0.457718,0.18383,0.57818,0.557034,0.497003,0.461739,1
4,0.190117,0.412813,0.410574,0.095727,0.150199,0.351055,0.124895,0.159073,0.312584,0.413639,...,0.731949,0.147099,0.112362,0.212637,0.141607,0.940482,0.47771,0.410614,0.413202,1
5,0.284971,0.348664,0.179224,0.137436,0.13091,0.472741,0.144021,0.177227,0.206657,0.443424,...,0.333583,0.14127,0.32518,0.617195,0.376554,0.465586,0.560524,0.494444,0.483431,0


In [11]:
# Invariant for the saved file: the target must be the final column.
assert target_col_name == data.columns[-1], 'Target column must be last column in DataFrame'

# Save Clean Data

In [14]:
# Initialise new empty dataset folder
# `init_dataset_dir` is a project-local helper (presumably importable because of
# the sys.path entry added at the top of the notebook — TODO confirm).
from model.generation.helpers import init_dataset_dir

# Parent folder under which the per-dataset directory lives.
path_to_data_folder = '../'
# Judging by the recorded output, this targets `<path_to_data_folder>/<dataset_name>/`
# plus `neural_network_initialisation` and `cross_validation` sub-folders, and
# only reports when they already exist. Presumably it creates them when
# missing — verify against the helper's source.
init_dataset_dir.run(dataset_name=dataset_name, path_to_data_folder=path_to_data_folder)

Directory  ../MB-GE-ER/  already exists
Directory  ../MB-GE-ER/neural_network_initialisation  already exists
Directory  ../MB-GE-ER/cross_validation  already exists


In [15]:
# Folder for this dataset's cleaned data. Built from `path_to_data_folder` (the
# same base passed to `init_dataset_dir.run`) instead of repeating the '../'
# literal, so the two locations cannot drift apart.
data_path = path_to_data_folder + dataset_name + '/'

In [16]:
# Write the cleaned table to <data_path>/data.csv, without the row index.
data.to_csv(path_or_buf=data_path + 'data.csv', index=False)

In [21]:
# Sanity check: reload the file that was just written.
# The path is built from `data_path` rather than a second hard-coded
# "../MB-GE-ER/data.csv", so it keeps following `dataset_name`.
test = pd.read_csv(data_path + 'data.csv')
# The reloaded table must have the same number of rows and columns as the one saved.
assert test.shape == data.shape, 'Reloaded data does not match the saved DataFrame'

In [17]:
# NOTE(review): not referenced by any other cell shown in this notebook;
# looks like leftover scratch — confirm before removing.
l = [1, 6, 8]

In [18]:
def set_rows_to_constant(frame, row_labels, column_name, value):
    """Set `column_name` to `value` for every row in `row_labels`, in place.

    This replaces a pasted template that looped over placeholder lists and
    wrote to an undefined `nameOfYourDataFrame`, which raised NameError and
    stopped a Restart & Run All. The same operation is now a reusable helper
    that does nothing until it is called.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to modify; it is mutated in place.
    row_labels : iterable
        Index labels of the rows to update.
    column_name : str
        Column to write to.
    value : object
        Constant written to each selected row.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object, to allow chaining.

    Example
    -------
    set_rows_to_constant(df, ["feature1", "feature2"], "label", 0)
    set_rows_to_constant(df, ["feature4", "feature5"], "label", 1)
    """
    # One vectorised assignment instead of a Python loop over the labels.
    frame.loc[list(row_labels), column_name] = value
    return frame

In [19]:
# Preview the first five rows of the raw table again.
# NOTE(review): the saved output for this cell differs from the first preview
# (row 1 shows 0.0 where 6.479166 was shown earlier, and the target is still
# '+'/'-'), which suggests `raw_data` was changed or reloaded by cells that are
# no longer in the notebook — re-run from a fresh kernel to confirm.
raw_data.head(n=5)

Unnamed: 0,GE_GRB7,GE_LSM1,GE_CLNS1A,GE_STARD3,GE_PPFIA1,GE_ERBB2,GE_INTS4,GE_ORMDL3,GE_PSMB3,GE_MTERFD1,...,GE_ZNHIT3,GE_FOXM1,GE_EXOC3,GE_ARMC7,GE_KCTD2,GE_PDCD10,GE_RHOD,GE_YBX1,GE_TRIM24,ER_Expr
0,6.487146,11.031136,10.079148,6.031175,9.487621,9.729606,8.253549,6.511952,10.423016,8.419615,...,7.922863,5.677913,7.426484,6.194931,5.962671,9.093493,9.407661,9.52659,7.306828,+
1,0.0,10.87385,10.322188,5.84657,9.05311,9.725825,7.873861,6.627166,11.203083,8.515613,...,8.724618,6.054136,7.013241,6.641685,5.784853,9.33002,8.623256,9.545137,8.62484,+
2,7.874277,11.381911,9.550389,6.212065,8.822194,10.334979,7.312676,7.164516,11.910947,9.153356,...,9.420298,5.774602,7.374062,6.782999,6.65594,9.856422,9.663621,9.942894,8.278145,+
3,6.351296,10.433402,9.73649,5.885852,8.981401,9.739996,7.72891,6.66315,11.050693,8.540313,...,7.37899,6.735173,7.453835,7.132947,6.392015,8.899136,8.57982,9.795769,7.705012,+
4,6.354046,11.074266,9.968607,5.876592,8.861144,9.276507,7.643929,6.793206,11.425601,8.650009,...,9.53895,5.665971,6.726317,6.362663,6.229215,10.877762,8.127024,9.346105,7.549302,+
