In [2]:
import pandas as pd

In [16]:
import sys

# Make the DNN-RE sources importable (needed for the `model.*` import used when
# initialising the dataset folder). Guarded so that re-running this cell does
# not keep appending the same entry to sys.path.
DNN_RE_SRC = "../../DNN-RE/src"
if DNN_RE_SRC not in sys.path:
    sys.path.append(DNN_RE_SRC)

In [8]:
# Location of the raw CSV export, relative to this notebook
RAW_DATA_PATH = 'raw_data/TCGA-PANCAN.csv'

# Name of the dataset; also used as the name of the output dataset folder
dataset_name = 'TCGA-PANCAN'
# Column of the raw data that holds the class label
target_col_name = 'TCGA'

# TCGA-PANCAN Data Set
- **Input:** 20502 gene expression features
- **Output:** Classification, BRCA (300), KIRC (146), LUAD (141), PRAD (136), COAD (78).

## Preprocess Data

In [7]:
# Read the raw CSV and discard its first column
# (presumably a sample/row identifier rather than a feature -- TODO confirm).
raw_data = pd.read_csv(RAW_DATA_PATH)
raw_data = raw_data.drop(columns=raw_data.columns[0])
raw_data.head(2)

Unnamed: 0,GE_C6ORF125,GE_C6ORF124,GE_C6ORF127,GE_C6ORF126,GE_RNF10,GE_C6ORF120,GE_C6ORF123,GE_C6ORF122,GE_GTF2IP1,GE_REM1,...,GE_NPY5R,GE_GNGT2,GE_GNGT1,GE_TULP3,GE_PTRF,GE_BCL6B,GE_GSTK1,GE_SELP,GE_SELS,TCGA
0,10.795139,5.064279,1.010279,1.598651,11.581064,9.405622,2.185898,3.478079,12.622386,7.031572,...,1.334282,5.042242,0.0,8.582691,13.711968,8.638834,11.139436,8.95857,11.497238,PRAD
1,10.47332,3.593007,0.0,0.0,11.783248,9.017212,0.811142,2.094168,12.768244,1.706508,...,0.0,5.783399,0.0,9.576717,10.556142,7.965842,11.715482,5.673935,9.618864,LUAD


In [9]:
# Input feature names: every column of the raw data except the target column
feature_col_names = raw_data.columns.tolist()
feature_col_names.remove(target_col_name)

In [10]:
# Encode target data: replace each class label with an integer class index
#   class_0 = 'BRCA', class_1 = 'KIRC', class_2 = 'LUAD',
#   class_3 = 'PRAD', class_4 = 'COAD'
class_encoding = {'BRCA': 0, 'KIRC': 1, 'LUAD': 2, 'PRAD': 3, 'COAD': 4}

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form operates on a temporary Series and does not update raw_data when pandas
# Copy-on-Write is active (it already triggers a warning on recent pandas 2.x).
raw_data[target_col_name] = raw_data[target_col_name].replace(class_encoding)

In [11]:
# Separate the target column from the input features, both as NumPy arrays
y = raw_data[target_col_name].to_numpy()
X = raw_data.drop(columns=target_col_name).to_numpy()

In [12]:
# Normalise input features: rescale every column to the 0-1 range so that
# features with larger raw values do not dominate the network purely because
# of their scale.
# NOTE(review): the scaler is fitted on all rows of X; no train/test split is
# visible before this point -- confirm that this is intended.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [13]:
# Reassemble the preprocessed data: scaled features first, encoded target
# appended as the final column
data = pd.DataFrame(X, columns=feature_col_names).assign(**{target_col_name: y})
data.head(3)

Unnamed: 0,GE_C6ORF125,GE_C6ORF124,GE_C6ORF127,GE_C6ORF126,GE_RNF10,GE_C6ORF120,GE_C6ORF123,GE_C6ORF122,GE_GTF2IP1,GE_REM1,...,GE_NPY5R,GE_GNGT2,GE_GNGT1,GE_TULP3,GE_PTRF,GE_BCL6B,GE_GSTK1,GE_SELP,GE_SELS,TCGA
0,0.563625,0.613903,0.106702,0.191891,0.430613,0.830208,0.222853,0.528681,0.555251,0.739136,...,0.117458,0.537542,0.0,0.254679,0.810155,0.574122,0.673146,0.848337,0.646996,3
1,0.514822,0.435552,0.0,0.0,0.502248,0.795924,0.082696,0.318321,0.600241,0.179383,...,0.0,0.629735,0.0,0.508068,0.416198,0.4966,0.751673,0.537296,0.179977,2
2,0.524747,0.859935,0.0,0.0,0.445552,0.903406,0.046142,0.229094,0.385349,0.300723,...,0.0,0.187482,0.0,0.581856,0.216179,0.303846,0.0,0.270909,0.236619,3


In [14]:
# Sanity check: the target must be the right-most column of the frame
last_col_name = data.columns[-1]
assert last_col_name == target_col_name, 'Target column must be last column in DataFrame'

# Save Clean Data

In [17]:
# Set up the folder for this dataset via the DNN-RE helper.
# Judging by the cell output, this creates the dataset directory (if missing)
# plus its neural_network_initialisation and cross_validation subfolders.
from model.generation.helpers import init_dataset_dir

path_to_data_folder = '../'
init_dataset_dir.run(
    path_to_data_folder=path_to_data_folder,
    dataset_name=dataset_name,
)

Directory  ../TCGA-PANCAN/  already exists
Directory  ../TCGA-PANCAN/neural_network_initialisation  Created 
Directory  ../TCGA-PANCAN/cross_validation  Created 


In [18]:
# Folder of this dataset. Built from path_to_data_folder (the same base folder
# passed to init_dataset_dir.run) instead of repeating the '../' literal, so
# the save location cannot drift from the initialised directory.
data_path = path_to_data_folder + dataset_name + '/'

In [19]:
# Write the preprocessed frame to data.csv inside the dataset folder,
# leaving out the DataFrame index
cleaned_csv_path = data_path + 'data.csv'
data.to_csv(cleaned_csv_path, index=False)