<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Gene-Expression-Data-Cleaning" data-toc-modified-id="Gene-Expression-Data-Cleaning-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Gene Expression Data Cleaning</a></span></li><li><span><a href="#Clinical-Classification-Target-Extraction" data-toc-modified-id="Clinical-Classification-Target-Extraction-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Clinical Classification Target Extraction</a></span></li><li><span><a href="#Data-Harmonization-and-Normalization" data-toc-modified-id="Data-Harmonization-and-Normalization-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data Harmonization and Normalization</a></span><ul class="toc-item"><li><span><a href="#Harmonization" data-toc-modified-id="Harmonization-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Harmonization</a></span></li><li><span><a href="#Normalization" data-toc-modified-id="Normalization-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Normalization</a></span></li></ul></li></ul></div>

In [1]:
# Packages
import os
import sys

#misc
import random
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn

# Report the interpreter, operating system and library versions used for this
# run, in a fixed order, followed by a separator line.
versionReport = [
    ("Python version: {}", sys.version),
    ("OS version: {}", os.uname()),
    ("pandas version: {}", pd.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("NumPy version: {}", np.__version__),
    ("SciPy version: {}", sp.__version__),
    ("scikit-learn version: {}", sklearn.__version__),
]
for template, value in versionReport:
    print(template.format(value))

print('-'*25)

Python version: 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
OS version: posix.uname_result(sysname='Darwin', nodename='Bings-MacBook-Pro.local', release='17.4.0', version='Darwin Kernel Version 17.4.0: Sun Dec 17 09:19:54 PST 2017; root:xnu-4570.41.2~1/RELEASE_X86_64', machine='x86_64')
pandas version: 0.23.1
matplotlib version: 2.2.2
NumPy version: 1.14.5
SciPy version: 1.1.0
scikit-learn version: 0.19.1
-------------------------


In [2]:
# Directory holding the pickled gene expression and clinical tables
PATH = "AML-DATA/"

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
masterGen = pd.read_pickle(os.path.join(PATH, "PD-TARGET-AML-GE"))
discoveryClin = pd.read_pickle(os.path.join(PATH, "PD-TARGET-AML-Discovery-ClinicalData"))
validationClin = pd.read_pickle(os.path.join(PATH, "PD-TARGET-AML-Validation-ClinicalData"))


## Gene Expression Data Cleaning

In [3]:
# Total number of missing entries in the expression table (0 means no nulls)
masterGen.isnull().values.sum()

0

In [4]:
# Remove duplicated column labels (patients at this point), keeping the first occurrence
colCount = len(masterGen.columns)
isDuplicatePatient = masterGen.columns.duplicated()
masterGen = masterGen.loc[:, ~isDuplicatePatient]
print("Duplicate patients removed: " + str(colCount - len(masterGen.columns)))


Duplicate patients removed: 0


In [5]:
# Transpose so that patients become rows and probes become columns, which makes
# per-probe (column) operations straightforward
masterGen = masterGen.T

# Remove duplicated column labels (probes after the transpose), keeping the first occurrence
colCount = len(masterGen.columns)
isDuplicateProbe = masterGen.columns.duplicated()
masterGen = masterGen.loc[:, ~isDuplicateProbe]
print("Duplicate probes removed: " + str(colCount - len(masterGen.columns)))


Duplicate probes removed: 0


In [6]:
# No variance: drop probes (columns) whose value is identical for every patient
colCount = masterGen.shape[1]
probeStd = masterGen.std()
# DataFrame.drop returns a new frame unless inplace=True, so the result has to be
# assigned back; otherwise nothing is removed and the count below is always 0.
masterGen = masterGen.drop(probeStd[probeStd == 0].index, axis=1)
print("Probes/Columns with 0 variance removed: " + str(colCount - masterGen.shape[1]))
# masterGen deliberately stays with patients as rows and probes as columns:
# the harmonization cells below iterate over masterGen.index as patient IDs.
    

Probes/Columns with 0 variance removed: 0


In [7]:
# Check for null/infinite values (should print 'False')
# +/-inf are mapped to NaN first so that a single isnull() pass covers both cases.
print(masterGen.replace([np.inf, -np.inf], np.nan).isnull().values.any())

False


## Clinical Classification Target Extraction
Relapse - recurrence of a past medical condition

Censored - the observation period (study) ended before the event (relapse) occurred

Note: In the future other clinical features (age, race, gender) can be extracted for use in relapse prediction.

Note 2: For now, patients whose first event is neither "Relapse" nor "Censored" are dropped.



In [8]:
# Flip both clinical tables so that clinical fields such as "First Event"
# can be selected as columns in the cells below
discoveryClin, validationClin = discoveryClin.T, validationClin.T

In [9]:
# Keep only patients whose first event is "Censored" or "Relapse"
keptEvents = ["Censored", "Relapse"]

discoveryClin = discoveryClin.loc[discoveryClin["First Event"].isin(keptEvents)]
validationClin = validationClin.loc[validationClin["First Event"].isin(keptEvents)]

# Outcome counts per cohort: discovery first, then validation
print(discoveryClin["First Event"].value_counts())
print("-"*30)
print(validationClin["First Event"].value_counts())

Relapse     218
Censored    163
Name: First Event, dtype: int64
------------------------------
Censored    342
Relapse     160
Name: First Event, dtype: int64


In [10]:
# Stack the discovery and validation cohorts into one clinical table
masterClin = pd.concat([discoveryClin, validationClin], axis=0)

# Row count and outcome distribution of the combined table
print(masterClin.shape[0])
eventCounts = masterClin["First Event"].value_counts()
print(eventCounts)


883
Censored    505
Relapse     378
Name: First Event, dtype: int64


In [11]:
from sklearn.preprocessing import LabelBinarizer

# Encode the "First Event" outcome as a single numeric column.
# NOTE(review): with the two remaining classes the encoder should yield
# Censored -> 0 and Relapse -> 1 (classes sorted alphabetically) - confirm via labelbinarizer.classes_.
labelbinarizer = LabelBinarizer()
encodedEvents = labelbinarizer.fit_transform(masterClin["First Event"])

# fit_transform returns a 2-D array; take its first column and re-attach the patient index
masterClin["First Event"] = pd.Series(encodedEvents[:, 0], index = masterClin.index)

## Data Harmonization and Normalization

### Harmonization

In [12]:
# Some index labels appear more than once in the combined table; keep the first row for each
isRepeatedPatient = masterClin.index.duplicated(keep='first')
masterClin = masterClin.loc[~isRepeatedPatient]
len(masterClin)

780

In [13]:
# List the clinical fields (columns) available in the combined clinical table
masterClin.columns

Index(['Gender', 'Race', 'Ethnicity', 'Age at Diagnosis in Days',
       'First Event', 'Event Free Survival Time in Days', 'Vital Status',
       'Overall Survival Time in Days', 'Year of Diagnosis',
       'Year of Last Follow Up', 'Protocol', 'WBC at Diagnosis',
       'Bone marrow leukemic blast percentage (%)', 'Peripheral blasts (%)',
       'CNS disease', 'Chloroma', 'FAB Category', 't(6;9)', 't(8;21)',
       't(3;5)(q25;q34)', 't(6;11)(q27;q23)', 't(9;11)(p22;q23)',
       't(10;11)(p11.2;q23)', 't(11:19)(q23:p13.1)', 'inv(16)', 'del5q',
       'del7q', 'del9q', 'monosomy 5', 'monosomy 7', 'trisomy 8', 'trisomy 21',
       'MLL', 'Minus Y', 'Minus X', 'Cytogenetic Code Other',
       'Cytogenetic Complexity', 'Primary Cytogenetic Code', 'ISCN',
       'FLT3/ITD positive?', 'FLT3/ITD allelic ratio', 'FLT3 PM',
       'NPM mutation', 'CEBPA mutation', 'WT1 mutation',
       'c-Kit Mutation Exon 8', 'c-Kit Mutation Exon 17',
       'MRD at end of course 1', 'MRD % at end of cours

In [14]:
# Keep only the outcome label plus the inv(16), t(8;21) and Risk group fields
clinicalFields = ["First Event", "inv(16)", "t(8;21)", "Risk group"]
masterClin = masterClin[clinicalFields]

In [15]:
# Drop repeated measurements per patient, keeping the first one encountered.
# The first 16 characters of a sample ID are treated as the patient ID
# (e.g. "TARGET-20-PABHET").
patientIds = masterGen.index.str[:16]
isRepeat = patientIds.duplicated(keep="first")

# Sample IDs that are discarded (kept for inspection)
dropList = list(masterGen.index[isRepeat])

# Select by position mask rather than dropping by label: a label-based drop
# would remove every row sharing that label, including the first measurement.
masterGen = masterGen.loc[~isRepeat]
        
        


In [16]:
# Shorten every row label to its first 16 characters
masterGen = masterGen.rename(index=lambda sampleId: sampleId[:16])

In [17]:
# Place the clinical fields next to the expression values, aligned on the row
# index. The outer join keeps rows present in either table (gaps become NaN).
masterGen = pd.concat(
    [masterGen, masterClin],
    axis=1,
    join="outer",
    sort=True,
)

In [18]:
# Keep only rows without any missing value. After this step masterGen contains
# the patients that have both clinical outcomes and gene expression data.
masterGen = masterGen.dropna(axis=0)

# Optional restriction to one risk group (currently disabled):
#masterGen = masterGen[masterGen['Risk group'] == "Low"]

In [19]:
# Number of patients remaining after the merge and NaN filtering
nSelected = len(masterGen)
print(str(nSelected) + " selected patients")

215 selected patients


In [20]:
# Separating: the outcome column becomes the label vector, and all clinical
# columns are removed from the expression table
clinicalColumns = ["First Event", "inv(16)", "t(8;21)", "Risk group"]

masterClin = masterGen["First Event"]
masterGen = masterGen.drop(columns=clinicalColumns)


In [21]:
# Class balance of the encoded "First Event" label among the selected patients
print(masterClin.value_counts())

1.0    110
0.0    105
Name: First Event, dtype: int64


### Normalization

In [22]:
# Standardization: z-score every probe (column) using its mean and sample
# standard deviation over the selected patients.
probeMean = masterGen.mean()

# A probe that is constant across the selected patients has std == 0 and would
# turn into an all-NaN column (0/0). The zero-variance check earlier in the
# notebook runs before patients are filtered, so it does not rule this out.
# Dividing by 1 instead leaves such a probe at 0 and keeps the column layout.
probeStd = masterGen.std().replace(0, 1)

# NOTE(review): the statistics use every selected patient; if a train/test split
# happens downstream, consider computing them on the training portion only.
masterGen = (masterGen - probeMean) / probeStd


In [23]:
# Preview the first rows of the standardized expression table
masterGen.head()

Unnamed: 0,7892501,7892502,7892503,7892504,7892505,7892506,7892507,7892508,7892509,7892510,...,8180409,8180410,8180411,8180412,8180413,8180414,8180415,8180416,8180417,8180418
TARGET-20-PABHET,0.115702,-0.659919,0.990606,0.747772,-1.375789,-0.409011,1.311737,0.535296,-1.791495,0.927586,...,-2.229502,-0.608278,0.076543,-1.519721,-1.316819,0.716646,1.459256,-0.503649,1.087119,-0.128625
TARGET-20-PABHKY,-0.805107,0.122982,-0.753118,0.840004,0.198071,0.036457,0.253967,0.495582,-0.873414,0.904921,...,-0.301883,-1.958429,1.069414,-0.566574,-0.589981,-0.118408,0.479627,-0.355877,-0.853341,-0.730754
TARGET-20-PACDZR,-0.339787,1.371373,2.020408,1.097878,-1.157914,-0.414955,-0.196788,1.296102,-0.554066,1.023062,...,-1.226867,-0.020957,-0.217985,0.398909,0.408032,-0.200669,-0.017515,1.840733,0.542958,-1.363206
TARGET-20-PACEGD,0.333312,1.773869,-0.753256,-0.678328,0.073581,0.105472,0.616729,0.49648,0.493032,0.074393,...,-0.48705,0.546995,-0.041996,-0.146195,-0.186364,-0.612194,-1.198603,-0.285108,0.143702,-0.579833
TARGET-20-PADDXZ,0.038041,-0.605163,-0.19148,0.657912,-0.359776,1.491079,0.093595,1.014715,-0.392931,1.832531,...,-1.033166,-0.498025,-0.33587,-0.115277,-0.129279,0.10096,-0.958352,0.021327,0.157577,-0.529318


In [24]:
# Persist the processed features (X) and labels (y) in the same data directory
# the inputs were read from, reusing PATH instead of repeating the literal.
masterGen.to_pickle(PATH + "PROCESSED-TARGET-AML-X.pkl")
masterClin.to_pickle(PATH + "PROCESSED-TARGET-AML-Y.pkl")