In [14]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm
import gc
from numba import jit

In [3]:
def _clean(x, default='float'):   
    non_default = 'int' if default=='float' else 'float'
    try:
        x.replace([np.inf, -np.inf], np.nan, inplace=True)
        x.dropna(how='all', axis=1, inplace=True)
        if default=='float':
            x = x * 1.0
        else:
            x = x * 1
    except Exception as e:
        print(e)
        for col in tqdm.tqdm(x.columns):
            if 'object' in str(x[col].dtypes):
                try:
                    x[col] = x[col].astype(default)
                except:
                    try:
                        x[col] = x[col].astype(non_default)
                    except:
                        print(col)
                        x[col] = x[col].astype('category')
    return x

def get_transposed(df, NameRow='GenX', prefix='GenX'):
    transposed  = df.T
    new_index = transposed.loc[[NameRow]].values.tolist()[0]
    transposed.columns = new_index
    if prefix is not None:
        transposed.columns = [prefix+'_'+_col for _col in transposed.columns.values.tolist()]
    return transposed.drop(NameRow, axis=0, inplace=False)

# def _impute(df, type='FM):
#   # use similarity to replace missing values with values for most similar patients.

In [4]:
# loading data

In [5]:
pheno_small = pd.read_csv("../_docs/Lung_Phenotype_Metadata.txt", sep="\t")
pheno_large = pd.read_csv("../_docs/Lung_Table_Phenotypes.txt", sep="\t")
pheno_large.set_index('submitter_id.samples', inplace=True)

In [175]:
sourceDir = "/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2/"  #"/media/bramvanes/Extra/DATA/RexR/2018" #
gene_expression = pd.read_table(sourceDir+"/Lung/Lung_GeneExpression.txt", sep="\t")

affx_labels = list(gene_expression.set_index('Gene').filter(axis=0, regex=r"^(AFFX.*)").index) # AFFX 
gene_expression = gene_expression.drop(affx_labels, axis=0)

gene_expression = gene_expression[np.isfinite(gene_expression.Start)]
gene_expression = gene_expression[np.isfinite(gene_expression.Stop)]

gene_expression.Start = gene_expression.Start.astype(int).astype(str)
gene_expression.Stop = gene_expression.Stop.astype(int).astype(str)
gene_expression.Chr = gene_expression.Chr.astype(str)
gene_expression.Gene = gene_expression.Gene.astype(str)

gene_expression['GenX'] = gene_expression[['Gene', 'Chr', 'Start', 'Stop']].apply(lambda x: '.'.join(x), axis=1)
_map_RNA = gene_expression[['Gene', 'GenX']]
gene_expression = gene_expression.drop(['Gene', 'Chr', 'Start', 'Stop'], axis=1)

# remove duplicate GenX values
gene_expression = gene_expression.copy().loc[~gene_expression.GenX.duplicated(keep='first')].shape

dict_RNA={'RNA_StrandPlus': _clean(get_transposed(gene_expression\
                                                  .loc[gene_expression.Strand=='+'].drop(['Strand'], axis=1))),
          'RNA_StrandMin':  _clean(get_transposed(gene_expression\
                                                  .loc[gene_expression.Strand=='-'].drop(['Strand'], axis=1)))                                
          }


In [7]:
# CLEAN MEMORY
del gene_expression
gc.collect()

7

In [8]:
# merge with phenotypes
pheno_features = ['batch_number', 'code.tissue_source_site', 'sample_type.samples', 'vial_number', 'ethnicity.demographic', 'gender.demographic',
                   'race.demographic', 'year_of_birth.demographic', 'diagnosis']

In [85]:
merged_RNA_min = dict_RNA['RNA_StrandMin'].merge(pheno_large[pheno_features], how='left', left_index=True, right_index=True)
merged_RNA_plus = dict_RNA['RNA_StrandPlus'].merge(pheno_large[pheno_features], how='left', left_index=True, right_index=True)

In [173]:
# There are patients with multiple measurements, we simply take the mean 
gene_columns = [_col for _col in merged_RNA_min.columns if 'GenX' in _col]
other_columns = [_col for _col in merged_RNA_min.columns if 'GenX' not in _col]

merged_RNA_min['patient_nr'] = merged_RNA_min.index
tmp_min = merged_RNA_min[['patient_nr']+gene_columns].groupby(by='patient_nr').mean()
merged_RNA_min = merged_RNA_min.copy()[other_columns].merge(tmp_min, how='right', left_index=True, right_on='patient_nr')
merged_RNA_min = merged_RNA_min.loc[~merged_RNA_min.index.duplicated(keep='first')]

gene_columns = [_col for _col in merged_RNA_plus.columns if 'GenX' in _col]
other_columns = [_col for _col in merged_RNA_plus.columns if 'GenX' not in _col]

merged_RNA_plus['patient_nr'] = merged_RNA_plus.index
tmp_plus = merged_RNA_min[['patient_nr']+gene_columns].groupby(by='patient_nr').mean()
merged_RNA_plus = merged_RNA_plus.copy()[other_columns].merge(tmp_plus, how='right', left_index=True, right_on='patient_nr')
merged_RNA_plus = merged_RNA_plus.loc[~merged_RNA_plus.index.duplicated(keep='first')]

# Bias correction functions
## L/S, cohort-based normalisation

Mean-based $$\mathbf{x}^*=\frac{\mathbf{x}-\overline{\mathbf{x}}}{\sigma}$$


Median-based $$\mathbf{x}^*=\frac{\mathbf{x}-median(\mathbf{x})}{IQR}$$






In [155]:
# L/S
def _preprocess(df, cohorts = [], scaler = "standard", bias_removal = False, col_range = None, min_cohort_size=10, debug=False):
        if col_range is None:
            gene_columns = [_col for _col in df.columns if 'GenX' in _col]  
        else:                      
            cr = range(col_range[0], col_range[1])
            gene_columns = df.columns[cr]
        
        if len(cohorts)==0:
            cohorts = df.batch_number.unique().tolist()
        
        # MinMaxScaler(), MaxAbsScaler(), RobustScaler(), QuantileTransformer(), Normalizer()
        if scaler == "standard":
            scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
        elif scaler == "minmax":
            scaler = preprocessing.MinMaxScaler()
        elif scaler == "maxabs":
            scaler = preprocessing.MaxAbsScaler()
        elif scaler == "robust":
            scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0), 
                                                    with_scaling=True, with_centering=True)
        elif scaler in ["normalizer", "normaliser"]:
            scaler = preprocessing.Normalizer()

        if bias_removal == True:
            print("- "*30, 'Removing cohort biases')
            i=0; itot=len(cohorts)
            
            for cohort in cohorts: # easy to parallelise...
                i+=1
                ch = df['batch_number']==cohort
                if sum(ch)<min_cohort_size:
                    print("Skipping cohort {}, because of low sample count: {}".format(cohort, sum(ch)))
                else:
                    if debug==False:
                        try:
                            # remove nans
                            #null_index = df.loc[ch,gene_columns].isnull()
                            res = scaler.fit_transform(df.loc[ch,gene_columns].values)
                            print("Transformation done..assigning values")
                            df.loc[ch,gene_columns] = pd.DataFrame(data=res, index=ch[ch].index, columns=gene_columns)
                            print("{}/{}, Corrected cohort {}, with {} samples".format(i, itot, cohort, sum(ch))) 
                        except Exception as e:
                            print("ERROR", e, "cohort:"+cohort)
                            print("index:",ch)
                            print("target:", df.loc[ch,gene_columns].shape) 
                            print("replacement:", res.shape)
                    else:
                        for _col in gene_columns: # for debugging
                            df_temp = df.loc[ch, _col].copy() 
                            try:
                                df.loc[ch, _col] = (df_temp-df_temp.mean())/df_temp.std()
                            except Exception as e:
                                print("ERROR", e, "gene:"+_col, "cohort:"+cohort)
        else:
            ch = df["array-batch"].isin(cohorts)
            df.loc[ch,gene_columns] = scaler.fit_transform(df.loc[ch,gene_columns])
            

        df = df[df["batch_number"].isin(cohorts)]
        return df       

In [147]:
merged_RNA_plus_CBC_LS = _preprocess(merged_RNA_plus.copy(), col_range=None, bias_removal=True, debug=False) # 29905

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  Removing cohort biases
Transformation done..assigning values
ERROR Shape of passed values is (30590, 18), indices imply (30578, 18) cohort:52.73.0
index: TCGA-05-4244-01     True
TCGA-05-4249-01     True
TCGA-05-4250-01     True
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5

Transformation done..assigning values
ERROR Shape of passed values is (30590, 36), indices imply (30578, 36) cohort:39.67.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    False
TCGA-NC-A5HO-01    False
TCGA-NC-A5HP-01    

Transformation done..assigning values
ERROR Shape of passed values is (30590, 18), indices imply (30578, 18) cohort:60.69.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    False
TCGA-NC-A5HO-01    False
TCGA-NC-A5HP-01    

target: (21, 30590)
replacement: (21, 30590)
Transformation done..assigning values
ERROR Shape of passed values is (30590, 11), indices imply (30578, 11) cohort:415.35.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    Fals

target: (32, 30590)
replacement: (32, 30590)
Transformation done..assigning values
ERROR Shape of passed values is (30590, 44), indices imply (30578, 44) cohort:214.48.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    Fals

target: (36, 30590)
replacement: (36, 30590)
Transformation done..assigning values
ERROR Shape of passed values is (30590, 50), indices imply (30578, 50) cohort:160.68.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    Fals

Transformation done..assigning values
ERROR Shape of passed values is (30590, 18), indices imply (30578, 18) cohort:222.59.0
index: TCGA-05-4244-01    False
TCGA-05-4249-01    False
TCGA-05-4250-01    False
TCGA-05-4382-01    False
TCGA-05-4384-01    False
TCGA-05-4389-01    False
TCGA-05-4390-01    False
TCGA-05-4395-01    False
TCGA-05-4396-01    False
TCGA-05-4397-01    False
TCGA-05-4398-01    False
TCGA-05-4402-01    False
TCGA-05-4403-01    False
TCGA-05-4405-01    False
TCGA-05-4410-01    False
TCGA-05-4415-01    False
TCGA-05-4417-01    False
TCGA-05-4418-01    False
TCGA-05-4420-01    False
TCGA-05-4422-01    False
TCGA-05-4424-01    False
TCGA-05-4425-01    False
TCGA-05-4426-01    False
TCGA-05-4427-01    False
TCGA-05-4430-01    False
TCGA-05-4432-01    False
TCGA-05-4433-01    False
TCGA-05-4434-01    False
TCGA-05-5420-01    False
TCGA-05-5423-01    False
                   ...  
TCGA-NC-A5HM-01    False
TCGA-NC-A5HN-01    False
TCGA-NC-A5HO-01    False
TCGA-NC-A5HP-01   

In [None]:
merged_RNA_plus_CBC_LS.mean().mean()

In [None]:
## PCA-shift


## ANOVA 2-way


## Combat
# use R-script, call from PYthon