# Combining omics for classification of phenotypes

Ritchie et al. (Methods of integrating data to uncover genotype-phenotype interactions) describe the following ways to integrate multi-omic data:

* Concatenation-based: combine all datasets
* Model-based: create a model per dataset, then combine the models
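* Transformation-based: transform each dataset into an intermediate representation (e.g. a graph or a kernel matrix), then combine the representations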

We present a fourth and a fifth multi-omic method:
* Reduced normalised concatenation
* Model-based inter-omic transformation

For each sub-omic we collect important features by
* comparing the non-parametric distributions over the different classifications
* simply counting the occurrences and setting a cut-off point
* using the importances of the classification models as filters

We then have the choice to collect these features
*  greedily
*  non-greedily

To find inter **and** intra-omic connections we can resort to a similarity measure. 

## Load libraries..

In [1]:
import seaborn as sns
from ggplot import *
from matplotlib import pyplot as plt
import bokeh

import pandas as pd
import dask.dataframe as dd
import numpy as np
import scipy as sc
import statsmodels as sm
import networkx as nx

import sklearn as sk
import tensorflow as tf
import keras
import lightgbm as lgbm
import tpot

import sys
import os
import gc


# Notebook display settings: cap the number of rows/columns pandas renders and
# show floats with a single decimal.
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.1f}'.format)

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load in data..

In [5]:
def read_table(name, loc="gc"):
    """Load one tab-separated data file into a DataFrame.

    Parameters
    ----------
    name : str
        File name; it is appended verbatim to the root selected by ``loc``.
    loc : str, optional
        ``"gc"`` (default) reads from the Google Cloud Storage bucket,
        ``"local"`` reads from the hard-coded local data directory.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If ``loc`` is not one of the known locations.
    """
    file_roots = {
        "gc": "https://storage.googleapis.com/genx_2018/",
        "local": "/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2/Melanoma/",
    }
    # Fail with a clear message instead of hitting an unbound ``file_root``.
    if loc not in file_roots:
        raise ValueError(
            "Unknown loc %r; expected one of %s" % (loc, sorted(file_roots))
        )
    return pd.read_table(file_roots[loc] + name, sep="\t")

In [117]:
# Methylation table, read from the local data directory.
data_methylation = read_table(name="Melanoma_Methylation.txt", loc="local")

In [149]:
# Remaining omics tables, all read from the local data directory.
data_mutation = read_table(name="Melanoma_Mutation.txt", loc="local")
data_cnv = read_table(name="Melanoma_CNV.txt", loc="local")
data_RNA = read_table(name="Melanoma_GeneExpression.txt", loc="local")
data_miRNA = read_table(name="Melanoma_miRNA.txt", loc="local")
data_protein = read_table(name="Melanoma_Proteome.txt", loc="local")

  if self.run_code(code, result):
  if self.run_code(code, result):


## create sub-omics 

### Methylation data

In [7]:
# Number of methylation probes per strand and CpG-island relation.
data_methylation.groupby(by=['Strand', 'Relation_CpG_Island'])[['probeID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,probeID
Strand,Relation_CpG_Island,Unnamed: 2_level_1
+,Island,44246
+,N_Shelf,9788
+,N_Shore,21200
+,S_Shelf,8439
+,S_Shore,15899
-,Island,44514
-,N_Shelf,9191
-,N_Shore,20276
-,S_Shelf,8834
-,S_Shore,16121


In [9]:
# Number of methylation probes per chromosome.
data_methylation.groupby(by=['Chr'])[['probeID']].count()

Unnamed: 0_level_0,probeID
Chr,Unnamed: 1_level_1
chr1,46856
chr10,24365
chr11,28794
chr12,24539
chr13,12285
chr14,15074
chr15,15258
chr16,21969
chr17,27876
chr18,5922


In [118]:
# Keep only probes with a finite Start and Stop. The explicit copy makes the
# column assignments below act on an independent frame, which avoids pandas'
# SettingWithCopyWarning for assignments on a filtered frame.
data_methylation = data_methylation[
    np.isfinite(data_methylation.Start) & np.isfinite(data_methylation.Stop)
].copy()

# Positions become integer strings so they can be embedded in the key below.
data_methylation['Start'] = data_methylation['Start'].astype(int).astype(str)
data_methylation['Stop'] = data_methylation['Stop'].astype(int).astype(str)
data_methylation['Chr'] = data_methylation['Chr'].astype(str)
data_methylation['Gene'] = data_methylation['Gene'].astype(str)

# 'GenX' key = Gene.Chr.Start.Stop, built with vectorised string concatenation
# (same strings as a row-wise '.'.join, without the per-row Python call).
data_methylation['GenX'] = (
    data_methylation['Gene'] + '.' + data_methylation['Chr'] + '.'
    + data_methylation['Start'] + '.' + data_methylation['Stop']
)

# The key replaces the identifying columns; 'GenX' stays the last column.
data_methylation = data_methylation.drop(
    ['probeID', 'Chr', 'Start', 'Stop', 'Gene'], axis=1
)
# Keep rows that have at least 4 non-missing values.
data_methylation = data_methylation.dropna(thresh=4, axis=0)

In [165]:
def get_transposed(df):
    """Transpose ``df`` and use its 'GenX' values as the column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per feature and a 'GenX' column holding the
        feature keys; every other column becomes a row of the result.

    Returns
    -------
    pandas.DataFrame
        The transposed frame, with the 'GenX' row removed and its values
        used as column labels.

    Raises
    ------
    KeyError
        If ``df`` has no 'GenX' column.
    """
    transposed = df.T
    # Look the label row up by name rather than by position, so the result is
    # correct even when 'GenX' is not the last column of ``df``.
    transposed.columns = transposed.loc['GenX'].tolist()
    return transposed.drop('GenX', axis=0)

In [120]:
# One transposed sub-table per (strand, CpG-island relation) combination.
# A relation value of None selects the rows whose relation is missing.
dict_methylation = {
    'Strand_{}_CpG_{}'.format(strand_tag, relation_tag): get_transposed(
        data_methylation.loc[
            (data_methylation.Strand == strand_symbol)
            & (
                data_methylation.Relation_CpG_Island.isna()
                if relation_value is None
                else data_methylation.Relation_CpG_Island == relation_value
            )
        ].drop(['Strand', 'Relation_CpG_Island'], axis=1)
    )
    for strand_tag, strand_symbol in (('plus', '+'), ('min', '-'))
    for relation_tag, relation_value in (
        ('Island', 'Island'),
        ('Nshelf', 'N_Shelf'),
        ('Nshore', 'N_Shore'),
        ('Sshelf', 'S_Shelf'),
        ('Sshore', 'S_Shore'),
        ('NaN', None),
    )
}
# The full table is no longer needed once it has been split.
del data_methylation
gc.collect()

288

### Mutation data

In [170]:
# Keep only mutations with a finite Start and Stop. The explicit copy makes
# the column assignments below act on an independent frame, which avoids
# pandas' SettingWithCopyWarning for assignments on a filtered frame.
data_mutation = data_mutation[
    np.isfinite(data_mutation.Start) & np.isfinite(data_mutation.Stop)
].copy()

# Positions become integer strings so they can be embedded in the key below.
data_mutation['Start'] = data_mutation['Start'].astype(int).astype(str)
data_mutation['Stop'] = data_mutation['Stop'].astype(int).astype(str)
data_mutation['Chr'] = data_mutation['Chr'].astype(str)
data_mutation['Gene'] = data_mutation['Gene'].astype(str)

# 'GenX' key = Gene.Chr.Start.Stop, built with vectorised string concatenation
# (same strings as a row-wise '.'.join, without the per-row Python call).
data_mutation['GenX'] = (
    data_mutation['Gene'] + '.' + data_mutation['Chr'] + '.'
    + data_mutation['Start'] + '.' + data_mutation['Stop']
)

In [171]:
# Lookup table from the GenX key to the distinct mutation annotations.
_map_mutation = data_mutation.loc[
    :, ['GenX', 'Ref', 'Alt', 'Amino_Acid_Change', 'Effect']
].drop_duplicates()


In [172]:
# Remove the positional and annotation columns in place.
data_mutation.drop(
    columns=['Chr', 'Start', 'Stop', 'Gene', 'DNA_VAF',
             'RNA_VAF', 'Amino_Acid_Change', 'Ref', 'Alt'],
    inplace=True,
)

In [173]:
# Values of the mutation 'Effect' column, split into three buckets.
rare_effects = [
    'Frame_Shift_Del',
    'Frame_Shift_Ins',
    'In_Frame_Del',
    'In_Frame_Ins',
    'Nonstop_Mutation',
    'Translation_Start_Site',
]
intermediate_effects = [
    'Splice_Site',
    'Nonsense_Mutation',
]
common_effects = [
    'Missense_Mutation',
    'Silent',
]

In [128]:
def cat_encode(src, target, col):
    """Append indicator (dummy) columns for ``src[col]`` to ``target``.

    The indicator columns are named ``<col>_<value>``. A new frame is
    returned; neither ``src`` nor ``target`` is modified.
    """
    indicator_columns = pd.get_dummies(src[col], prefix=col, prefix_sep='_')
    combined = pd.concat([target, indicator_columns], axis=1)
    return combined

def drop_nan_rows(table, col):
    """Return ``table`` without the rows whose ``col`` value is missing.

    Prints how many rows were dropped, plus the size before and after.
    """
    kept = table.dropna(subset=[col], how='any', axis=0)
    n_before = len(table)
    n_after = len(kept)
    print('Dropping %d nan rows for %s, sz before: %d, sz after: %d'
          % (n_before - n_after, col, n_before, n_after))
    return kept

def filter(table, col, value):
    """Return the rows of ``table`` whose ``col`` differs from ``value``.

    Prints how many rows were removed, plus the size before and after.
    """
    # NOTE(review): this name shadows the builtin ``filter``; it is kept
    # because callers may rely on it.
    keep_mask = table[col] != value
    kept = table.loc[keep_mask]
    n_before = len(table)
    n_after = len(kept)
    print('Filtering %d rows with "%s" for %s, sz before: %d, sz after: %d'
          % (n_before - n_after, value, col, n_before, n_after))
    return kept

def make_x(table, groups, count_filter=0):
    """Build a per-sample feature-count matrix from a long-format table.

    Parameters
    ----------
    table : pandas.DataFrame
        Long-format table with a 'Sample' column and the columns named in
        ``groups``.
    groups : list of str
        Columns whose string values are joined with '|' to form the
        feature key.
    count_filter : int, optional
        When greater than 0, only keys occurring strictly more often than
        this threshold are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Sample', one ``GenX_<key>`` column per retained key,
        holding the summed indicator values per sample.
    """
    def _feature_key(row):
        # Join groups: Gene_AADACL3|chr1|Translation_Start_Site
        return '|'.join(row).strip()

    # Occurrences per group, most frequent first (reported for inspection).
    group_counts = table.groupby(groups).size().reset_index(name='Count')
    group_counts = group_counts.sort_values(['Count'], ascending=False)
    print('Most mutated genes within {}:\n'.format(groups))
    print(group_counts.head(10))

    long_form = pd.DataFrame()
    long_form['Sample'] = table['Sample']
    long_form['GenX'] = table[groups].apply(_feature_key, axis=1)

    if count_filter > 0:
        group_counts['GenX'] = group_counts[groups].apply(_feature_key, axis=1)
        frequent_keys = group_counts.loc[
            group_counts['Count'] > count_filter, 'GenX'
        ]
        print('\nUsing count_filter > %d' % count_filter)
        print('Before: %d' % len(long_form))
        long_form = long_form.loc[long_form['GenX'].isin(frequent_keys)]
        print('After: %d' % len(long_form))

    print('\nUnique features: %d' % len(long_form['GenX'].unique()))

    # One indicator column per key, then drop the raw key column.
    encoded = cat_encode(long_form, long_form, 'GenX')
    encoded = encoded.loc[:, encoded.columns != 'GenX']

    # Collapse to one row per sample by summing the indicators.
    x = encoded.groupby('Sample', as_index=False).sum().set_index('Sample')
    print('Features: ', x.shape)

    return x

In [174]:
# Occurrence thresholds per effect bucket, passed to make_x as count_filter.
rare_count = 1
inter_count = 2
common_count = 9

# One per-sample feature matrix per effect bucket.
dict_mutation = {
    bucket_name: make_x(
        data_mutation.loc[data_mutation.Effect.isin(bucket_effects)]
                     .drop(['Effect'], axis=1),
        ['GenX'],
        count_filter=bucket_threshold,
    )
    for bucket_name, bucket_effects, bucket_threshold in (
        ('Rare', rare_effects, rare_count),
        ('Intermediate', intermediate_effects, inter_count),
        ('Common', common_effects, common_count),
    )
}
# The full table is no longer needed once it has been split.
del data_mutation
gc.collect()

Most mutated genes within ['GenX']:

                                 GenX  Count
2092  NOTCH2.chr1.120612003.120612004      9
2699     RRP36.chr6.42989414.42989419      8
2136    NUDT11.chrX.51239296.51239309      8
3057    SUCO.chr1.172501615.172501615      8
1074   FAM221B.chr9.35826156.35826156      6
2945  SNAPC4.chr9.139277995.139277997      6
1857    MESP2.chr15.90320121.90320144      5
553     CDC27.chr17.45219612.45219612      5
684   CNTNAP3B.chr9.43844265.43844265      5
3290     TSKU.chr11.76506673.76506675      5

Using count_filter > 1
Before: 3954
After: 546

Unique features: 224
Features:  (260, 224)
Most mutated genes within ['GenX']:

                                   GenX  Count
12759      NBPF1.chr1.16918653.16918653     20
4208   CNTNAP2.chr7.146829338.146829338     10
21108    TVP23C.chr17.15441469.15441469      7
20200  TMPRSS11B.chr4.69095234.69095234      7
5374        DCC.chr18.51025778.51025778      6
12474      MYH8.chr17.10304276.10304276      6
5         

812

### CNV data

In [151]:
# Keep only rows with a finite Start and Stop. The explicit copy makes the
# column assignments below act on an independent frame, which avoids pandas'
# SettingWithCopyWarning for assignments on a filtered frame.
data_cnv = data_cnv[
    np.isfinite(data_cnv.Start) & np.isfinite(data_cnv.Stop)
].copy()

# Positions become integer strings so they can be embedded in the key below.
data_cnv['Start'] = data_cnv['Start'].astype(int).astype(str)
data_cnv['Stop'] = data_cnv['Stop'].astype(int).astype(str)
data_cnv['Chr'] = data_cnv['Chr'].astype(str)
data_cnv['Gene'] = data_cnv['Gene'].astype(str)

# 'GenX' key = Gene.Chr.Start.Stop, built with vectorised string concatenation
# (same strings as a row-wise '.'.join, without the per-row Python call).
data_cnv['GenX'] = (
    data_cnv['Gene'] + '.' + data_cnv['Chr'] + '.'
    + data_cnv['Start'] + '.' + data_cnv['Stop']
)
# Keep the Gene -> GenX mapping before the identifying columns are dropped.
_map_cnv = data_cnv[['Gene', 'GenX']]
data_cnv = data_cnv.drop(['Gene', 'Chr', 'Start', 'Stop'], axis=1)

In [175]:
# One transposed CNV sub-table per strand.
dict_cnv = {
    strand_key: get_transposed(
        data_cnv.loc[data_cnv.Strand == strand_symbol].drop(['Strand'], axis=1)
    )
    for strand_key, strand_symbol in (('StrandPlus', '+'), ('StrandMin', '-'))
}
# The full table is no longer needed once it has been split.
del data_cnv
gc.collect()

30

### Gene expression data

In [177]:
# Keep only rows with a finite Start and Stop. The explicit copy makes the
# column assignments below act on an independent frame, which avoids pandas'
# SettingWithCopyWarning for assignments on a filtered frame.
data_RNA = data_RNA[
    np.isfinite(data_RNA.Start) & np.isfinite(data_RNA.Stop)
].copy()

# Positions become integer strings so they can be embedded in the key below.
data_RNA['Start'] = data_RNA['Start'].astype(int).astype(str)
data_RNA['Stop'] = data_RNA['Stop'].astype(int).astype(str)
data_RNA['Chr'] = data_RNA['Chr'].astype(str)
data_RNA['Gene'] = data_RNA['Gene'].astype(str)

# 'GenX' key = Gene.Chr.Start.Stop, built with vectorised string concatenation
# (same strings as a row-wise '.'.join, without the per-row Python call).
data_RNA['GenX'] = (
    data_RNA['Gene'] + '.' + data_RNA['Chr'] + '.'
    + data_RNA['Start'] + '.' + data_RNA['Stop']
)
# Keep the Gene -> GenX mapping before the identifying columns are dropped.
_map_RNA = data_RNA[['Gene', 'GenX']]
data_RNA = data_RNA.drop(['Gene', 'Chr', 'Start', 'Stop'], axis=1)


In [178]:
# One transposed gene-expression sub-table per strand.
dict_RNA = {
    strand_key: get_transposed(
        data_RNA.loc[data_RNA.Strand == strand_symbol].drop(['Strand'], axis=1)
    )
    for strand_key, strand_symbol in (('StrandPlus', '+'), ('StrandMin', '-'))
}
# The full table is no longer needed once it has been split.
del data_RNA
gc.collect()

142

### miRNA data

In [183]:
# Keep only rows with a finite Start and Stop. The explicit copy makes the
# column assignments below act on an independent frame, which avoids pandas'
# SettingWithCopyWarning for assignments on a filtered frame.
data_miRNA = data_miRNA[
    np.isfinite(data_miRNA.Start) & np.isfinite(data_miRNA.Stop)
].copy()

# Positions become integer strings so they can be embedded in the key below.
data_miRNA['Start'] = data_miRNA['Start'].astype(int).astype(str)
data_miRNA['Stop'] = data_miRNA['Stop'].astype(int).astype(str)
data_miRNA['Chr'] = data_miRNA['Chr'].astype(str)
data_miRNA['Name'] = data_miRNA['Name'].astype(str)

# 'GenX' key = Name.Chr.Start.Stop, built with vectorised string concatenation
# (same strings as a row-wise '.'.join, without the per-row Python call).
data_miRNA['GenX'] = (
    data_miRNA['Name'] + '.' + data_miRNA['Chr'] + '.'
    + data_miRNA['Start'] + '.' + data_miRNA['Stop']
)
# Keep the Name -> GenX mapping before the identifying columns are dropped.
_map_miRNA = data_miRNA[['Name', 'GenX']]
data_miRNA = data_miRNA.drop(['MIMATID', 'Name', 'Chr', 'Start', 'Stop'], axis=1)


In [185]:
# One transposed miRNA sub-table per strand.
dict_miRNA = {
    strand_key: get_transposed(
        data_miRNA.loc[data_miRNA.Strand == strand_symbol].drop(['Strand'], axis=1)
    )
    for strand_key, strand_symbol in (('StrandPlus', '+'), ('StrandMin', '-'))
}
# The full table is no longer needed once it has been split.
del data_miRNA
gc.collect()

537

In [None]:
# TODO: add the miRNA names to the gene map?

### Proteomic data

## normalise 

## feature selection 

## merge with phenotype data

## dimension reduction

# Classification

## Per omic

### Per sub omic

## Combined

## Prediction on non-chemo, non-immuno metastasis patients

# Feature analysis