In [1]:
# load in melanoma/lung, map cancer stage to ordinal scale, identify correlating RNA expressions

# MIC, Pearson, Spearman, Distance correlation, monotonic_alignment, pca/fa

import omic_helpers
import numpy as np
import scipy as sc
import pandas as pd
import vaex as vx
import seaborn as sn

import ipyvolume as ipv
import plotly
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode so figures can be drawn with
# plotly.offline.iplot inside this notebook.
plotly.offline.init_notebook_mode(connected=True)
# Light green seaborn colormap; not referenced again in the visible cells
# (presumably intended for styled DataFrame output — TODO confirm or remove).
cm = sn.light_palette("green", as_cmap=True)

import sys, os, gc
from collections import defaultdict

In [2]:
# Widen the pandas display limits so large/wide omics frames are shown
# without truncation in the notebook output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
def plot3d(xlist, title="", names=None):
    """Draw several point clouds in one interactive 3-D scatter plot.

    Parameters
    ----------
    xlist : iterable of 2-D arrays
        Each array is indexed as ``x[:, 0]``, ``x[:, 1]`` and ``x[:, 2]``;
        its first three columns are used as the x, y and z coordinates.
    title : str, optional
        Figure title.
    names : sequence of str, optional
        Legend label for each entry of ``xlist``. Entries without a
        matching label are left unnamed so plotly applies its own default,
        instead of raising ``IndexError`` as the previous ``names=[]``
        default did.

    Returns
    -------
    None
        The figure is rendered through ``plotly.offline.iplot``.
    """
    # Avoid a shared mutable default and tolerate short/empty label lists.
    labels = list(names) if names is not None else []

    fig = go.Figure(layout={'title': title})
    for idx, x in enumerate(xlist):
        label = labels[idx] if idx < len(labels) else None
        fig.add_trace(go.Scatter3d(
            x=x[:, 0],
            y=x[:, 1],
            z=x[:, 2],
            mode='markers',
            marker={
                'size': 9,
                'opacity': 0.25,
            },
            name=label
        ))
    # NOTE(review): the original built a zero-margin go.Layout here but never
    # attached it to the figure; that dead local was removed. Apply it with
    # fig.update_layout(margin=...) if tight margins are actually wanted.
    plotly.offline.iplot(fig)

In [4]:
# Work from the directory that contains the per-cancer-type data folders.
# The location can be overridden with the OMIC_DATA_DIR environment variable
# so the notebook is not tied to one machine; the original path stays the
# default to keep existing runs unchanged.
os.chdir(os.environ.get('OMIC_DATA_DIR',
                        '/media/bramiozo/DATA-FAST/genetic_expression/hackathon_2'))

In [5]:
# Cancer type to analyse; used below both as the data sub-directory name and
# as the file-name prefix.
ctype = 'Lung'

In [26]:
# Tab-separated omics tables for the selected cancer type, keyed by modality.
ds = {
    'RNAex': pd.read_csv('{0}/{0}_GeneExpression.txt'.format(ctype), sep='\t'),
    'mutation': pd.read_csv('{0}/{0}_Mutation.txt'.format(ctype), sep='\t'),
}
# Optional large modalities, currently disabled:
#ds['methylation'] = vx.open(ctype+'Lung_Methylation.hdf5')
#ds['methylation_meta'] = vx.open(ctype+'HumanMethylation450_meta.hdf5')
#ds['CNV'] = vx.open(ctype+'Lung_CNV.hdf5')

In [27]:
# miRNA: keep the annotation columns in a separate lookup frame (mimamap),
# then drop them and transpose so MIMATID values become the columns.
miRNA = pd.read_csv(ctype+'/'+ctype+'_miRNA.txt', sep="\t")
mimamap = miRNA[['MIMATID', 'Name', 'Chr', 'Start', 'Stop', 'Strand']]
miRNA = (miRNA
         .drop(['Name', 'Chr', 'Start', 'Stop', 'Strand'], axis=1)
         .set_index('MIMATID')
         .transpose())

proteome = pd.read_csv(ctype+'/'+ctype+"_Proteome.txt", sep="\t")
#proteome = proteome.set_index('sample').transpose()

# Phenotype metadata: keep the last record per SampleID and index by it.
meta = pd.read_csv(ctype+'/'+ctype+'_Phenotype_Metadata.txt', sep='\t')
# .copy() so the column assignment below works on an independent frame
# rather than on a filtered view (avoids SettingWithCopyWarning).
meta = meta[~meta.SampleID.duplicated(keep='last')].copy()
# Literal hyphen -> underscore. The previous pattern "\-" was an invalid
# string escape and only worked while str.replace defaulted to regex mode;
# with regex=False as the default it matched nothing.
meta['SampleID'] = meta.SampleID.str.replace("-", "_", regex=False)
meta.set_index('SampleID', inplace=True)

In [28]:
# Phenotype columns of interest. 'Response To Therapy' was previously
# misspelled ('Reponse ...') and therefore did not match the column that is
# read further down in this cell.
meta_cols = ['Gender', 'Diagnosis', 'Age At Diagnosis (Years)',
             'Overall Survival Status', 'Pack Years', 'Smoking Status',
             'Time To Overall Survival (Days)']
meta_cols = meta_cols + ['New Tumor Event', 'Radiation Therapy', 'Response To Therapy', 'Drug Therapy Type']
# Columns derived in this cell (plus 'Sample Type'); appended to meta_cols
# at the end.
derived_cols = ['Stage', 'Response', 'Sample Type']

# Tumour stage -> ordinal scale; stage labels not listed here become NaN.
stage_map = {'stage i': 1, 'stage ia': 1.5, 'stage ib': 1.75,
             'stage ii': 2, 'stage iia': 2.5, 'stage iib': 2.75,
             'stage iii': 3,'stage iiia': 3.5, 'stage iiib': 3.75,
             'stage iv' : 4}
meta['Stage'] = meta['Tumor Stage'].map(stage_map)

# Smoking status is only collapsed for the lung cohort, so the derived
# 'Smoking' column is only registered when it is actually created.
if ctype=='Lung':
    smoke_map = {'Current Reformed Smoker < or = 15 yrs': 'reformed',
                 'Current Reformed Smoker for > 15 yrs': 'reformed',
                 'Current Reformed Smoker, Duration Not Specified': 'reformed',
                 'Current Smoker': 'current',
                 'Lifelong Non-Smoker': 'non-smoker'}
    meta['Smoking'] = meta['Smoking Status'].map(smoke_map)
    derived_cols.append('Smoking')

# Binary therapy response: 1 = complete/partial remission, 0 = stable or
# progressive disease.
response_map = {'Progressive Disease': 0,
                'Complete Remission/Response': 1,
                'Stable Disease': 0,
                'Partial Remission/Response': 1}
meta['Response'] = meta['Response To Therapy'].map(response_map)

# Rows without a survival status are dropped so the column can be cast to int.
meta.dropna(subset=['Overall Survival Status'], inplace=True)
meta['Overall Survival Status'] =  meta['Overall Survival Status'].astype(int)

gender_map = {'male': 0, 'female': 1}
meta['Gender'] = meta['Gender'].map(gender_map)

# De-duplicate while keeping a deterministic order (list(set(...)) produced
# a different column order from run to run).
meta_cols = list(dict.fromkeys(meta_cols + derived_cols))

# Re-index with hyphen-separated sample IDs (underscores are replaced).
meta.reset_index(inplace=True)
meta['SampleID'] = meta['SampleID'].str.replace("_", "-", regex=False)
meta.set_index('SampleID', inplace=True)

In [29]:
# Annotation columns that together identify one RNA expression row.
rna_id_cols = ['Gene', 'Chr', 'Start', 'Stop', 'Strand']

ds['RNAex']['Gene'] = ds['RNAex']['Gene'].str.upper()
ds['RNAex'].sort_values(by='Gene', inplace=True)
# Cast the positional fields to str so they can be joined into an identifier.
ds['RNAex']['Start'] = ds['RNAex']['Start'].astype(str)
ds['RNAex']['Stop'] = ds['RNAex']['Stop'].astype(str)
ds['RNAex']['Strand'] = ds['RNAex']['Strand'].astype(str)
# rnaID = "<Gene>_<Chr>_<Start>_<Stop>_<Strand>"
ds['RNAex']['rnaID'] = ds['RNAex'][rna_id_cols].apply(lambda x: "_".join(x), axis=1)

# Lookup table rnaID -> annotation. The previous version called
# drop_duplicates() without keeping its result and reset the index of a
# temporary Series, so neither step had any effect; the de-duplication is
# now actually applied and the frame is an independent object rather than a
# slice of ds['RNAex'] that is later modified in place.
rnamap = (ds['RNAex'][['rnaID'] + rna_id_cols]
          .drop_duplicates(subset=rna_id_cols)
          .set_index('rnaID'))

# Expression matrix: drop the annotation, index by rnaID and transpose so
# rnaID values become the columns.
# NOTE: this cell is not re-runnable without reloading ds['RNAex'], because
# the annotation columns are gone after the first run.
ds['RNAex'] = (ds['RNAex']
               .drop(rna_id_cols, axis=1)
               .set_index('rnaID')
               .transpose())
gc.collect()

## RNA expression diff analysis

In [31]:
# Attach the ordinal 'Stage' column to the expression matrix by index; the
# inner join keeps only index values present in both frames.
X = ds['RNAex'].join(meta.loc[:, ['Stage']], how='inner')

In [32]:
# Discard rows with an unknown stage, then move 'Stage' out of the feature
# matrix X into its own single-column target frame Y.
X.dropna(subset=['Stage'], inplace=True)
Y = X.pop('Stage').to_frame()

In [36]:
# Score every column of X against the target with several association
# measures from omic_helpers, collect the per-measure results in a dict,
# concatenate them column-wise and sort by distance correlation.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: spearman_scores() got an unexpected keyword argument 'return_df'",
# so this call does not match the omic_helpers.spearman_scores that was
# imported. The helper signatures are not visible here — confirm which keyword
# arguments each omic_helpers function accepts (and whether it returns a
# DataFrame, as pd.concat below requires) before relying on this cell.
target = 'Stage'
lincorr_dict = dict()
lincorr_dict['spearman']= omic_helpers.spearman_scores(X,Y[target], return_df=True, correction='bonferroni')
lincorr_dict['pearson']= omic_helpers.pearson_scores(X,Y[target], return_df=True, correction='bonferroni')
lincorr_dict['monotonic_aligned'] = omic_helpers.monotonic_alignment(X,Y[[target]], return_df=True)
lincorr_dict['distance_correlation'] = omic_helpers.distcorr(X, Y[[target]], per_column=True,
                                                             return_df=True, columns=['distance_correlation'])
lincorr_dict['mine'] = omic_helpers.mic_scores(X, Y[target], return_df=True)
lincorr = pd.concat(list(lincorr_dict.values()), axis=1)
lincorr.sort_values(by='distance_correlation', ascending=False, inplace=True)

TypeError: spearman_scores() got an unexpected keyword argument 'return_df'

In [None]:
# Tabulate the Spearman results per feature (presumably one (score, pval)
# pair per column of X — verify against omic_helpers), discard incomplete
# rows and attach the gene annotation from rnamap.
scores = (
    pd.DataFrame(data=lincorr_dict['spearman'],
                 index=X.columns,
                 columns=['score', 'pval'])
    .dropna()
    .join(rnamap, how='inner')
)

In [None]:
# Show the 30 features with the highest score.
scores.sort_values(by='score', ascending=False).head(30)