In [1]:
# importing python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import read_h5ad
from anndata import read_csv
import anndata
from plotnine import * 
from plotnine.data import mtcars
import venn
from scipy import stats
import seaborn as sns
import math
from scipy.stats import pearsonr
from sklearn.metrics import roc_curve, auc, roc_auc_score
import abc
import batchglm.api as glm
import logging
import patsy
import glob
from random import sample
import scipy.sparse
from typing import Union, Dict, Tuple, List, Set
import math
from anndata import AnnData
from typing import Optional, Union, Mapping  # Special
from typing import Sequence, Collection, Iterable  # ABCs
_VarNames = Union[str, Sequence[str]]

%matplotlib inline
sc.logging.print_header()

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.21.0 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.23.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.2 pynndescent==0.5.2


# Anndata to DF function

In [2]:
from pandas.api.types import is_categorical_dtype

In [3]:
def sanitize_anndata(adata):
    adata._sanitize()

In [4]:
def sc_prepare_dataframe(
    adata: AnnData,
    var_names: Union[_VarNames, Mapping[str, _VarNames]],
    groupby: Optional[str] = None,
    use_raw: Optional[bool] = None,
    log: bool = False,
    num_categories: int = 7,
    layer=None,
    gene_symbols: Optional[str] = None,
):
    """
    Given the anndata object, prepares a data frame in which the row index are the categories
    defined by group by and the columns correspond to var_names.
    Parameters
    ----------
    adata
        Annotated data matrix.
    var_names
        `var_names` should be a valid subset of  `adata.var_names`.
    groupby
        The key of the observation grouping to consider. It is expected that
        groupby is a categorical. If groupby is not a categorical observation,
        it would be subdivided into `num_categories`.
    use_raw
        Use `raw` attribute of `adata` if present.
    log
        Use the log of the values
    num_categories
        Only used if groupby observation is not categorical. This value
        determines the number of groups into which the groupby observation
        should be subdivided.
    gene_symbols
        Key for field in .var that stores gene symbols.
    Returns
    -------
    Tuple of `pandas.DataFrame` and list of categories.
    """
    from scipy.sparse import issparse

    sanitize_anndata(adata)
    if use_raw is None and adata.raw is not None:
        use_raw = True
    if isinstance(var_names, str):
        var_names = [var_names]

    if groupby is not None:
        if groupby not in adata.obs_keys():
            raise ValueError(
                'groupby has to be a valid observation. '
                f'Given {groupby}, valid observations: {adata.obs_keys()}'
            )

    if gene_symbols is not None and gene_symbols in adata.var.columns:
        # translate gene_symbols to var_names
        # slow method but gives a meaningful error if no gene symbol is found:
        translated_var_names = []
        for symbol in var_names:
            if symbol not in adata.var[gene_symbols].values:
                logg.error(
                    f"Gene symbol {symbol!r} not found in given "
                    f"gene_symbols column: {gene_symbols!r}"
                )
                return
            translated_var_names.append(
                adata.var[adata.var[gene_symbols] == symbol].index[0]
            )
        symbols = var_names
        var_names = translated_var_names
    if layer is not None:
        if layer not in adata.layers.keys():
            raise KeyError(
                f'Selected layer: {layer} is not in the layers list. '
                f'The list of valid layers is: {adata.layers.keys()}'
            )
        matrix = adata[:, var_names].layers[layer]
    elif use_raw:
        matrix = adata.raw[:, var_names].X
    else:
        matrix = adata[:, var_names].X

    if issparse(matrix):
        matrix = matrix.toarray()
    if log:
        matrix = np.log1p(matrix)

    obs_tidy = pd.DataFrame(matrix, columns=var_names)
    if groupby is None:
        groupby = ''
        categorical = pd.Series(np.repeat('', len(obs_tidy))).astype('category')
    else:
        if not is_categorical_dtype(adata.obs[groupby]):
            # if the groupby column is not categorical, turn it into one
            # by subdividing into  `num_categories` categories
            categorical = pd.cut(adata.obs[groupby], num_categories)
        else:
            categorical = adata.obs[groupby]

    obs_tidy.set_index(categorical, groupby, inplace=True)
    if gene_symbols is not None:
        # translate the column names to the symbol names
        obs_tidy.rename(
            columns=dict([(var_names[x], symbols[x]) for x in range(len(var_names))]),
            inplace=True,
        )
    categories = obs_tidy.index.categories

    return categories, obs_tidy


# making files for panel H heatmap

In [5]:
adata_processed = sc.read_h5ad('SS2_processed.h5ad')

In [6]:
adata_processed_tumor = adata_processed[adata_processed.obs['sort'] == 'Tumor'].copy()
adata_processed_met = adata_processed[adata_processed.obs['sort'] == 'Metastatic'].copy()

In [7]:
final_met_genes = pd.read_csv('SS2_met_vs_primary_met_genes_list.csv')
final_tumor_genes = pd.read_csv('SS2_met_vs_primary_tumor_genes_list.csv')

In [8]:
low_met_tumors = ['HCI005', 'H3204','H4272']
intermediate_met_tumors = ['HCI009', 'HCI011', 'HCI001']
high_met_tumors = ['H5097', 'J2036', 'J53353', 'HCI010']

In [9]:
def get_met_and_tumor_genes(tumors_list):
    met_genes_list = []
    tumor_genes_list = []
    for i in tumors_list:
        met_genes = [x for x in final_met_genes[i].tolist() if str(x) != 'nan']
        tumor_genes =[x for x in final_tumor_genes[i].tolist() if str(x) != 'nan']
        
        met_genes_list = met_genes_list + met_genes
        tumor_genes_list = tumor_genes_list + tumor_genes
        
    return met_genes_list, tumor_genes_list

In [10]:
low_group_met_genes, low_group_tumor_genes = get_met_and_tumor_genes(low_met_tumors)
intermediate_group_met_genes, intermediate_group_tumor_genes = get_met_and_tumor_genes(intermediate_met_tumors)
high_group_met_genes, high_group_tumor_genes = get_met_and_tumor_genes(high_met_tumors)

In [11]:
low_group_met_genes_overlap = list(set([x for x in low_group_met_genes if low_group_met_genes.count(x) >= 2]))
len(low_group_met_genes_overlap)

74

In [12]:
intermediate_group_met_genes_overlap = list(set([x for x in intermediate_group_met_genes if intermediate_group_met_genes.count(x) >= 2]))
len(intermediate_group_met_genes_overlap)

75

In [13]:
high_group_met_genes_overlap = list(set([x for x in high_group_met_genes if high_group_met_genes.count(x) >= 2]))
len(high_group_met_genes_overlap)

91

In [14]:
low_group_tumor_genes_overlap = list(set([x for x in low_group_tumor_genes if low_group_tumor_genes.count(x) >= 2]))
len(low_group_tumor_genes_overlap)

42

In [15]:
intermediate_group_tumor_genes_overlap = list(set([x for x in intermediate_group_tumor_genes if intermediate_group_tumor_genes.count(x) >= 2]))
len(intermediate_group_tumor_genes_overlap)

48

In [16]:
high_group_tumor_genes_overlap = list(set([x for x in high_group_tumor_genes if high_group_tumor_genes.count(x) >= 2]))
high_group_tumor_genes_overlap.sort()
len(high_group_tumor_genes_overlap)

107

In [17]:

all_genes = low_group_tumor_genes_overlap + intermediate_group_tumor_genes_overlap + high_group_tumor_genes_overlap + low_group_met_genes_overlap + intermediate_group_met_genes_overlap + high_group_met_genes_overlap



In [18]:
len(all_genes)

437

In [19]:
tumor_genes_temp = low_group_tumor_genes_overlap + intermediate_group_tumor_genes_overlap + high_group_tumor_genes_overlap

In [20]:
print(len(tumor_genes_temp), len(set(tumor_genes_temp)))

197 184


In [21]:
tumor_genes = []
for i in tumor_genes_temp:
    if i not in tumor_genes:
        tumor_genes.append(i)

In [22]:
print(len(tumor_genes), len(set(tumor_genes)))

184 184


In [23]:
met_genes = []
for i in low_group_met_genes_overlap + intermediate_group_met_genes_overlap + high_group_met_genes_overlap:
    if i not in tumor_genes and i not in met_genes:
        met_genes.append(i)

In [24]:
print(len(met_genes), len(set(met_genes)))

203 203


In [25]:
tumor_mean_expression_df_tumor_genes = sc_prepare_dataframe(adata_processed_tumor, tumor_genes, 
                                                groupby='Tumor_ID',use_raw=False)

tumor_mean_expression_df_met_genes = sc_prepare_dataframe(adata_processed_tumor, met_genes, 
                                                groupby='Tumor_ID',use_raw=False)

In [26]:
tumor_mean_expression_df_tumor_genes = tumor_mean_expression_df_tumor_genes[1].groupby(level=0).mean()
tumor_mean_expression_df_tumor_genes.index = [i+'_Tumor' for i in tumor_mean_expression_df_tumor_genes.index]
tumor_mean_expression_df_tumor_genes = tumor_mean_expression_df_tumor_genes.T
tumor_mean_expression_df_tumor_genes

Unnamed: 0,H3204_Tumor,H4272_Tumor,H5097_Tumor,H5471_Tumor,HCI001_Tumor,HCI005_Tumor,HCI009_Tumor,HCI010_Tumor,HCI011_Tumor,J2036_Tumor,J53353_Tumor,J55454_Tumor
MIA,1.054257,-0.094895,-0.515723,-0.490777,-0.009934,-0.550780,-0.468149,0.451427,-0.519500,-0.435874,2.390895,0.912826
MTCO3P12,0.205020,-0.578325,-0.217338,0.519994,-0.501458,0.899302,-0.572880,-0.399386,0.165638,-0.194475,-0.620388,-0.402998
FOS,0.368179,0.331056,-0.495493,-0.150410,-0.446348,0.547972,0.691447,0.405549,-0.306294,-0.521619,1.018189,-0.815112
FOSB,-0.077093,0.502476,-0.399718,-0.143761,-0.249705,0.263283,1.636246,0.293211,-0.255109,-0.306235,1.026464,-0.700434
PSAP,1.209783,-0.048173,-0.170041,0.027494,-0.188623,-0.214933,-0.784852,0.679063,-0.640542,-0.754663,0.681798,-0.738920
...,...,...,...,...,...,...,...,...,...,...,...,...
TTC3,0.821387,0.005209,0.329872,-0.081667,-0.053087,-0.309197,0.420792,0.411159,0.010224,-0.392418,0.166740,-0.599221
UBE2S,-0.033137,-0.095995,0.402839,-0.194105,0.492889,-0.154924,0.597202,0.150908,-0.393044,0.175884,0.322959,-0.849954
UBE2T,-0.137108,-0.256136,0.128174,0.278643,-0.198353,0.173682,-0.573169,-0.229880,0.072503,0.831843,-0.142415,-0.259477
UPF3A,0.014838,-0.003780,0.688661,-0.163167,-0.044149,-0.249388,-0.348337,0.148652,0.153639,0.205314,0.002885,-0.172343


In [27]:
tumor_mean_expression_df_met_genes = tumor_mean_expression_df_met_genes[1].groupby(level=0).mean()
tumor_mean_expression_df_met_genes.index = [i+'_Tumor' for i in tumor_mean_expression_df_met_genes.index]
tumor_mean_expression_df_met_genes = tumor_mean_expression_df_met_genes.T
tumor_mean_expression_df_met_genes

Unnamed: 0,H3204_Tumor,H4272_Tumor,H5097_Tumor,H5471_Tumor,HCI001_Tumor,HCI005_Tumor,HCI009_Tumor,HCI010_Tumor,HCI011_Tumor,J2036_Tumor,J53353_Tumor,J55454_Tumor
KLK6,-0.279517,1.103289,-0.362334,0.115497,1.724753,-0.351863,-0.368298,-0.409474,-0.360339,-0.357715,-0.299833,-0.316462
ATP5F1E,-0.178692,-0.485983,0.150675,-0.679658,-0.335030,0.477935,-0.047390,-0.527622,0.864166,-0.072110,-0.462738,0.617462
KLK10,-0.076216,0.447429,-0.301283,-0.017399,1.296210,-0.309651,-0.306902,-0.356821,-0.294747,-0.302560,0.721560,-0.291956
PSMD2,0.750423,-0.352589,0.063236,-0.317893,-0.314088,-0.153564,-0.431455,0.949286,-0.362546,-0.588873,-0.025502,-0.588187
MYL6,-0.121454,-0.387147,0.151227,-0.247266,0.009895,-0.183885,-0.413598,-0.037153,-0.066586,-0.005495,-0.221278,-0.289705
...,...,...,...,...,...,...,...,...,...,...,...,...
NCCRP1,-0.241029,-0.267094,-0.183200,0.057240,0.731607,-0.329719,-0.299470,0.099471,-0.217150,0.647020,-0.321534,-0.260599
MTND2P28,0.688016,-0.456423,-0.154885,0.071799,-0.468068,0.756018,-0.973576,-0.369759,0.351153,-0.450542,-0.637709,-0.541983
MT-RNR1,-0.316745,0.031821,-0.223537,-0.147627,-0.056212,0.479953,-0.125029,-0.121570,0.444519,-0.631877,0.397138,-0.408682
ATF4,0.379092,-0.056931,-0.328615,0.195020,0.321105,-0.508143,-0.208926,0.033179,-0.044847,-0.052275,0.026980,-0.499931


In [28]:
met_mean_expression_df_tumor_genes = sc_prepare_dataframe(adata_processed_met, tumor_genes, 
                                              groupby='Tumor_ID',use_raw=False)

met_mean_expression_df_met_genes = sc_prepare_dataframe(adata_processed_met, met_genes, 
                                              groupby='Tumor_ID',use_raw=False)

In [29]:
met_mean_expression_df_tumor_genes = met_mean_expression_df_tumor_genes[1].groupby(level=0).mean()
met_mean_expression_df_tumor_genes.index = [i+'_Metastatic' for i in met_mean_expression_df_tumor_genes.index]
met_mean_expression_df_tumor_genes = met_mean_expression_df_tumor_genes.T
met_mean_expression_df_tumor_genes

Unnamed: 0,H3204_Metastatic,H4272_Metastatic,H5097_Metastatic,H5471_Metastatic,HCI001_Metastatic,HCI005_Metastatic,HCI009_Metastatic,HCI010_Metastatic,HCI011_Metastatic,J2036_Metastatic,J53353_Metastatic,J55454_Metastatic
MIA,0.182745,-0.422454,-0.554580,-0.468242,0.214220,-0.553441,-0.524600,0.056453,-0.504780,-0.510208,2.452481,0.653001
MTCO3P12,-0.171516,-0.523930,0.010716,0.289744,-0.574881,0.473226,-0.544918,-0.277808,0.521877,0.234049,-0.590249,-0.234549
FOS,-0.378320,0.323859,-0.608683,-0.744802,-0.528071,0.334324,-0.411396,-0.292202,-0.363262,0.657805,0.525040,-0.538204
FOSB,-0.525032,0.250987,-0.370003,-0.654577,-0.456972,0.233608,-0.426195,-0.235872,-0.266734,0.433148,1.093885,-0.689135
PSAP,0.742193,-0.497170,-0.422208,-0.533154,-0.131360,-0.100808,0.656386,0.251784,-0.227071,-0.472349,0.306482,0.153969
...,...,...,...,...,...,...,...,...,...,...,...,...
TTC3,0.249526,-0.212948,-0.111998,-0.850712,-0.026673,-0.442065,0.247206,-0.038411,0.112130,-0.152712,-0.448768,-0.874558
UBE2S,0.177662,-0.110389,0.227396,-0.106810,0.022863,-0.139605,0.176574,0.218422,-0.208592,-0.118553,0.232101,-1.194389
UBE2T,-0.183223,-0.224522,-0.214481,2.062209,-0.058824,-0.011937,-0.599860,-0.473392,0.151079,0.735967,-0.420037,0.829614
UPF3A,0.068329,-0.258067,0.244812,-0.116267,-0.130110,-0.110576,-0.364569,0.130259,-0.025198,-0.117353,0.039206,-0.241597


In [30]:
met_mean_expression_df_met_genes = met_mean_expression_df_met_genes[1].groupby(level=0).mean()
met_mean_expression_df_met_genes.index = [i+'_Metastatic' for i in met_mean_expression_df_met_genes.index]
met_mean_expression_df_met_genes = met_mean_expression_df_met_genes.T
met_mean_expression_df_met_genes

Unnamed: 0,H3204_Metastatic,H4272_Metastatic,H5097_Metastatic,H5471_Metastatic,HCI001_Metastatic,HCI005_Metastatic,HCI009_Metastatic,HCI010_Metastatic,HCI011_Metastatic,J2036_Metastatic,J53353_Metastatic,J55454_Metastatic
KLK6,0.012396,2.399886,-0.340623,0.366464,1.027850,-0.344979,-0.371863,-0.440721,-0.358823,-0.352224,-0.052365,0.692365
ATP5F1E,0.002737,-0.074882,0.227563,-0.869340,-0.318785,1.071992,0.348010,-0.117859,1.189555,-0.691123,-0.283464,-0.290559
KLK10,0.154378,0.996193,-0.303007,-0.286282,0.559805,-0.304326,-0.315893,-0.379519,-0.316438,-0.300041,1.244231,0.741055
PSMD2,0.270299,-0.116260,0.396982,-0.524665,-0.502655,0.306115,0.121599,0.679905,-0.093696,0.021332,-0.256621,-0.377118
MYL6,0.353541,0.211051,0.563995,1.090005,-0.592277,0.383617,0.690227,0.127016,1.022755,-0.009934,-0.177169,-0.457528
...,...,...,...,...,...,...,...,...,...,...,...,...
NCCRP1,-0.233277,-0.117706,0.343004,-0.278021,-0.042526,-0.335682,-0.273425,-0.088682,0.345975,2.300912,-0.316435,-0.016677
MTND2P28,0.314599,-0.253675,0.090479,-0.602987,-0.690683,0.509813,-0.951570,-0.109597,0.745445,0.562500,-0.498123,-0.862967
MT-RNR1,-0.716181,-0.023042,0.025928,-0.922881,-0.571493,0.129443,-0.016051,0.161306,0.626675,0.409105,0.356534,-0.659136
ATF4,0.466943,0.341045,-0.162548,0.536500,-0.205812,-0.557310,-0.135804,0.435082,0.000758,0.694586,0.126455,0.052034


In [31]:
global_mean_expression_df_tumor_genes = sc_prepare_dataframe(adata_processed, tumor_genes
                                                 , groupby='sort',use_raw=False)

global_mean_expression_df_met_genes = sc_prepare_dataframe(adata_processed, met_genes
                                                 , groupby='sort',use_raw=False)


In [32]:
global_mean_expression_df_tumor_genes = global_mean_expression_df_tumor_genes[1].groupby(level=0).mean()
global_mean_expression_df_tumor_genes.index = ['global_'+i for i in global_mean_expression_df_tumor_genes.index]
global_mean_expression_df_tumor_genes = global_mean_expression_df_tumor_genes.T
global_mean_expression_df_tumor_genes

Unnamed: 0,global_Metastatic,global_Tumor
MIA,-0.138887,0.069195
MTCO3P12,-0.107247,0.053431
FOS,-0.159535,0.079481
FOSB,-0.118466,0.059021
PSAP,-0.038906,0.019383
...,...,...
TTC3,-0.095037,0.047348
UBE2S,0.038540,-0.019201
UBE2T,-0.119048,0.059311
UPF3A,-0.024947,0.012429


In [33]:
global_mean_expression_df_met_genes = global_mean_expression_df_met_genes[1].groupby(level=0).mean()
global_mean_expression_df_met_genes.index = ['global_'+i for i in global_mean_expression_df_met_genes.index]
global_mean_expression_df_met_genes = global_mean_expression_df_met_genes.T
global_mean_expression_df_met_genes

Unnamed: 0,global_Metastatic,global_Tumor
KLK6,0.137192,-0.068350
ATP5F1E,0.180831,-0.090091
KLK10,0.042858,-0.021352
PSMD2,0.125661,-0.062605
MYL6,0.279771,-0.139384
...,...,...
NCCRP1,0.102138,-0.050886
MTND2P28,0.003742,-0.001864
MT-RNR1,-0.002428,0.001210
ATF4,0.066471,-0.033116


In [34]:
combined_tumor_tumor_genes = pd.concat([tumor_mean_expression_df_tumor_genes, global_mean_expression_df_tumor_genes[['global_Tumor']]],axis=1)
combined_tumor_tumor_genes

Unnamed: 0,H3204_Tumor,H4272_Tumor,H5097_Tumor,H5471_Tumor,HCI001_Tumor,HCI005_Tumor,HCI009_Tumor,HCI010_Tumor,HCI011_Tumor,J2036_Tumor,J53353_Tumor,J55454_Tumor,global_Tumor
MIA,1.054257,-0.094895,-0.515723,-0.490777,-0.009934,-0.550780,-0.468149,0.451427,-0.519500,-0.435874,2.390895,0.912826,0.069195
MTCO3P12,0.205020,-0.578325,-0.217338,0.519994,-0.501458,0.899302,-0.572880,-0.399386,0.165638,-0.194475,-0.620388,-0.402998,0.053431
FOS,0.368179,0.331056,-0.495493,-0.150410,-0.446348,0.547972,0.691447,0.405549,-0.306294,-0.521619,1.018189,-0.815112,0.079481
FOSB,-0.077093,0.502476,-0.399718,-0.143761,-0.249705,0.263283,1.636246,0.293211,-0.255109,-0.306235,1.026464,-0.700434,0.059021
PSAP,1.209783,-0.048173,-0.170041,0.027494,-0.188623,-0.214933,-0.784852,0.679063,-0.640542,-0.754663,0.681798,-0.738920,0.019383
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTC3,0.821387,0.005209,0.329872,-0.081667,-0.053087,-0.309197,0.420792,0.411159,0.010224,-0.392418,0.166740,-0.599221,0.047348
UBE2S,-0.033137,-0.095995,0.402839,-0.194105,0.492889,-0.154924,0.597202,0.150908,-0.393044,0.175884,0.322959,-0.849954,-0.019201
UBE2T,-0.137108,-0.256136,0.128174,0.278643,-0.198353,0.173682,-0.573169,-0.229880,0.072503,0.831843,-0.142415,-0.259477,0.059311
UPF3A,0.014838,-0.003780,0.688661,-0.163167,-0.044149,-0.249388,-0.348337,0.148652,0.153639,0.205314,0.002885,-0.172343,0.012429


In [35]:
combined_tumor_met_genes = pd.concat([tumor_mean_expression_df_met_genes, global_mean_expression_df_met_genes[['global_Tumor']]],axis=1)
combined_tumor_met_genes

Unnamed: 0,H3204_Tumor,H4272_Tumor,H5097_Tumor,H5471_Tumor,HCI001_Tumor,HCI005_Tumor,HCI009_Tumor,HCI010_Tumor,HCI011_Tumor,J2036_Tumor,J53353_Tumor,J55454_Tumor,global_Tumor
KLK6,-0.279517,1.103289,-0.362334,0.115497,1.724753,-0.351863,-0.368298,-0.409474,-0.360339,-0.357715,-0.299833,-0.316462,-0.068350
ATP5F1E,-0.178692,-0.485983,0.150675,-0.679658,-0.335030,0.477935,-0.047390,-0.527622,0.864166,-0.072110,-0.462738,0.617462,-0.090091
KLK10,-0.076216,0.447429,-0.301283,-0.017399,1.296210,-0.309651,-0.306902,-0.356821,-0.294747,-0.302560,0.721560,-0.291956,-0.021352
PSMD2,0.750423,-0.352589,0.063236,-0.317893,-0.314088,-0.153564,-0.431455,0.949286,-0.362546,-0.588873,-0.025502,-0.588187,-0.062605
MYL6,-0.121454,-0.387147,0.151227,-0.247266,0.009895,-0.183885,-0.413598,-0.037153,-0.066586,-0.005495,-0.221278,-0.289705,-0.139384
...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCCRP1,-0.241029,-0.267094,-0.183200,0.057240,0.731607,-0.329719,-0.299470,0.099471,-0.217150,0.647020,-0.321534,-0.260599,-0.050886
MTND2P28,0.688016,-0.456423,-0.154885,0.071799,-0.468068,0.756018,-0.973576,-0.369759,0.351153,-0.450542,-0.637709,-0.541983,-0.001864
MT-RNR1,-0.316745,0.031821,-0.223537,-0.147627,-0.056212,0.479953,-0.125029,-0.121570,0.444519,-0.631877,0.397138,-0.408682,0.001210
ATF4,0.379092,-0.056931,-0.328615,0.195020,0.321105,-0.508143,-0.208926,0.033179,-0.044847,-0.052275,0.026980,-0.499931,-0.033116


In [36]:
combined_met_tumor_genes = pd.concat([met_mean_expression_df_tumor_genes, global_mean_expression_df_tumor_genes[['global_Metastatic']]],axis=1)
combined_met_tumor_genes

Unnamed: 0,H3204_Metastatic,H4272_Metastatic,H5097_Metastatic,H5471_Metastatic,HCI001_Metastatic,HCI005_Metastatic,HCI009_Metastatic,HCI010_Metastatic,HCI011_Metastatic,J2036_Metastatic,J53353_Metastatic,J55454_Metastatic,global_Metastatic
MIA,0.182745,-0.422454,-0.554580,-0.468242,0.214220,-0.553441,-0.524600,0.056453,-0.504780,-0.510208,2.452481,0.653001,-0.138887
MTCO3P12,-0.171516,-0.523930,0.010716,0.289744,-0.574881,0.473226,-0.544918,-0.277808,0.521877,0.234049,-0.590249,-0.234549,-0.107247
FOS,-0.378320,0.323859,-0.608683,-0.744802,-0.528071,0.334324,-0.411396,-0.292202,-0.363262,0.657805,0.525040,-0.538204,-0.159535
FOSB,-0.525032,0.250987,-0.370003,-0.654577,-0.456972,0.233608,-0.426195,-0.235872,-0.266734,0.433148,1.093885,-0.689135,-0.118466
PSAP,0.742193,-0.497170,-0.422208,-0.533154,-0.131360,-0.100808,0.656386,0.251784,-0.227071,-0.472349,0.306482,0.153969,-0.038906
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTC3,0.249526,-0.212948,-0.111998,-0.850712,-0.026673,-0.442065,0.247206,-0.038411,0.112130,-0.152712,-0.448768,-0.874558,-0.095037
UBE2S,0.177662,-0.110389,0.227396,-0.106810,0.022863,-0.139605,0.176574,0.218422,-0.208592,-0.118553,0.232101,-1.194389,0.038540
UBE2T,-0.183223,-0.224522,-0.214481,2.062209,-0.058824,-0.011937,-0.599860,-0.473392,0.151079,0.735967,-0.420037,0.829614,-0.119048
UPF3A,0.068329,-0.258067,0.244812,-0.116267,-0.130110,-0.110576,-0.364569,0.130259,-0.025198,-0.117353,0.039206,-0.241597,-0.024947


In [37]:
combined_met_met_genes = pd.concat([met_mean_expression_df_met_genes, global_mean_expression_df_met_genes[['global_Metastatic']]],axis=1)
combined_met_met_genes

Unnamed: 0,H3204_Metastatic,H4272_Metastatic,H5097_Metastatic,H5471_Metastatic,HCI001_Metastatic,HCI005_Metastatic,HCI009_Metastatic,HCI010_Metastatic,HCI011_Metastatic,J2036_Metastatic,J53353_Metastatic,J55454_Metastatic,global_Metastatic
KLK6,0.012396,2.399886,-0.340623,0.366464,1.027850,-0.344979,-0.371863,-0.440721,-0.358823,-0.352224,-0.052365,0.692365,0.137192
ATP5F1E,0.002737,-0.074882,0.227563,-0.869340,-0.318785,1.071992,0.348010,-0.117859,1.189555,-0.691123,-0.283464,-0.290559,0.180831
KLK10,0.154378,0.996193,-0.303007,-0.286282,0.559805,-0.304326,-0.315893,-0.379519,-0.316438,-0.300041,1.244231,0.741055,0.042858
PSMD2,0.270299,-0.116260,0.396982,-0.524665,-0.502655,0.306115,0.121599,0.679905,-0.093696,0.021332,-0.256621,-0.377118,0.125661
MYL6,0.353541,0.211051,0.563995,1.090005,-0.592277,0.383617,0.690227,0.127016,1.022755,-0.009934,-0.177169,-0.457528,0.279771
...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCCRP1,-0.233277,-0.117706,0.343004,-0.278021,-0.042526,-0.335682,-0.273425,-0.088682,0.345975,2.300912,-0.316435,-0.016677,0.102138
MTND2P28,0.314599,-0.253675,0.090479,-0.602987,-0.690683,0.509813,-0.951570,-0.109597,0.745445,0.562500,-0.498123,-0.862967,0.003742
MT-RNR1,-0.716181,-0.023042,0.025928,-0.922881,-0.571493,0.129443,-0.016051,0.161306,0.626675,0.409105,0.356534,-0.659136,-0.002428
ATF4,0.466943,0.341045,-0.162548,0.536500,-0.205812,-0.557310,-0.135804,0.435082,0.000758,0.694586,0.126455,0.052034,0.066471


In [38]:
new_order_tumor = ['HCI005_Tumor','H3204_Tumor','H4272_Tumor','HCI009_Tumor',
            'HCI011_Tumor','HCI001_Tumor','H5097_Tumor','J2036_Tumor','J53353_Tumor','HCI010_Tumor','global_Tumor']

new_order_met = ['HCI005_Metastatic','H3204_Metastatic','H4272_Metastatic','HCI009_Metastatic',
            'HCI011_Metastatic','HCI001_Metastatic','H5097_Metastatic','J2036_Metastatic','J53353_Metastatic','HCI010_Metastatic','global_Metastatic']

In [39]:
combined_tumor_tumor_genes = combined_tumor_tumor_genes[new_order_tumor]
combined_tumor_met_genes = combined_tumor_met_genes[new_order_tumor]

combined_met_tumor_genes = combined_met_tumor_genes[new_order_met]
combined_met_met_genes = combined_met_met_genes[new_order_met]

In [40]:
combined_tumor_tumor_genes.to_csv('tumor_mean_tumor_genes_gc.csv')
combined_tumor_met_genes.to_csv('tumor_mean_met_genes_gc.csv')

combined_met_tumor_genes.to_csv('met_mean_tumor_genes_gc.csv')
combined_met_met_genes.to_csv('met_mean_met_genes_gc.csv')

In [41]:
metadata = pd.DataFrame(index=new_order_tumor)
metadata['Tumor_ID'] = [i.split('_')[0] for i in metadata.index]
metadata['sort'] = [i.split('_')[1] for i in metadata.index]
for i in metadata.index:
    tumor_id = metadata.loc[i, 'Tumor_ID']
    if tumor_id in low_met_tumors:
        metadata.loc[i, 'metastatic_potential_group'] = 'low'
    elif tumor_id in intermediate_met_tumors+['H5471']:
        metadata.loc[i, 'metastatic_potential_group'] = 'moderate'
    elif tumor_id in high_met_tumors:
        metadata.loc[i, 'metastatic_potential_group'] = 'high'
    else:
        metadata.loc[i, 'metastatic_potential_group'] = 'all'
metadata

Unnamed: 0,Tumor_ID,sort,metastatic_potential_group
HCI005_Tumor,HCI005,Tumor,low
H3204_Tumor,H3204,Tumor,low
H4272_Tumor,H4272,Tumor,low
HCI009_Tumor,HCI009,Tumor,moderate
HCI011_Tumor,HCI011,Tumor,moderate
HCI001_Tumor,HCI001,Tumor,moderate
H5097_Tumor,H5097,Tumor,high
J2036_Tumor,J2036,Tumor,high
J53353_Tumor,J53353,Tumor,high
HCI010_Tumor,HCI010,Tumor,high


In [42]:
metadata.to_csv('tumor_metadata.csv')

In [43]:
metadata = pd.DataFrame(index=new_order_met)
metadata['Tumor_ID'] = [i.split('_')[0] for i in metadata.index]
metadata['sort'] = [i.split('_')[1] for i in metadata.index]
for i in metadata.index:
    tumor_id = metadata.loc[i, 'Tumor_ID']
    if tumor_id in low_met_tumors:
        metadata.loc[i, 'metastatic_potential_group'] = 'low'
    elif tumor_id in intermediate_met_tumors+['H5471']:
        metadata.loc[i, 'metastatic_potential_group'] = 'moderate'
    elif tumor_id in high_met_tumors:
        metadata.loc[i, 'metastatic_potential_group'] = 'high'
    else:
        metadata.loc[i, 'metastatic_potential_group'] = 'all'
metadata

Unnamed: 0,Tumor_ID,sort,metastatic_potential_group
HCI005_Metastatic,HCI005,Metastatic,low
H3204_Metastatic,H3204,Metastatic,low
H4272_Metastatic,H4272,Metastatic,low
HCI009_Metastatic,HCI009,Metastatic,moderate
HCI011_Metastatic,HCI011,Metastatic,moderate
HCI001_Metastatic,HCI001,Metastatic,moderate
H5097_Metastatic,H5097,Metastatic,high
J2036_Metastatic,J2036,Metastatic,high
J53353_Metastatic,J53353,Metastatic,high
HCI010_Metastatic,HCI010,Metastatic,high


In [44]:
metadata.to_csv('met_metadata.csv')

In [45]:
gene_metadata = pd.DataFrame()
gene_metadata['gene'] = tumor_genes
gene_metadata = gene_metadata.set_index('gene')
gene_metadata['sort'] = 'tumor'
for i in tumor_genes:
    if i in low_group_tumor_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'low'
    elif i in intermediate_group_tumor_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'moderate'
    elif i in high_group_tumor_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'high'

gene_metadata

Unnamed: 0_level_0,sort,group
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
MIA,tumor,low
MTCO3P12,tumor,low
FOS,tumor,low
FOSB,tumor,low
PSAP,tumor,low
...,...,...
TTC3,tumor,high
UBE2S,tumor,high
UBE2T,tumor,high
UPF3A,tumor,high


In [46]:
gene_metadata.to_csv('tumor_genes_metadata.csv')

In [47]:
gene_metadata = pd.DataFrame()
gene_metadata['gene'] = met_genes
gene_metadata = gene_metadata.set_index('gene')
gene_metadata['sort'] = 'tumor'
for i in met_genes:
    if i in low_group_met_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'low'
    elif i in intermediate_group_met_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'moderate'
    elif i in high_group_met_genes_overlap:
        gene_metadata.loc[i, 'group'] = 'high'

gene_metadata

Unnamed: 0_level_0,sort,group
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
KLK6,tumor,low
ATP5F1E,tumor,low
KLK10,tumor,low
PSMD2,tumor,low
MYL6,tumor,low
...,...,...
NCCRP1,tumor,high
MTND2P28,tumor,high
MT-RNR1,tumor,high
ATF4,tumor,high


In [48]:
gene_metadata.to_csv('met_genes_metadata.csv')