**PharmacoDB:** https://pharmacodb.pmgenomics.ca/datasets/4<br>
PharmacoGx does not mention which normalization is applied to the `rnaseq` data returned by `molecularProfiles()`.
Assuming that http://research-pub.gene.com/gCSI-cellline-data/ is the correct data resource, the data are VSD (i.e., generated with DESeq from read-count data). In that case, we can't simply integrate the gCSI RNA-Seq data into our combined dataframes.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
from glob import glob
from collections import OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utils: make the local `utils_py` directory importable.
# A notebook has no __file__, so the kernel's working directory is used as the
# anchor (a script would use os.path.dirname(os.path.relpath(__file__)) instead).
file_path = os.getcwd()
utils_path = os.path.abspath(os.path.join(file_path, 'utils_py'))
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if utils_path not in sys.path:
    sys.path.append(utils_path)
import utils_all as utils

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Seed value; presumably consumed by later cells -- TODO confirm.
SEED = 0

Using TensorFlow backend.


In [2]:
# Directory holding the gCSI tab-separated tables. Set the GCSI_DATADIR
# environment variable to point elsewhere; the original local path stays the
# fallback so existing setups keep working unchanged.
DATADIR = os.environ.get('GCSI_DATADIR', '/Users/apartin/Dropbox/work/pilot1/pharmaco/gcsi')

In [3]:
# Read every tab-separated table found directly in DATADIR, keyed by file name
# and ordered alphabetically.
# glob('*') also returns sub-directories, which pd.read_csv cannot open, so only
# regular files are kept. All paths share the DATADIR prefix, so sorting the
# full paths gives the same order as sorting the file names.
files = sorted(f for f in glob(os.path.join(DATADIR, '*')) if os.path.isfile(f))
fdict = OrderedDict((os.path.basename(f), pd.read_csv(f, sep='\t')) for f in files)
list(fdict.keys())

['gcsi_cellmeta',
 'gcsi_drugmeta',
 'gcsi_feameta',
 'gcsi_phenometa',
 'gcsi_rnaseq',
 'gcsi_rspdata',
 'gcsi_sensnum']

In [4]:
# Bind each loaded table to its own name for the cells below.
cellmeta, drugmeta, feameta, phenometa, rnaseq, rspdata, sensnum = (
    fdict[table_name]
    for table_name in (
        'gcsi_cellmeta',
        'gcsi_drugmeta',
        'gcsi_feameta',
        'gcsi_phenometa',
        'gcsi_rnaseq',
        'gcsi_rspdata',
        'gcsi_sensnum',
    )
)

In [5]:
# Move the row index into a regular 'ENSG_gene_id' column.
# Both frames are rebuilt from the tables stored in `fdict` rather than from
# their own previous value, so re-running this cell (or running it after a later
# cell has rebound `rnaseq`) always yields the same result instead of stacking a
# second, duplicate 'ENSG_gene_id' column.
feameta = fdict['gcsi_feameta'].reset_index().rename(columns={'index': 'ENSG_gene_id'})
rnaseq = fdict['gcsi_rnaseq'].reset_index().rename(columns={'index': 'ENSG_gene_id'})

# Expression data

In [6]:
# Dimensions first, then the top-left corner of the expression table.
print(rnaseq.shape)
display(rnaseq.head(2).iloc[:, :7])

(22684, 323)


Unnamed: 0,ENSG_gene_id,23132-87,769-P,786-0,A172,A2058,A2780
0,ENSG00000000003_at,-0.257433,0.007633,-0.479304,-1.95416,-0.003535,0.835721
1,ENSG00000000005_at,-0.429652,1.864555,-0.429652,-0.429652,-0.429652,4.250555


In [7]:
# Dimensions of the feature metadata, followed by its first two rows.
print(feameta.shape)
display(feameta.head(2))

(22684, 4)


Unnamed: 0,ENSG_gene_id,Symbol,GeneID,Location
0,ENSG00000000003_at,TSPAN6,GeneID:7105,chrX:99883795-99891794
1,ENSG00000000005_at,TNMD,GeneID:64102,chrX:99839790-99854882


In [8]:
# Replace the 'ENSG_gene_id' key of the expression table with the matching
# 'GeneID' from the feature metadata.
# validate='one_to_one' makes pandas raise MergeError if either table repeats an
# ENSG_gene_id, instead of silently multiplying expression rows; how='inner'
# spells out the default join so the row-dropping behaviour is explicit.
rnaseq = (
    feameta[['ENSG_gene_id', 'GeneID']]
    .merge(rnaseq, on='ENSG_gene_id', how='inner', validate='one_to_one')
    .drop(columns=['ENSG_gene_id'])
)

In [9]:
# Same preview as before the merge, now with GeneID as the first column.
print(rnaseq.shape)
display(rnaseq.iloc[:, :7].head(2))

(22684, 323)


Unnamed: 0,GeneID,23132-87,769-P,786-0,A172,A2058,A2780
0,GeneID:7105,-0.257433,0.007633,-0.479304,-1.95416,-0.003535,0.835721
1,GeneID:64102,-0.429652,1.864555,-0.429652,-0.429652,-0.429652,4.250555


In [11]:
# Peek at the first rows in GeneID order. GeneID holds strings such as
# 'GeneID:10', so the ordering is lexicographic rather than numeric.
(
    rnaseq
    .sort_values(by='GeneID')
    .reset_index(drop=True)
    .iloc[:4, :7]
)

Unnamed: 0,GeneID,23132-87,769-P,786-0,A172,A2058,A2780
0,GeneID:1,-2.868397,-1.457918,-1.546713,1.484096,0.754382,-0.072495
1,GeneID:10,1.660978,0.821055,0.107958,0.360472,-1.020219,0.329382
2,GeneID:100,-0.317083,-0.720061,-0.270618,-0.444273,-0.225363,-0.257863
3,GeneID:1000,-1.024843,1.520102,1.774583,2.360834,1.094753,0.113761
