# Original gCSI data for Klijn 2015

- Cell line annotations:  `'E-MTAB-2706.sdrf.txt'` --> how do we map these to our names from combined_rnaseq_data (which are the same as in PharmacoGx)?<br>
- Gene annotations:  `'140625_Klijn_geneToTranscript.txt'`<br>
- Drug annotations:  ?? <br>

RNA-seq dataset size (**ArrayExpress** and **Klijn**): [26094 genes, 675 samples]<br>
RNA-seq dataset size (**PharmacoDB**): [22684 genes, 410 samples] --> after removing NA [22684 genes, 322 samples] (only 410 cell lines were screened with drugs; only ~330 cell lines were sequenced)

- **PharmacoDB** contains the RPKM values.
- The values in the RNA-Seq gene expression tables (Counts, RPKM, VSD) are consistent across **ArrayExpress** and **Klijn**. The difference is in the cell line sample names/identifiers (columns). **ArrayExpress** uses sample names such as *Sample 290* while **Klijn** uses the conventional cell line names such as *NCI-H358* (https://www.ncbi.nlm.nih.gov/biosample/9556749).<br> Note! The https://www.ncbi.nlm.nih.gov/biosample/9556749 provides various identifiers including `BioSample`, `Sample name`, and `SRA`. The cell line name mappings between `ArrayExpress` and `Klijn` are not consistent with the identifiers in the NCBI website. For example: in the gCSI resources, *NCI-H358* is mapped to *Sample 290*. In the NCBI, however, *NCI-H358* is mapped to *Sample 6*.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
# Make the local 'utils_py' folder (resolved relative to the current working directory)
# importable. A notebook has no __file__, hence os.getcwd() instead of
# os.path.dirname(os.path.relpath(__file__)).
file_path = os.getcwd()  # os.path.dirname(os.path.relpath(__file__))
utils_path = os.path.abspath(os.path.join(file_path, 'utils_py'))
sys.path.append(utils_path)
import utils_all as utils

# NOTE(review): this silences *all* warnings, including pandas warnings such as
# SettingWithCopyWarning that could point at real problems further down.
import warnings
warnings.filterwarnings('ignore')

# Seed constant; not referenced in the cells visible here.
SEED=0

Using TensorFlow backend.


In [2]:
# Root folder of the original gCSI / Klijn 2015 downloads.
# Set the GCSI_DATADIR environment variable to run the notebook on another machine;
# the default is the original author's local path.
DATADIR = os.environ.get('GCSI_DATADIR', '/Users/apartin/work/jdacs/cell-line-data/gcsi/klijn_2015/original')

## RNA-Seq count data (coding genes)

In [3]:
# Counts table from the Klijn folder; its gene column is named 'GeneID'
datapath = os.path.join(DATADIR, 'Klijn', '140331_countsExport.txt')
df_cnt1 = pd.read_csv(datapath, sep='\t').rename(columns={'GeneID': 'gene_id'})
print(df_cnt1.shape)

# Counts table from the ArrayExpress folder; its gene column is named 'geneID'
datapath = os.path.join(DATADIR, 'ArrayExpress/E-MTAB-2706.additional.1', '140625_Klijn_counts_coding.txt')
df_cnt2 = pd.read_csv(datapath, sep='\t').rename(columns={'geneID': 'gene_id'})
print(df_cnt2.shape)

(26094, 676)
(26094, 676)


In [4]:
# Consistent in count values.
# Sum the *absolute* differences: with a plain signed sum, positive and negative
# discrepancies could cancel and report 0 even when the tables differ.
print(np.abs(df_cnt1.values - df_cnt2.values).sum())

# Inconsistent in column (sample) names, so the frames as a whole are not equal
print(df_cnt1.equals(df_cnt2))
print(df_cnt1.columns.tolist()[:5])
print(df_cnt2.columns.tolist()[:5])

0
False
['gene_id', 'NCI-H358', 'NCI-H292', 'NCI-H522', 'NCI-H650']
['gene_id', 'Sample 290', 'Sample 289', 'Sample 295', 'Sample 297']


## RNA-Seq RPKM data (coding genes)

In [5]:
# RPKM table from the Klijn folder; its gene column is named 'GeneID'
datapath = os.path.join(DATADIR, 'Klijn', '140331_RPKMExport.txt')
df_rpkm1 = pd.read_csv(datapath, sep='\t').rename(columns={'GeneID': 'gene_id'})
print(df_rpkm1.shape)

# RPKM table from the ArrayExpress folder; its gene column is named 'geneID'
datapath = os.path.join(DATADIR, 'ArrayExpress/E-MTAB-2706.additional.1', '140625_Klijn_RPKM_coding.txt')
df_rpkm2 = pd.read_csv(datapath, sep='\t').rename(columns={'geneID': 'gene_id'})
print(df_rpkm2.shape)

(26094, 676)
(26094, 676)


In [6]:
# Consistent in RPKM values.
# Sum the *absolute* differences: with a plain signed sum, positive and negative
# discrepancies could cancel and report 0 even when the tables differ.
print(np.abs(df_rpkm1.values - df_rpkm2.values).sum())

# Inconsistent in column (sample) names, so the frames as a whole are not equal
print(df_rpkm1.equals(df_rpkm2))
print(df_rpkm1.columns.tolist()[:5])
print(df_rpkm2.columns.tolist()[:5])

0.0
False
['gene_id', 'NCI-H358', 'NCI-H292', 'NCI-H522', 'NCI-H650']
['gene_id', 'Sample 290', 'Sample 289', 'Sample 295', 'Sample 297']


## RNA-Seq VSD (DESeq) data (coding genes)

In [7]:
# VSD table from the Klijn folder; its gene column is named 'GeneID'
datapath = os.path.join(DATADIR, 'Klijn', '140331_VSDexport.txt')
df_vsd1 = pd.read_csv(datapath, sep='\t').rename(columns={'GeneID': 'gene_id'})
print(df_vsd1.shape)

# VSD table from the ArrayExpress folder; its gene column is named 'geneID'
datapath = os.path.join(DATADIR, 'ArrayExpress/E-MTAB-2706.additional.2', '140625_Klijn_VSD_coding.txt')
df_vsd2 = pd.read_csv(datapath, sep='\t').rename(columns={'geneID': 'gene_id'})
print(df_vsd2.shape)

(26094, 676)
(26094, 676)


In [8]:
# Consistent in VSD values.
# Sum the *absolute* differences: with a plain signed sum, positive and negative
# discrepancies could cancel and report 0 even when the tables differ.
print(np.abs(df_vsd1.values - df_vsd2.values).sum())

# Inconsistent in column (sample) names, so the frames as a whole are not equal
print(df_vsd1.equals(df_vsd2))
print(df_vsd1.columns.tolist()[:5])
print(df_vsd2.columns.tolist()[:5])

0.0
False
['gene_id', 'NCI-H358', 'NCI-H292', 'NCI-H522', 'NCI-H650']
['gene_id', 'Sample 290', 'Sample 289', 'Sample 295', 'Sample 297']


### Sort sample names

In [9]:
def sort_sample_names(df):
    """
    Move 'gene_id' into the index and order the remaining columns by name.

    Args:
        df : expression df with a 'gene_id' column; all other columns are samples
    Returns:
        new df indexed by 'gene_id' whose columns are sorted; the input is not modified
    """
    indexed = df.set_index('gene_id')
    sample_names = indexed.columns
    order = np.argsort(sample_names)
    return indexed[sample_names[order]]

In [10]:
# Index every table by gene_id and sort its sample columns, one data type per line
# (Klijn table first, ArrayExpress table second).
df_cnt1, df_cnt2 = [sort_sample_names(frame) for frame in (df_cnt1, df_cnt2)]
df_rpkm1, df_rpkm2 = [sort_sample_names(frame) for frame in (df_rpkm1, df_rpkm2)]
df_vsd1, df_vsd2 = [sort_sample_names(frame) for frame in (df_vsd1, df_vsd2)]

In [12]:
# Preview a small positional slice (rows 1-2, columns 1-9) of the Klijn VSD table
# df_cnt1.columns[:5]
df_vsd1.iloc[1:3, 1:10]

Unnamed: 0_level_0,143B,23132/87,501A,537 MEL,59M,624 mel,769-P,786-O,888
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10,8.765338,9.503171,8.862888,7.992651,7.992651,8.873494,9.035808,8.818618,9.022415
100,10.654475,10.258306,10.37446,10.605506,10.390509,10.394178,9.942318,10.292453,10.742507


In [13]:
# Same positional slice (rows 1-2, columns 1-9) of the Klijn RPKM table
df_rpkm1.iloc[1:3, 1:10]

Unnamed: 0_level_0,143B,23132/87,501A,537 MEL,59M,624 mel,769-P,786-O,888
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10,0.029724,1.015054,0.049337,0.0,0.0,0.050882,0.111179,0.036654,0.105182
100,14.641576,7.786291,9.415679,13.419594,8.221902,9.513701,3.481694,7.583366,15.665344


In [14]:
# Same positional slice (rows 1-2, columns 1-9) of the Klijn counts table
df_cnt1.iloc[1:3, 1:10]

Unnamed: 0_level_0,143B,23132/87,501A,537 MEL,59M,624 mel,769-P,786-O,888
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10,5,39,1,0,0,2,7,3,4
100,2865,348,222,528,239,435,255,722,693


## Problem with non-unique mappings between `gene_name` and `entrez` identifiers

In [12]:
# Gene annotation table. Resolved relative to DATADIR, like the other ArrayExpress
# files, instead of a second hard-coded absolute path.
datapath = os.path.join(DATADIR, 'ArrayExpress/E-MTAB-2706.additional.1', '140625_Klijn_geneToTranscript.txt')
df_gene_annot = pd.read_csv(datapath, sep='\t')
df_gene_annot = df_gene_annot.sort_values(by='gene_name').reset_index(drop=True)
print(df_gene_annot.shape)
display(df_gene_annot[:4])
# Number of distinct values per column; fewer gene_name values than rows means repeated names
print(df_gene_annot.nunique())


(26010, 5)


Unnamed: 0,gene_id,transcript_id,gene_name,transcript_name,entrez
0,GeneID:100533182,NR_039978,1/2-SBSRNA4,1/2-SBSRNA4,100533182
1,GeneID:440345,XM_003403461,61E3.4,61E3.4,440345
2,GeneID:1,NM_130786,A1BG,A1BG,1
3,GeneID:503538,NR_015380,A1BG-AS1,A1BG-AS1,503538


gene_id            26010
transcript_id      26010
gene_name          26007
transcript_name    26007
entrez             26010
dtype: int64


**Note that there are more genes in the dataframes than in the gene mapping table (??)**<br>
In the mappings there are 26010 genes<br>
In the dataframes there are 26094 genes<br>

In [13]:
# Expression table vs. annotation table: compare the number of rows (genes)
print(df_cnt1.shape, df_gene_annot.shape, sep='\n')

(26094, 676)
(26010, 5)


**Keep only those genes that appear in the gene annotations table**<br>
Note that the mapping between **entrez** and **gene_name** is not unique.

In [14]:
# Work on a copy so that df_cnt1 itself is left untouched
df = df_cnt1.copy()
to_name = 'gene_name'

# sort_sample_names() moves 'gene_id' into the index; bring it back as a column,
# otherwise the rename below is a no-op and the merge has no 'entrez' key.
if 'gene_id' not in df.columns and df.index.name == 'gene_id':
    df = df.reset_index()

# Left-merge onto the annotation table: keeps only genes that have an annotation
df = df.rename(columns={'gene_id':'entrez'})
df = df_gene_annot[['entrez', to_name]].merge(df, on='entrez', how='left')
print(df[['entrez', to_name]].nunique())

entrez       26010
gene_name    26007
dtype: int64


**Find genes with ambiguous names (gene names that repeat more than once)**

In [15]:
# Count how often each gene name occurs and keep the names that occur more than once
name_counts = df[to_name].value_counts()
rep = name_counts[name_counts > 1]
print('Genes names that repeat more than once:', rep.index.tolist())
print(rep)

Genes names that repeat more than once: ['DUX4', 'TTL', 'ZNF605']
DUX4      2
TTL       2
ZNF605    2
Name: gene_name, dtype: int64


In [16]:
# For every repeated gene name, show the entrez ids that share it
for gene_name in rep.index:
    same_name = df[to_name] == gene_name
    display(df.loc[same_name, ['entrez', to_name]])

Unnamed: 0,entrez,gene_name
5326,100288687,DUX4
5327,22947,DUX4


Unnamed: 0,entrez,gene_name
24260,646982,TTL
24261,150465,TTL


Unnamed: 0,entrez,gene_name
25735,90462,ZNF605
25736,100289635,ZNF605


Some gene names are ambiguous (they appear more than once, each time with a different entrez ID).
The table below lists each such gene name, its entrez IDs, and the entrez ID we keep:

| Gene name | repeats | entrez           | final entrez | why |
| --------- | ------- | ---------------- | ------------ | --- |
| ZNF605    | 2       | 90462, 100289635 | 100289635    | 90462 was dropped by NCBI |
| DUX4      | 2       | 22947, 100288687 | 100288687    | 22947 is marked as pseudo |
| TTL       | 2       | 646982, 150465   | 150465       | 150465 is protein coding |

ZNF605:<br>
https://www.ncbi.nlm.nih.gov/gene/?term=100289635<br>
https://www.ncbi.nlm.nih.gov/gene/?term=90462<br>
DUX4:<br>
https://www.ncbi.nlm.nih.gov/gene/?term=100288687<br>
https://www.ncbi.nlm.nih.gov/gene/?term=22947<br>
TTL:<br>
https://www.ncbi.nlm.nih.gov/gene/?term=150465<br>
https://www.ncbi.nlm.nih.gov/gene/?term=646982<br>

We need to drop the following genes based on the entrez label:<br>
90462, 22947, 646982

### Finally, rename the genes and drop duplicate gene names (based on the analysis above)

In [17]:
def rename_genes(df, df_gene_annot, to_name='gene_name', entrez_to_drop=(90462, 22947, 646982)):
    """
    Replace entrez gene ids with the names found in the annotation table.

    Expression rows whose gene does not appear in the annotation table are dropped
    (the annotation table is the left side of a left merge).

    Args:
        df : expression df; 'gene_id' may be a column or the index (sort_sample_names
            leaves it in the index)
        df_gene_annot : gene name mappings; must contain the cols 'entrez' and to_name
        to_name : gene names will be mapped to the names that appear in the to_name col
        entrez_to_drop : entrez ids to remove after the merge; defaults to the three ids
            whose gene names are ambiguous in this dataset
    Returns:
        new df with a to_name col followed by the sample cols (no 'entrez' col);
        the input df is not modified
    Raises:
        AssertionError : if 'gene_id' is neither a column nor an index level of df
    """
    # Accept 'gene_id' in the index as well, so the function also works on frames
    # that already went through sort_sample_names
    if 'gene_id' not in df.columns and 'gene_id' in df.index.names:
        df = df.reset_index()
    assert 'gene_id' in df.columns.tolist(), "col 'gene_id' is not in the input df."

    # Map entrez gene names to to_name, while dropping expression for those genes that don't
    # appear in the mapping table
    df = df.rename(columns={'gene_id': 'entrez'})
    df = df_gene_annot[['entrez', to_name]].merge(df, on='entrez', how='left')

    # Drop one of the ambiguous genes
    keep = ~df['entrez'].isin(list(entrez_to_drop))

    # Drop entrez col (not in place: the filtered frame is a slice of the merge result)
    return df.loc[keep].drop(columns='entrez')

In [18]:
# Map gene ids to gene names in every table, one data type per line
# (Klijn table first, ArrayExpress table second).
df_cnt1, df_cnt2 = [
    rename_genes(df=frame, df_gene_annot=df_gene_annot, to_name='gene_name')
    for frame in (df_cnt1, df_cnt2)
]
df_rpkm1, df_rpkm2 = [
    rename_genes(df=frame, df_gene_annot=df_gene_annot, to_name='gene_name')
    for frame in (df_rpkm1, df_rpkm2)
]
df_vsd1, df_vsd2 = [
    rename_genes(df=frame, df_gene_annot=df_gene_annot, to_name='gene_name')
    for frame in (df_vsd1, df_vsd2)
]

## Other metadata: 'E-MTAB-2706.sdrf.txt'

In [19]:
# Sample metadata (SDRF) from ArrayExpress
datapath = os.path.join(DATADIR, 'ArrayExpress', 'E-MTAB-2706.sdrf.txt')
df_sdrf = pd.read_csv(datapath, sep='\t')
print(df_sdrf.shape)

# Show the first three records as columns so that all fields fit on screen
display(df_sdrf.iloc[:3].T)

(1350, 36)


Unnamed: 0,0,1,2
Source Name,Sample 1,Sample 1,Sample 2
Comment[ENA_SAMPLE],ERS395207,ERS395207,ERS395272
Material Type,cell,cell,cell
Characteristics[organism],Homo sapiens,Homo sapiens,Homo sapiens
Term Source REF,NCBI Taxonomy,NCBI Taxonomy,NCBI Taxonomy
Term Accession Number,9606,9606,9606
Characteristics[cell line],A2780,A2780,COLO 679
Characteristics[tissue supergroup],ovary,ovary,skin
Characteristics[metatastic tissue],not applicable,not applicable,not applicable
Characteristics[organism part],ovary,ovary,skin
