## Annotate probe IDs with gene symbols 

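To identify the genes influenced by compounds with high MAS and low TAS, we map L1000 probe IDs to gene symbols and extract the probes with large absolute readouts for those compounds.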

In [1]:
import pathlib
import pandas as pd
import numpy as np

In [2]:
# Number of rows taken from the head of the activity table when picking the top compounds
top_n_cpds = 6
# Number of compounds taken from the low end of the mas_tas_dff ranking
bottom_n_cpds = 2 
# A probe is kept only if its absolute L1000 readout is strictly greater than this value
gene_cut = 2

In [3]:
# Probe metadata (one row per probe) published with L1000CDS.
# Source listing: http://amp.pharm.mssm.edu/public/L1000CDS_download/
url = "http://amp.pharm.mssm.edu/public/L1000CDS_download/apiRowMeta.json"
map_df = pd.read_json(url)

# Lookup table: probe ID -> gene symbol, used further down to annotate probes
updater = map_df.set_index("pr_id")["pr_gene_symbol"].to_dict()

print(map_df.shape)
map_df.head()

(36590, 10)


Unnamed: 0,is_lm,is_l1000,pr_id,pr_gene_id,pr_gene_title,ds_index,pr_gene_symbol,_id,is_bing,pr_pool_id
0,0.0,True,202938_x_at,100510314///100510451///27341///91695,-666,10985.0,-666,51438660dfe7719b8846fd9c,0.0,[inferred]
1,0.0,True,204006_s_at,2214///2215,-666,11270.0,-666,51438660dfe7719b8846feb9,0.0,[inferred]
2,0.0,True,204060_s_at,5613///5616,-666,11287.0,-666,51438660dfe7719b8846feca,0.0,[inferred]
3,0.0,True,204419_x_at,3047///3048,-666,11399.0,-666,51438660dfe7719b8846ff3a,0.0,[inferred]
4,0.0,True,204438_at,414308///4360,-666,11405.0,-666,51438660dfe7719b8846ff40,0.0,[inferred]


In [4]:
# Read the per-compound activity summary (tab-separated)
file = pathlib.Path("../6.paper_figures/data/highmas_lowtas_compounds.tsv")
activity_df = pd.read_table(file)

print(activity_df.shape)
activity_df.head(3)

(137, 5)


Unnamed: 0,cpd,mas_mean,tas_mean,cpd_count,mas_tas_dff
0,alisertib,0.709348,0.200119,60,0.509229
1,dasatinib,0.725527,0.24769,60,0.477837
2,brequinar,0.638118,0.198525,60,0.439593


In [5]:
# What are the top compounds that change lots of MAS but not TAS
# Rank on mas_tas_dff explicitly instead of trusting the row order of the
# input file, so the selection stays correct if the TSV is ever re-sorted.
top_cpds = activity_df.nlargest(top_n_cpds, "mas_tas_dff").cpd.tolist()
top_cpds

['alisertib', 'dasatinib', 'brequinar', 'aphidicolin', 'at13387', 'sta-5326']

In [6]:
# Compounds with the smallest mas_tas_dff values (ascending sort, first rows kept)
bottom_cpds = (
    activity_df
    .sort_values(by="mas_tas_dff")
    ["cpd"]
    .iloc[:bottom_n_cpds]
    .tolist()
)
bottom_cpds

['l-ergothioneine', 'lasalocid']

In [7]:
# Every compound examined below: the top picks followed by the bottom picks
focus_cps = [*top_cpds, *bottom_cpds]

In [8]:
# Read the L1000 consensus profiles and keep only the compounds of interest
data_dir = pathlib.Path("../1.Data-exploration/")
file = pathlib.Path(f"{data_dir}/Consensus/L1000/moa_sizes_consensus_datasets/modz_level5_data.csv")

df = (
    pd.read_csv(file)
    .query("pert_iname in @focus_cps")
    .reset_index(drop=True)
)

# Number of profiles retained per compound, then the overall frame size
print(df["pert_iname"].value_counts())
print(df.shape)
df.head(2)

sta-5326           6
dasatinib          6
lasalocid          6
l-ergothioneine    6
at13387            6
alisertib          6
brequinar          6
aphidicolin        6
Name: pert_iname, dtype: int64
(48, 980)


Unnamed: 0,sig_id,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,212536_at,218529_at,211071_s_at,203341_at,205379_at,pert_id,pert_idose,dose,pert_iname,moa
0,REP.A004_A549_24H:P01,-2.190391,-0.158385,-0.239082,-0.454267,1.424989,-1.496949,0.057564,0.511198,2.971821,...,-0.63888,-2.140566,1.29253,0.114603,-3.034249,BRD-K26341917,10 uM,6,l-ergothioneine,free radical scavenger
1,REP.A004_A549_24H:P02,-4.144145,-1.078669,0.991581,0.536611,5.4022,-1.045628,-1.736026,-0.292926,4.789339,...,2.06238,1.383225,-0.037615,-1.965599,-2.945721,BRD-K26341917,3.33 uM,5,l-ergothioneine,free radical scavenger


In [9]:
# Obtain background gene lists: every probe column measured in the L1000 frame
background_df = pd.DataFrame(
    df.columns[df.columns.str.endswith("_at")],
    columns=["probe"]
)

# Annotate each probe with its gene symbol. A per-element dictionary lookup is
# used instead of Series.replace, which is slow with a mapping this large;
# probes missing from the mapping keep their probe ID, as before.
background_df = background_df.assign(
    gene_symbol=background_df.probe.map(lambda probe: updater.get(probe, probe))
)

output_file = pathlib.Path("results", "background_gene_list.tsv")
# Make sure the output directory exists so the cell runs on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)
background_df.to_csv(output_file, sep="\t", index=False)

background_df.head()

Unnamed: 0,probe,gene_symbol
0,200814_at,PSME1
1,222103_at,ATF1
2,201453_x_at,RHEB
3,204131_s_at,FOXO3
4,200059_s_at,RHOA


In [10]:
# Probe (expression) columns are the ones whose name ends in "_at"
probe_cols = df.columns[df.columns.str.endswith("_at")].tolist()

# Median readout per compound/MOA for every probe, reshaped to long format.
# The aggregation is restricted to the probe columns: the frame also carries
# string metadata (sig_id, pert_id, pert_idose), and pandas >= 2.0 raises on a
# median over those where older versions silently dropped them.
expression_df = (
    df
    .groupby(["pert_iname", "moa"])[probe_cols]
    .median()
    .reset_index()
    .melt(
        id_vars=["pert_iname", "moa"],
        value_vars=probe_cols,
        value_name="L1000_readout",
        var_name="L1000_probe"
    )
)

# Keep only probes whose absolute median readout exceeds the cutoff
expression_df = (
    expression_df
    .assign(L1000_abs_readout = expression_df.L1000_readout.abs())
    .query("L1000_abs_readout > @gene_cut")
    .sort_values(by="pert_iname")
    .reset_index(drop=True)
)

# Annotate probes with gene symbols via a per-element dictionary lookup
# (faster than Series.replace with a large mapping); probes missing from the
# mapping keep their probe ID, as before.
expression_df = expression_df.assign(
    gene_symbol=expression_df.L1000_probe.map(lambda probe: updater.get(probe, probe))
)

output_file = pathlib.Path("results", "differential_mas_vs_tas_genes.tsv")
# Make sure the output directory exists so the cell runs on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)
expression_df.to_csv(output_file, sep="\t", index=False)

print(expression_df.shape)
expression_df.head()

(702, 6)


Unnamed: 0,pert_iname,moa,L1000_probe,L1000_readout,L1000_abs_readout,gene_symbol
0,alisertib,aurora kinase inhibitor,205450_at,-2.12745,2.12745,PHKA1
1,alisertib,aurora kinase inhibitor,215903_s_at,2.36095,2.36095,MAST2
2,alisertib,aurora kinase inhibitor,205039_s_at,4.63955,4.63955,IKZF1
3,alisertib,aurora kinase inhibitor,218346_s_at,3.922725,3.922725,SESN1
4,alisertib,aurora kinase inhibitor,40829_at,2.388125,2.388125,WDTC1


In [11]:
# Which genes are consistently implicated?
# Tally the rows of expression_df per gene symbol. The columns are named via
# rename_axis / reset_index(name=...) rather than by renaming "index" and
# "gene_symbol", because the labels value_counts() produces differ between
# pandas 1.x and 2.x and the rename would mislabel the columns on 2.x.
# NOTE(review): this counts rows, so a gene covered by several probes would be
# counted more than once for the same compound; confirm that is acceptable.
gene_count_df = (
    expression_df
    .gene_symbol
    .value_counts()
    .rename_axis("gene")
    .reset_index(name="cpd_count")
)

gene_count_df.head(10)

Unnamed: 0,gene,cpd_count
0,EIF5,6
1,GNAS,5
2,IARS2,5
3,PPP2R3C,5
4,ADI1,4
5,CCNB2,4
6,NRIP1,4
7,ADCK3,4
8,CYCS,4
9,ACAT2,4
