In [1]:
# Import all packages
import os
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# Global parameters / values
# Input and output directories (relative paths) and the CSV file name of each dataset.
dataAux_dir = "../data_aux/"
results_dir = "../results/"
Klaeger_filename = "Klaeger.csv"
Huang_filename = "Huang.csv"
Annes100_filename = "Annes100.csv"
Annes500_filename = "Annes500.csv"

# Name of column of official gene symbols
geneSymbolColumn = "GeneSymbol"

# Thresholds for converting kinase activity metrics to boolean values
# - use_diff: bool
#     Use difference between OTSSP167 and another compound as value to threshold
#       For Annes and Huang datasets: use the absolute difference and compare to threshold diff_percent_thresh
#         (Huang: column HTH01091 minus OTSSP167; Annes: column STF1285 minus OTSSP167)
#       For Klaeger dataset: use fold difference and compare to threshold diff_fold_thresh
#         (column CC401 divided by OTSSP167)
#     If False, min_percent_thresh / max_percent_thresh are used instead for the Huang and Annes datasets
# - diff_percent_thresh: int or float
#     Threshold absolute difference (% control (Annes) or % activity remaining (Huang)) between a less toxic compound
#     and OTSSP167 at which a target (kinase) is considered a potential target for the toxicity of OTSSP167
#     NOTE: this name is reassigned (to 50) by the parameters cell of the STF-1285 concentration section further
#     down; re-run this cell before re-running the OTSSP167 boolean filtering.
# - diff_fold_thresh: int or float
#     Threshold fold difference (Kd_app) between a less toxic compound and OTSSP167 at which a target (kinase)
#     is considered a potential target for the toxicity of OTSSP167
# - min_percent_thresh: int or float
#     Threshold % control (Annes) or % activity remaining (Huang) at which a target is considered to be inhibited by OTSSP167
# - max_percent_thresh: int or float
#     Threshold % control (Annes) or % activity remaining (Huang) at which a target is considered to be not inhibited by
#     a less toxic compound
use_diff = True
diff_percent_thresh = 20
diff_fold_thresh = 20
min_percent_thresh = 25
max_percent_thresh = 75

# Compare 100nM OTSSP167 to 100nM STF1285 (Annes100_filename) or 500nM STF1285 (Annes500_filename)
Annes_filename = Annes100_filename

In [3]:
## Read in data
def _read_aux(csv_name):
    """Load one CSV file located in dataAux_dir into a DataFrame."""
    return pd.read_csv(os.path.join(dataAux_dir, csv_name))


# df1: Klaeger, df2: Huang, df3: Annes (file selected through Annes_filename)
df1 = _read_aux(Klaeger_filename)
df2 = _read_aux(Huang_filename)
df3 = _read_aux(Annes_filename)

## Boolean filtering

Add a new column `bool` to each DataFrame indicating whether a target is both inhibited by OTSSP167 and *not* inhibited by a less toxic compound. Then find the intersection across all datasets, i.e. the targets flagged in every one of them.

In [4]:
# Give every dataset a 'bool' column that starts out as all-NaN;
# the following cell fills it in according to the chosen thresholds.
for frame in (df1, df2, df3):
    frame['bool'] = np.nan

In [5]:
# Flag, per dataset, the targets that satisfy the selection criterion.
# Both branches assign a complete boolean column (True/False for every row).
# Writing True into an all-NaN float column through .loc is an incompatible-dtype
# assignment that recent pandas versions warn about and are slated to reject, and
# it would leave non-matching rows as NaN instead of False.
if use_diff:
    # Klaeger: fold difference CC401 / OTSSP167; Huang and Annes: absolute difference.
    # Rows with missing values compare as False.
    df1['bool'] = (df1['CC401'] / df1['OTSSP167']) >= diff_fold_thresh
    df2['bool'] = (df2['HTH01091'] - df2['OTSSP167']) >= diff_percent_thresh
    df3['bool'] = (df3['STF1285'] - df3['OTSSP167']) >= diff_percent_thresh
else:
    # Klaeger: CC401 is infinite while OTSSP167 is below +inf.
    df1['bool'] = np.isinf(df1['CC401']) & (df1['OTSSP167'] < np.inf)
    # Huang / Annes: the other compound is at or above max_percent_thresh while
    # OTSSP167 is at or below min_percent_thresh.
    df2['bool'] = (df2['HTH01091'] >= max_percent_thresh) & (df2['OTSSP167'] <= min_percent_thresh)
    df3['bool'] = (df3['STF1285'] >= max_percent_thresh) & (df3['OTSSP167'] <= min_percent_thresh)

In [6]:
# Inner-join the three datasets on gene symbol AND 'bool': a row survives only when
# the gene is present in every dataset with the same 'bool' value in each.
join_keys = [geneSymbolColumn, 'bool']
intersect = df1.merge(df2, how='inner', on=join_keys).merge(df3, how='inner', on=join_keys)

# Keep the gene symbols flagged True, sorted, with a fresh 0..n-1 index.
intersect = (
    intersect.loc[intersect["bool"] == True, geneSymbolColumn]
    .sort_values()
    .reset_index(drop=True)
)
display(intersect)

0        ABL1
1       AURKA
2       AURKB
3         BTK
4      CAMKK2
5        CDK2
6        CDK9
7     CSNK2A1
8        DDR2
9       EPHB2
10      GSK3B
11      IKBKE
12      IRAK1
13      IRAK4
14        LCK
15     MAP2K1
16    MAP3K11
17     MAP4K5
18     MAPK10
19     MAPK15
20       PAK4
21     PRKAA1
22    RPS6KA1
23    RPS6KA3
24    RPS6KA5
25       SIK3
26      STK16
27       STK3
28       TBK1
Name: GeneSymbol, dtype: object

In [7]:
# Save the intersecting gene symbols, one per line, followed by a trailing newline.
intersect_path = os.path.join(results_dir, "intersect.txt")
with open(intersect_path, "w") as out_file:
    out_file.write("\n".join(intersect) + "\n")

## Rank ordering

In [8]:
def _add_rank(df, comparator_col):
    """Return `df` re-ordered with 'diff' and 'rank' columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the columns `comparator_col` and 'OTSSP167'.
    comparator_col : str
        Column of the compound that OTSSP167 is compared against.

    Returns
    -------
    pandas.DataFrame
        A new frame where
        - 'diff' = df[comparator_col] - df['OTSSP167'],
        - rows are sorted by 'diff' descending, ties broken by 'OTSSP167' ascending,
        - the index is reset to 0..n-1,
        - 'rank' = row position / number of rows (first row gets 0).
    """
    ranked = (
        df.assign(diff=df[comparator_col] - df['OTSSP167'])
        .sort_values(by=['diff', 'OTSSP167'], ascending=[False, True])
        .reset_index(drop=True)
    )
    ranked['rank'] = ranked.index / len(ranked)
    return ranked


# The same ranking procedure is applied to each dataset; only the comparator column differs.
# NOTE(review): Klaeger (df1) is ranked by the absolute difference CC401 - OTSSP167 here,
# whereas the boolean filtering uses the fold difference CC401 / OTSSP167 -- confirm this is intended.
df1 = _add_rank(df1, 'CC401')
df2 = _add_rank(df2, 'HTH01091')
df3 = _add_rank(df3, 'STF1285')

In [9]:
# Gene symbols present in all three datasets.
# Sorted (rather than taken in raw set order) so the list order is the same in every
# kernel session; set iteration order of strings can change between Python processes.
# Assumes the shared gene symbols are mutually comparable (e.g. all strings).
targets = sorted(set(df1[geneSymbolColumn]) & set(df2[geneSymbolColumn]) & set(df3[geneSymbolColumn]))

In [10]:
def _first_rank(df, symbol):
    """Return the 'rank' value of the first row of `df` whose gene symbol equals `symbol`."""
    return df.loc[df[geneSymbolColumn] == symbol, 'rank'].values[0]


# Combined score of each shared target = sum of its rank in the three datasets,
# listed from the smallest sum to the largest.
rank = pd.Series({
    symbol: _first_rank(df1, symbol) + _first_rank(df2, symbol) + _first_rank(df3, symbol)
    for symbol in targets
}).sort_values(ascending=True)

# Temporarily lift the row limit so the whole Series is displayed.
with pd.option_context('display.max_rows', None):
    display(rank)

CAMKK2      0.308940
STK16       0.331851
IKBKE       0.333663
CDK9        0.377823
GSK3B       0.394740
MAPK10      0.435189
AURKA       0.553542
MAPK8       0.605219
MAPK9       0.656360
LCK         0.816810
TBK1        0.836174
MAP3K11     0.842789
SIK3        0.870554
TAOK1       0.878989
MAP2K1      0.884681
CDK2        0.904393
PAK4        0.926483
AURKB       0.956594
RPS6KA3     0.964508
EPHB4       0.974307
IRAK4       0.981657
MAP4K5      1.000901
STK3        1.013383
IRAK1       1.034375
ABL1        1.061081
MAPK15      1.101414
EPHB2       1.121680
RPS6KA1     1.126646
RPS6KA5     1.132501
DDR2        1.141264
BTK         1.145414
PIM1        1.189215
MAP3K5      1.189387
STK26       1.190169
CSNK2A1     1.231818
TGFBR1      1.262092
MAP2K2      1.263892
PRKAA1      1.306473
EPHA2       1.324888
RPS6KB1     1.326404
EPHA4       1.388369
MARK2       1.397299
MAP2K6      1.403014
SIK2        1.404141
CSK         1.438198
MAPKAPK5    1.452587
RIPK2       1.466090
MARK3       1

In [11]:
# Save the combined ranks (gene symbol index included) as a tab-separated file.
rank_tsv_path = os.path.join(results_dir, "rank.tsv")
rank.to_csv(rank_tsv_path, sep="\t", index=True)

## Boolean filtering based on increasing STF-1285 concentration

This is not meant to identify toxic targets of OTSSP167 but instead to get an idea of targets that may be responsible for toxicity of STF-1285 at higher concentrations.

In [12]:
# Parameters
# NOTE: these assignments reuse the names set in the global parameters cell at the top of the
# notebook. After this cell runs, diff_percent_thresh is 50 instead of 20, so re-running the
# OTSSP167 boolean filtering without first re-running the global parameters cell would use 50.
# min_percent_thresh and max_percent_thresh keep their earlier values (25 / 75); in this section
# they appear only in the commented-out conditions of the filtering cell below.
diff_percent_thresh = 50
min_percent_thresh = 25
max_percent_thresh = 75

In [13]:
# Load the two Annes datasets: df4 <- Annes100_filename, df5 <- Annes500_filename.
df4, df5 = (
    pd.read_csv(os.path.join(dataAux_dir, csv_name))
    for csv_name in (Annes100_filename, Annes500_filename)
)

In [14]:
# Join the 100 nM and 500 nM datasets on gene symbol; columns present in both
# frames receive the " (100 nM)" / " (500 nM)" suffixes.
low_dose_col = "STF1285 (100 nM)"
high_dose_col = "STF1285 (500 nM)"
stf1285 = df4.merge(df5, how='inner', on=[geneSymbolColumn], suffixes=(" (100 nM)", " (500 nM)"))

# Keep only the gene symbol and the two STF1285 columns, add
# diff = value at 100 nM - value at 500 nM, and order rows from largest to smallest diff.
stf1285 = (
    stf1285[[geneSymbolColumn, low_dose_col, high_dose_col]]
    .assign(diff=lambda frame: frame[low_dose_col] - frame[high_dose_col])
    .sort_values(by='diff', ascending=False)
    .reset_index(drop=True)
)

In [15]:
# Flag targets whose 100 nM - 500 nM difference is at least diff_percent_thresh.
# Two additional conditions are currently disabled and kept here for reference only:
#   stf1285['STF1285 (500 nM)'] <= min_percent_thresh
#   stf1285['STF1285 (100 nM)'] >= max_percent_thresh
stf1285['bool'] = stf1285['diff'].ge(diff_percent_thresh)

# Show the gene symbols of the flagged targets.
stf1285.loc[stf1285['bool'], geneSymbolColumn]

0     CSNK2A1
1      MAP2K3
2        JAK3
3       MARK2
4       NUAK2
5       ROCK1
6       PHKG2
7       LRRK2
8        MELK
9        BUB1
10     BMPR1B
11        CIT
12       AAK1
13        SLK
14     MAP4K3
15    CSNK2A2
16      MYO3B
17      EPHB6
18      STK10
Name: GeneSymbol, dtype: object