In [2]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
t = np.load("idxs.npy", allow_pickle=True)[1]
df = pd.read_csv("attention.csv.gz", index_col=0, low_memory=False)
df.index = t
df.columns = t
drug = df[:269]
drug_gene = drug.iloc[:, 329:]

In [4]:
name = dict(pd.read_csv("nsc_dti.csv", index_col=0)[["NSC", "Drug Name"]].values)

In [5]:
drug_gene.index = [name[i] for i in drug_gene.index]

In [6]:
graph = pd.DataFrame()
for i in ["Irinotecan", "Topotecan", "Camptothecin", "Daunorubicin", "Doxorubicin"]:
    tmp = drug_gene.loc[i]
    if tmp.shape[0] > 10:
        tmp = tmp.sort_values(ascending=False)[:5]
    else:
        tmp = tmp.iloc[1].sort_values(ascending=False)[:5]

    tmp = pd.DataFrame(tmp)
    tmp["name"] = list(tmp.columns) * 5
    tmp = tmp.reset_index()
    tmp.columns = ["genes", "Attention", "name"]
    graph = pd.concat([graph, tmp])

graph["name"] = graph["name"].str.replace(" hydrochloride", "")
graph = graph.reset_index(drop=True)

In [7]:
graph.to_csv("graph.csv")

In [8]:
dti = pd.read_csv("dti_drugbank.csv", index_col=0)
dti.index = [name[i] for i in dti.index]
dti = dti.loc[
    dti.index.isin(set(graph.name)), dti.columns.isin(set(graph.genes))
].drop_duplicates()

In [9]:
dtis = pd.DataFrame()
for i in dti.index:
    if i == "Doxorubicin":
        tmp = (
            (dti.loc["Doxorubicin"])
            .groupby(dti.loc["Doxorubicin"].index)
            .mean()
            .loc["Doxorubicin"]
            .reset_index()
        )
    else:
        tmp = dti.loc[i].reset_index()
    tmp["drug"] = [i] * len(tmp)
    tmp = tmp[tmp[i] > 0]
    tmp.columns = ["genes", "dtis", "name"]
    dtis = pd.concat([dtis, tmp])

In [10]:
dtis = pd.concat(
    [
        dtis,
        pd.DataFrame(
            [["TOP1", 1, "Topotecan"], ["TOP1MT", 1, "Topotecan"]], columns=dtis.columns
        ),
    ]
)
dtis

Unnamed: 0,genes,dtis,name
2,TOP2A,1.0,Daunorubicin
3,TOP2B,1.0,Daunorubicin
0,TOP1,1.0,Camptothecin
2,TOP2A,1.0,Doxorubicin
0,TOP1,1.0,Irinotecan
1,TOP1MT,1.0,Irinotecan
2,TOP2A,1.0,Doxorubicin
0,TOP1,1.0,Topotecan
1,TOP1MT,1.0,Topotecan


In [11]:
graph = graph.merge(dtis, how="left").fillna(0)
graph

Unnamed: 0,genes,Attention,name,dtis
0,TOP1MT,0.004951,Irinotecan,1.0
1,TOP1,0.004766,Irinotecan,1.0
2,MIR6807,0.003672,Irinotecan,0.0
3,MPO,0.003669,Irinotecan,0.0
4,KRT14,0.003661,Irinotecan,0.0
5,TOP1MT,0.004957,Topotecan,1.0
6,TOP1,0.004772,Topotecan,1.0
7,MIR2278,0.003691,Topotecan,0.0
8,MIR6728,0.003681,Topotecan,0.0
9,MMP3,0.00368,Topotecan,0.0


In [13]:
weights_list = []
for i in set(graph.name):
    t = graph[graph.name == i]
    max_weight, min_weight = max(t["Attention"]), min(t["Attention"])
    # (weight - min_weight) / (max_weight - min_weight)
    weights_list.extend(list((t["Attention"] - min_weight) / (max_weight - min_weight)))

In [14]:
df = graph.copy()
df["Attention"] = weights_list

Unnamed: 0,genes,Attention,name,dtis
0,TOP1MT,1.0,Irinotecan,1.0
1,TOP1,1.0,Irinotecan,1.0
2,MIR6807,0.019051,Irinotecan,0.0
3,MPO,0.010319,Irinotecan,0.0
4,KRT14,0.001763,Irinotecan,0.0
5,TOP1MT,0.0,Topotecan,1.0
6,TOP1,1.0,Topotecan,1.0
7,MIR2278,0.561724,Topotecan,0.0
8,MIR6728,0.01278,Topotecan,0.0
9,MMP3,0.011399,Topotecan,0.0


In [13]:
df.to_csv("graph_norm.csv")

In [14]:
df = pd.read_csv("attention.csv.gz", index_col=0, low_memory=False)
tmp = np.load("idxs.npy", allow_pickle=True)[1]
df.index = tmp
df.columns = tmp
drug = df[:269]
drug_gene = drug.iloc[:, 329:]

In [15]:
def pad_dict(dictionary, pad_value):
    max_length = max(len(v) for v in dictionary.values())
    for key in dictionary:
        dictionary[key] += [pad_value] * (max_length - len(dictionary[key]))
    return dictionary

In [16]:
res = {}
for i in drug_gene.index:
    tmp = drug_gene.loc[i]
    tmp = tmp[tmp != 0]
    if len(tmp) > 0:
        res[i] = list(tmp.sort_values(ascending=False).index)

padded_dict = pad_dict(res, None)
res = pd.DataFrame(padded_dict).T

# Over-representation analysis

In [17]:
results = pd.DataFrame()
for i in tqdm(res.index):
    while True:
        try:
            enr = gp.enrichr(
                gene_list=list(res.loc[i].dropna()[:100]),
                gene_sets="MSigDB_Hallmark_2020",
                organism="human",
                outdir=None,
            )
            tmp = enr.results
            tmp = tmp[tmp["Adjusted P-value"] < 0.05].sort_values(["Adjusted P-value"])[
                ["Term", "Adjusted P-value", "Genes"]
            ]
            tmp["drug"] = len(tmp) * [i]
            results = pd.concat([results, tmp])
            break
        except Exception as e:
            print(f"Error: {e}")
            print("Retrying...")

In [18]:
results.drug = results.drug.astype(int)
results.to_csv("gsea_results.csv")

In [19]:
name = dict(pd.read_csv("nsc_dti.csv", index_col=0)[["NSC", "Drug Name"]].values)

In [24]:
results = pd.read_csv("gsea_results.csv", index_col=0)
results

Unnamed: 0,Term,Adjusted P-value,Genes,drug
0,Epithelial Mesenchymal Transition,0.001643,SFRP4;POSTN;COL3A1;COL1A2;MMP1;MGP;MMP3,740
1,Angiogenesis,0.009602,POSTN;COL3A1;PRG2,740
2,Allograft Rejection,0.027699,CD7;IL2RG;CD3E;CD3D;ELANE,740
0,Complement,0.000028,CA2;CTSL;LCK;FDX1;PRKCD;PIM1;FYN;CTSS;PIK3CG,752
1,UV Response Up,0.002786,KCNH2;RET;CA2;NTRK3;PRKCD;HSPA2,752
...,...,...,...,...
4,UV Response Up,0.009275,KCNH2;RET;CA2;NTRK3;HSPA2,791785
5,PI3K/AKT/mTOR Signaling,0.012424,LCK;CDK1;RAF1;EGFR,791785
6,Apical Junction,0.014403,KCNH2;SYK;CDH11;MAPK14;EGFR,791785
7,E2F Targets,0.014403,POLE4;WEE1;MELK;CDK1;DCK,791785


In [25]:
results["drug"] = [name[int(i)] for i in results["drug"]]
results

Unnamed: 0,Term,Adjusted P-value,Genes,drug
0,Epithelial Mesenchymal Transition,0.001643,SFRP4;POSTN;COL3A1;COL1A2;MMP1;MGP;MMP3,METHOTREXATE
1,Angiogenesis,0.009602,POSTN;COL3A1;PRG2,METHOTREXATE
2,Allograft Rejection,0.027699,CD7;IL2RG;CD3E;CD3D;ELANE,METHOTREXATE
0,Complement,0.000028,CA2;CTSL;LCK;FDX1;PRKCD;PIM1;FYN;CTSS;PIK3CG,6-THIOGUANINE
1,UV Response Up,0.002786,KCNH2;RET;CA2;NTRK3;PRKCD;HSPA2,6-THIOGUANINE
...,...,...,...,...
4,UV Response Up,0.009275,KCNH2;RET;CA2;NTRK3;HSPA2,Sapacitabine
5,PI3K/AKT/mTOR Signaling,0.012424,LCK;CDK1;RAF1;EGFR,Sapacitabine
6,Apical Junction,0.014403,KCNH2;SYK;CDH11;MAPK14;EGFR,Sapacitabine
7,E2F Targets,0.014403,POLE4;WEE1;MELK;CDK1;DCK,Sapacitabine


# Choose terms which has at least 10 drugs.

In [26]:
df = (
    results.groupby("Term")["drug"]
    .count()
    .reset_index()
    .merge(results.groupby("Term")["Adjusted P-value"].mean().reset_index())
)
df = df.sort_values("drug", ascending=False)
df = df[df["drug"] > 10]