In [1]:
import pandas as pd

In [2]:
binding_scores = pd.read_csv('drug_gene_matrix.csv')
reg_scores = pd.read_csv('disease_drug_interactions.csv')

In [3]:
dis_gene = {}
for col in reg_scores.columns:  
    if col[:4] == 'DOID':
        values = reg_scores[col].values
        not_nan_values = values[~pd.isna(values)]
        not_nan_index = [i for i, v in enumerate(values) if not pd.isna(v)]
        print(f"{col},Indices of Non-NaN values: {not_nan_index}")
        dis_gene[col] = not_nan_index


DOID:1240,Indices of Non-NaN values: [0, 1, 2]
DOID:10591,Indices of Non-NaN values: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
DOID:5603,Indices of Non-NaN values: [34, 35, 36, 37, 41, 42, 43, 46, 48, 49, 54, 58, 59, 60, 61, 62, 63, 64, 65]
DOID:2801,Indices of Non-NaN values: [23, 44, 47, 50, 52, 53, 55, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77]
DOID:11723,Indices of Non-NaN values: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 45, 51, 56, 57, 78, 79, 80]


In [4]:
rm_cols = []
for col in reg_scores.columns:
    if col not in binding_scores.columns:
        rm_cols.append(col)
print(f"Columns to be removed: {rm_cols}")
reg_scores.drop(columns=rm_cols, inplace=True)

Columns to be removed: ['DOID:1240', 'DOID:10591', 'Etanercept', 'Oprelvekin', 'Interferon beta-1a', 'Cetuximab', 'Interferon beta-1b', 'Natural alpha interferon', 'Interferon gamma-1b', 'DOID:5603', 'DOID:2801', 'Rituximab', 'Casiigly', 'DOID:11723']


In [5]:
df1 = binding_scores.set_index('gene_names')
df2 = reg_scores.set_index('gene_names')
df1.fillna(1, inplace=True)
df2.fillna(0, inplace=True)
product = df1.mul(df2, fill_value=0)
product.reset_index(inplace=True)

In [6]:
product.head()

Unnamed: 0,gene_names,Anastrozole,Abiraterone,Bexarotene,Metformin,Dexamethasone,Cigarette smoke,Harman,Plx4032,Atorvastatin,...,Mln4924,Sangivamycin (nsc 65346),Gamma-tocotrienol,Pioglitazone,Tretinoin,Cycloheximide,Ciprofloxacin,Motexafin gadolinium,Nicotine,Sulforaphane
0,ACTC1,0.0,0.0,0.0,0.0,-13.574736,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ADAM8\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ADAM8\r\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ANKRD37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,APOC1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
import os
os.makedirs('pred_drugs', exist_ok=True)

In [8]:
def sanitize_name(x):
    if isinstance(x, (list, tuple, set)):
        return "_".join(map(str, x))
    if hasattr(x, "tolist"):
        return "_".join(map(str, x.tolist()))
    return str(x).replace(":", "_").replace(" ", "_")


In [11]:
dis_drug = {}

for disease, indices in dis_gene.items():
    disease_name = sanitize_name(disease)
    print(disease_name)
    # IMPORTANT: use loc if indices are gene names
    sub = product.loc[indices, product.columns[1:]]

    if sub.empty:
        print(f"⚠️ No data for {disease_name}")
        continue

    s = sub.sum(axis=0).dropna()
    s_nonzero = s[s != 0].sort_values()

    out_path = os.path.join('pred_drugs', f"{disease_name}_drugs.txt")
    
    print(s_nonzero)
    with open(out_path, "w") as f:
        f.write("Drug\tScore\n")
        for drug, score in s_nonzero.items():
            f.write(f"{drug}\t{score}\n")

    dis_drug[disease_name] = s_nonzero.to_dict()


DOID_1240
Propofol         -724.005339
Sevoflurane      -632.277981
Hydrocortisone   -147.242160
Resveratrol       -88.583586
Testosterone      -65.390780
Paclitaxel        -44.560100
Dexamethasone     -13.574736
dtype: float64
DOID_10591
Vanadium pentoxide                           -20544.353904
2,3,7,8-tetrachlorodibenzo-p-dioxin (tcdd)   -19687.230885
Mercury                                      -18200.881875
Metformin                                    -14854.391564
Arsenic                                      -10362.555122
                                                  ...     
Y15                                              -4.792294
Abiraterone                                      -3.394200
Fenretinide                                      -2.794644
Testosterone                                     -2.422769
Fluticasone                                      -0.728695
Length: 82, dtype: float64
DOID_5603
2,3,7,8-tetrachlorodibenzo-p-dioxin (tcdd)   -80971.046982
Mercury         