In [1]:
import pandas as pd
import os
data_folder = r"C:\DeepSynergy\dataset"
labels_file = os.path.join(data_folder, "labels.csv")
ecfp_file   = os.path.join(data_folder, "ecfp.csv")
phys_file   = os.path.join(data_folder, "physio.csv")
toxico_file = os.path.join(data_folder, "toxico.csv")
gene_file   = os.path.join(data_folder, "cellline_features.csv")
labels = pd.read_csv(labels_file)
ecfp   = pd.read_csv(ecfp_file)
phys   = pd.read_csv(phys_file)
toxico = pd.read_csv(toxico_file)
genes  = pd.read_csv(gene_file, index_col=0)
labels.columns = labels.columns.str.strip()
ecfp.columns   = ecfp.columns.str.strip()
phys.columns   = phys.columns.str.strip()
toxico.columns = toxico.columns.str.strip()
genes.columns  = genes.columns.str.strip()
if labels.columns[0].startswith("Unnamed") or labels.columns[0] == "":
    labels = labels.rename(columns={labels.columns[0]: "pair_id"})

def make_AB_tables(df, drug_col='Drug'):
    A = df.copy()
    B = df.copy()
    A = A.add_prefix('A_')
    B = B.add_prefix('B_')
    A = A.rename(columns={'A_' + drug_col: 'drug_a_name'})
    B = B.rename(columns={'B_' + drug_col: 'drug_b_name'})
    return A, B

ecfp_A, ecfp_B = make_AB_tables(ecfp)

phys_A = phys.add_prefix('A_')
phys_B = phys.add_prefix('B_')

toxico_A = toxico.add_prefix('A_')
toxico_B = toxico.add_prefix('B_')

df = labels.copy()

df = df.merge(ecfp_A, on='drug_a_name', how='left')
df = df.merge(ecfp_B, on='drug_b_name', how='left')
df = pd.concat([df, phys_A.reset_index(drop=True), phys_B.reset_index(drop=True),
                toxico_A.reset_index(drop=True), toxico_B.reset_index(drop=True)], axis=1)
genes_t = genes.T.reset_index().rename(columns={'index': 'cell_line'})
df = df.merge(genes_t, on='cell_line', how='left')
output_file = os.path.join(data_folder, "Feature_vectors_dataset.csv")
df.to_csv(output_file, index=False)

print("Saved:", output_file)
print("Final shape:", df.shape)


Saved: C:\DeepSynergy\dataset\Feature_vectors_dataset.csv
Final shape: (23052, 11072)
