In [2]:
import os
import pandas as pd
import numpy as np
import pickle, gzip
from ecfp import generate_ecfp6_df
from physico import generate_physico_df
from toxico import generate_toxico_df
from cellline import process_cellline_matrix
# NOTE: "__file__" is a string literal here, not the module attribute (which is
# undefined in a notebook). abspath() resolves it against the current working
# directory, so project_folder is the directory the kernel was started in.
project_folder = os.path.dirname(os.path.abspath("__file__"))
data_folder = os.path.join(project_folder, "dataset")
os.makedirs(data_folder, exist_ok=True)

# Input CSVs, all expected inside data_folder.
smiles_file, labels_file, matrix_path, annot_path, alerts_file = (
    os.path.join(data_folder, csv_name)
    for csv_name in (
        "smiles.csv",
        "labels.csv",
        "matrix.csv",
        "annotations.csv",
        "alert_collection.csv",
    )
)

# Cell lines passed to process_cellline_matrix (39 entries).
target_cells = [
    "A2058", "A2780", "A375", "A427", "CAOV3", "COLO320DM",
    "DLD1", "EFM192B", "ES2", "HCT116", "HT144", "HT29",
    "KPL1", "LNCAP", "LOVO", "MDAMB436", "MSTO", "NCIH1650",
    "NCIH2122", "NCIH23", "NCIH460", "NCIH520", "OCUBM", "OV90",
    "OVCAR3", "PA1", "RKO", "RPMI7951", "SKMEL30", "SKMES1",
    "SKOV3", "SW620", "SW837", "T47D", "UACC62", "UWB1289",
    "UWB1289BRCA1", "VCAP", "ZR751",
]

In [3]:
# Build each feature table with the project helpers imported at the top of the notebook.
ecfp_df   = generate_ecfp6_df(smiles_file)
phys_df   = generate_physico_df(smiles_file, labels_file)
toxico_df = generate_toxico_df(smiles_file, alerts_file)
expr_df   = process_cellline_matrix(matrix_path, annot_path, target_cells)

# Label table; header whitespace is stripped so column lookups by name work.
labels = pd.read_csv(labels_file)
labels.columns = labels.columns.str.strip()

# If the toxicophore table has "Drug" as its last column, move it to the front.
if toxico_df.columns[-1] == "Drug":
    toxico_df = toxico_df[["Drug"] + [c for c in toxico_df.columns if c != "Drug"]]

# Average columns that share the same label so each column name appears once.
# The original line was mis-indented and referenced an undefined `expr_39`;
# expr_df is the matrix built for the 39 target cell lines. The transpose form
# replaces the deprecated `groupby(..., axis=1)`.
expr_df = expr_df.T.groupby(level=0).mean().T


In [4]:
def make_AB(df, drug_col="Drug", prefix=""):
    """Build the drug-A and drug-B views of a per-drug feature table.

    Every column is renamed to ``A_<prefix><col>`` (resp. ``B_<prefix><col>``),
    except the drug identifier column, which becomes ``drug_a_name``
    (resp. ``drug_b_name``).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``drug_col``. It is not modified.
    drug_col : str, default "Drug"
        Name of the column holding the drug identifier.
    prefix : str, default ""
        Feature-family tag inserted after the ``A_`` / ``B_`` side marker.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The A-side and B-side frames, in that order.

    Raises
    ------
    KeyError
        If ``drug_col`` is not a column of ``df``. Without this check the
        rename would silently do nothing and no key column would be produced.
    """
    if drug_col not in df.columns:
        raise KeyError(f"Column {drug_col!r} not found in feature table")

    views = []
    for side, key_name in (("A", "drug_a_name"), ("B", "drug_b_name")):
        side_prefix = f"{side}_{prefix}"
        # add_prefix returns a new frame, so no explicit copy is needed.
        view = df.add_prefix(side_prefix)
        view = view.rename(columns={f"{side_prefix}{drug_col}": key_name})
        views.append(view)
    return views[0], views[1]
# A/B-side views of the fingerprint and toxicophore tables; both tables use
# the default "Drug" identifier column.
ecfp_A, ecfp_B = make_AB(ecfp_df, prefix="ECFP_")
tox_A, tox_B = make_AB(toxico_df, prefix="TOX_")


In [5]:
# Start from the label table and attach per-drug features for both drugs.
# validate="many_to_one" makes pandas raise if a drug name is duplicated in a
# feature table, which would otherwise silently multiply label rows.
df = labels.copy()
df = df.merge(ecfp_A, on="drug_a_name", how="left", validate="many_to_one")
df = df.merge(ecfp_B, on="drug_b_name", how="left", validate="many_to_one")

# The physico-chemical table has no join key here: it is attached by row
# position, so it must have exactly one row per label row.
# NOTE(review): this also assumes phys_df rows are in labels order — confirm
# against generate_physico_df.
phys_df = phys_df.reset_index(drop=True)
if len(phys_df) != len(df):
    raise ValueError(
        f"phys_df has {len(phys_df)} rows but the label table has {len(df)}; "
        "cannot align them by position"
    )
df = pd.concat([df, phys_df], axis=1)

df = df.merge(tox_A, on="drug_a_name", how="left", validate="many_to_one")
df = df.merge(tox_B, on="drug_b_name", how="left", validate="many_to_one")
print("Drug features merged")

Drug features merged


In [6]:
# Transpose the expression matrix so each row is one column label of expr_df,
# exposed as a "cell_line" key column. rename_axis names the key explicitly,
# so this works whether or not expr_df's column axis already has a name.
genes_t = expr_df.T.rename_axis("cell_line").reset_index()
gene_cols = [c for c in genes_t.columns if c != "cell_line"]
genes_t[gene_cols] = genes_t[gene_cols].astype("float32")

# Keep only the top_n genes with the highest variance across cell lines.
top_n = 4000
gene_variances = genes_t[gene_cols].var(axis=0)
top_genes = gene_variances.sort_values(ascending=False).head(top_n).index
genes_t_top = genes_t[["cell_line"] + top_genes.tolist()]

# One left merge attaches the gene features to every sample row.
# The previous version merged the whole of df against 500-row chunks of
# genes_t_top and concatenated the results; with more than 500 cell lines that
# repeats every sample once per chunk, with NaN genes in the non-matching copies.
df = df.merge(genes_t_top, on="cell_line", how="left")
print("Gene features merged.")
print("Current shape:", df.shape)

Gene features merged.
Current shape: (23052, 10213)


In [7]:
# Write the complete feature table as a gzip-compressed pickle.
out_pickle = os.path.join(data_folder, "Feature_vectors_dataset.p.gz")
with gzip.open(out_pickle, "wb") as full_handle:
    pickle.dump(df, full_handle)
print("Saved:", out_pickle)

# Keep only columns holding more than one distinct value
# (DataFrame.nunique ignores NaN by default), then write that subset too.
has_multiple_values = df.nunique() > 1
df_zv = df.loc[:, has_multiple_values]
out_zv_pickle = os.path.join(data_folder, "Feature_vectors_dataset_zeroVarRemoved.p.gz")
with gzip.open(out_zv_pickle, "wb") as zv_handle:
    pickle.dump(df_zv, zv_handle)
print("Saved:", out_zv_pickle)

Saved: /mnt/c/Users/mayak/DeepSyn/feature_vector/dataset/Feature_vectors_dataset.p.gz
Saved: /mnt/c/Users/mayak/DeepSyn/feature_vector/dataset/Feature_vectors_dataset_zeroVarRemoved.p.gz


In [8]:
import gzip
import pickle
# Reuse the paths the export cell just wrote instead of machine-specific
# absolute paths, so the check runs wherever the notebook runs.
file_before_zv = out_pickle
file_after_zv  = out_zv_pickle
def get_pickle_shape(file_path):
    """Return the ``.shape`` of the object stored in a gzip-compressed pickle.

    The whole object is unpickled to read its shape. Only use this on pickle
    files you trust, since unpickling can execute arbitrary code.
    """
    with gzip.open(file_path, "rb") as handle:
        return pickle.load(handle).shape

# Load both exports back and compare their dimensions.
shape_before, shape_after = (
    get_pickle_shape(pickle_path)
    for pickle_path in (file_before_zv, file_after_zv)
)

print("Shape before zero-variance filtering:", shape_before)
print("Shape after zero-variance filtering:", shape_after)


Shape before zero-variance filtering: (23052, 10213)
Shape after zero-variance filtering: (23052, 7094)
