# In [None]:
import os
import pandas as pd
import numpy as np
from ecfp import generate_ecfp6_df
from physico import generate_physico_df
from toxico import generate_toxico_df
from cellline import process_cellline_matrix

# Base folder for all dataset paths. "__file__" is a quoted literal here, not
# the module attribute, so abspath() resolves it against the current working
# directory and dirname() returns that directory.
project_folder = os.path.dirname(os.path.abspath("__file__"))
data_folder = os.path.join(project_folder, "dataset")
# Create the dataset folder up front so later CSV writes cannot fail on a
# missing directory.
os.makedirs(data_folder, exist_ok=True)

# Input files, all expected inside data_folder.
smiles_file  = os.path.join(data_folder, "smiles.csv")
labels_file  = os.path.join(data_folder, "labels.csv")
matrix_path  = os.path.join(data_folder, "matrix.csv")
annot_path   = os.path.join(data_folder, "annotations.csv")
alerts_file  = os.path.join(data_folder, "alert_collection.csv")

# Cell-line identifiers passed to process_cellline_matrix() below.
target_cells = [
    "A2058", "A2780", "A375", "A427", "CAOV3", "COLO320DM",
    "DLD1", "EFM192B", "ES2", "HCT116", "HT144", "HT29",
    "KPL1", "LNCAP", "LOVO", "MDAMB436", "MSTO", "NCIH1650",
    "NCIH2122", "NCIH23", "NCIH460", "NCIH520", "OCUBM", "OV90",
    "OVCAR3", "PA1", "RKO", "RPMI7951", "SKMEL30", "SKMES1",
    "SKOV3", "SW620", "SW837", "T47D", "UACC62", "UWB1289",
    "UWB1289BRCA1", "VCAP", "ZR751",
]

# Build one feature table per project helper module from the input files.
ecfp_df = generate_ecfp6_df(smiles_file)
phys_df = generate_physico_df(smiles_file, labels_file)
toxico_df = generate_toxico_df(smiles_file, alerts_file)
# Called with positional arguments only: this helper does not accept an
# output_file keyword.
expr_df = process_cellline_matrix(matrix_path, annot_path, target_cells)

# Load the labels table and trim stray whitespace from its header names so
# the column-name based merges further down match exactly.
labels = pd.read_csv(labels_file).rename(columns=str.strip)

def make_AB_tables(df, drug_col='Drug'):
    """Return two prefixed copies of *df*, one per slot of a drug pair.

    Every column of the first table is prefixed with ``A_`` and every column
    of the second with ``B_``. The prefixed ``drug_col`` column, when present,
    is then renamed to ``drug_a_name`` / ``drug_b_name`` respectively. The
    input frame itself is left untouched.

    Args:
        df: Source table to duplicate.
        drug_col: Name of the column in *df* that becomes the key column.

    Returns:
        A ``(table_a, table_b)`` tuple of new DataFrames.
    """
    tables = []
    for prefix, key_name in (('A_', 'drug_a_name'), ('B_', 'drug_b_name')):
        prefixed = df.copy().add_prefix(prefix)
        # After prefixing, the key column is called e.g. "A_Drug".
        key_mapping = {f"{prefix}{drug_col}": key_name}
        tables.append(prefixed.rename(columns=key_mapping))
    table_a, table_b = tables
    return table_a, table_b

# Duplicate the drug-keyed tables for the two slots of a drug pair: columns
# get an A_/B_ prefix and the key column becomes drug_a_name / drug_b_name.
ecfp_A, ecfp_B = make_AB_tables(ecfp_df)
# NOTE(review): phys_df is appended positionally below, which is only correct
# if it already has one row per row of `labels`, in the same order - confirm
# against generate_physico_df. Appending the same frame twice also duplicates
# its column names in the output.
phys_A, phys_B = phys_df, phys_df
toxico_A, toxico_B = make_AB_tables(toxico_df)

# Start from the labels table and attach the ECFP features by drug name.
df = labels.copy()
df = df.merge(ecfp_A, on='drug_a_name', how='left')
df = df.merge(ecfp_B, on='drug_b_name', how='left')

# The toxico tables are keyed by drug name just like the ECFP tables, so they
# must be aligned by that key. Concatenating them positionally would pair each
# labels row with an unrelated drug's features and append duplicate
# drug_a_name / drug_b_name columns. validate='many_to_one' raises if a drug
# appears more than once in the toxico table, which guarantees that the
# aligned frames have exactly one row per row of `df`.
toxico_A_rows = (
    df[['drug_a_name']]
    .merge(toxico_A, on='drug_a_name', how='left', validate='many_to_one')
    .drop(columns='drug_a_name')
)
toxico_B_rows = (
    df[['drug_b_name']]
    .merge(toxico_B, on='drug_b_name', how='left', validate='many_to_one')
    .drop(columns='drug_b_name')
)

# Column order is kept: labels + ECFP, physico (A, B), then toxico (A, B).
df = pd.concat([
    df,
    phys_A.reset_index(drop=True),
    phys_B.reset_index(drop=True),
    toxico_A_rows,
    toxico_B_rows,
], axis=1)

# Transpose the expression matrix so each of its columns becomes a row, and
# expose the former column labels as a 'cell_line' key column. rename_axis()
# makes this independent of whether the column axis already carried a name
# (reset_index() only emits 'index' for an unnamed axis).
genes_t = expr_df.T.rename_axis('cell_line').reset_index()
df = df.merge(genes_t, on='cell_line', how='left')

# Write the complete feature table to disk.
output_file = os.path.join(data_folder, "Feature_vectors_dataset.csv")
df.to_csv(output_file, index=False)
print("Shape after concatenation:", df.shape)

# Drop columns that hold at most one distinct value. nunique() ignores NaN by
# default, so a column with a single value plus missing entries is dropped too.
distinct_per_column = df.nunique()
informative = distinct_per_column > 1
df_filtered = df.loc[:, informative]

output_file_zv = os.path.join(
    data_folder, "Feature_vectors_dataset_zeroVarRemoved.csv"
)
df_filtered.to_csv(output_file_zv, index=False)
print("Shape after zero-variance filtering:", df_filtered.shape)




# Recorded cell output:
# TypeError: process_cellline_matrix() got an unexpected keyword argument 'output_file'