In [1]:
import pandas as pd

In [2]:
drugcomb=pd.read_csv(r'drugcomb_clean.csv')

  drugcomb=pd.read_csv(r'drugcomb_clean.csv')


In [3]:
meta = pd.read_csv(r"sample_info.csv")


In [4]:
meta = meta[['cell_line_name', 'primary_disease']]

In [5]:
drugcomb = drugcomb.merge(meta, on='cell_line_name', how='left')

In [6]:
drugcomb.rename(columns={'primary_disease': 'cancer_type'}, inplace=True)

In [7]:
drugcomb['cancer_type'] = drugcomb['cancer_type'].fillna('Not Available')

In [8]:
# Load the drug-to-SMILES mapping
drugmap = pd.read_csv("comprehensive_drug_smiles.csv")  # drug_name and smiles columns

# Merge SMILES for drug_row
drugcomb = drugcomb.merge(
    drugmap[["drug_name", "smiles"]],
    how="left",
    left_on="drug_row",
    right_on="drug_name"
).rename(columns={"smiles": "drug_row_smiles"}).drop(columns=["drug_name"])

# Merge SMILES for drug_col
drugcomb = drugcomb.merge(
    drugmap[["drug_name", "smiles"]],
    how="left",
    left_on="drug_col",
    right_on="drug_name"
).rename(columns={"smiles": "drug_col_smiles"}).drop(columns=["drug_name"])


In [9]:
drugcomb = drugcomb.dropna(subset=["drug_row_smiles", "drug_col_smiles"])

In [10]:
from rdkit import Chem

def check_smiles(smiles):
    return Chem.MolFromSmiles(smiles) is not None

drugcomb = drugcomb[
    drugcomb['drug_row_smiles'].apply(check_smiles) &
    drugcomb['drug_col_smiles'].apply(check_smiles)
]



In [11]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
import warnings

# Suppress RDKit deprecation warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Add tqdm integration with pandas
tqdm.pandas(desc="Generating fingerprints")

# Function to compute Morgan fingerprints
def get_morgan_fp(smiles, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Apply fingerprints with progress bar
drugcomb['drug_row_fp'] = drugcomb['drug_row_smiles'].progress_apply(get_morgan_fp)
drugcomb['drug_col_fp'] = drugcomb['drug_col_smiles'].progress_apply(get_morgan_fp)

print("Fingerprints generated for all drugs!")


Generating fingerprints: 100%|██████████| 561754/561754 [02:08<00:00, 4368.50it/s]
Generating fingerprints: 100%|██████████| 561754/561754 [02:21<00:00, 3976.64it/s]

Fingerprints generated for all drugs!





In [13]:
categorical_cols = []

if "cancer_type" in drugcomb.columns:
    categorical_cols.append("cancer_type")
if "tissue_name" in drugcomb.columns:
    categorical_cols.append("tissue_name")

print("Categorical columns to encode:", categorical_cols)

# Perform one-hot encoding
drugcomb_encoded = pd.get_dummies(drugcomb, columns=categorical_cols, prefix=categorical_cols)

print("After encoding:", drugcomb_encoded.shape)
print("New columns added:")
print([col for col in drugcomb_encoded.columns if any(c in col for c in categorical_cols)])

Categorical columns to encode: ['cancer_type', 'tissue_name']
After encoding: (561754, 54)
New columns added:
['cancer_type_Bladder Cancer', 'cancer_type_Bone Cancer', 'cancer_type_Brain Cancer', 'cancer_type_Breast Cancer', 'cancer_type_Colon/Colorectal Cancer', 'cancer_type_Gastric Cancer', 'cancer_type_Kidney Cancer', 'cancer_type_Leukemia', 'cancer_type_Lung Cancer', 'cancer_type_Lymphoma', 'cancer_type_Myeloma', 'cancer_type_Not Available', 'cancer_type_Ovarian Cancer', 'cancer_type_Pancreatic Cancer', 'cancer_type_Prostate Cancer', 'cancer_type_Sarcoma', 'cancer_type_Skin Cancer', 'tissue_name_BONE', 'tissue_name_BRAIN', 'tissue_name_BREAST', 'tissue_name_ENDOMETRIUM', 'tissue_name_HAEMATOPOIETIC_AND_LYMPHOID', 'tissue_name_KIDNEY', 'tissue_name_LARGE_INTESTINE', 'tissue_name_LIVER', 'tissue_name_LUNG', 'tissue_name_MALARIA', 'tissue_name_OVARY', 'tissue_name_PANCREAS', 'tissue_name_PROSTATE', 'tissue_name_SKIN', 'tissue_name_SOFT_TISSUE', 'tissue_name_STOMACH', 'tissue_name_URIN

In [15]:
import pickle

# Save to pickle
drugcomb_encoded.to_pickle("drugcomb_preprocessed.pkl")

print("Preprocessed data saved as pickle!")


Preprocessed data saved as pickle!
