In [16]:
import json
import pickle
import numpy as np
from collections import OrderedDict

def create_affinity_csv(
    dataset_path: str,
    output_csv: str,
    use_train_test_split: bool = False,
    fold_type: str = 'train',  # or 'test'
    dataset_name: str = 'davis'  
):
    # Load ligands and proteins with order preserved
    ligands = json.load(open(f"{dataset_path}/ligands_can.txt"), object_pairs_hook=OrderedDict)
    drugs = list(ligands.values())
    drug_ids = list(ligands.keys())

    proteins = json.load(open(f"{dataset_path}/proteins.txt"), object_pairs_hook=OrderedDict)
    targets = list(proteins.values())
    target_ids = list(proteins.keys())

    # Load affinity matrix
    affinity = pickle.load(open(f"{dataset_path}/Y", "rb"), encoding='latin1')
    if dataset_name.lower() == 'davis':
        print(affinity)
        affinity = -np.log10(np.array(affinity) / 1e9)
        print(affinity)
    else:
        affinity = np.array(affinity)

    # Identify all non-NaN pairs
    all_rows, all_cols = np.where(~np.isnan(affinity))

    if use_train_test_split:
        if fold_type == 'train':
            folds = json.load(open(f"{dataset_path}/folds/train_fold_setting1.txt"))
        elif fold_type == 'test':
            folds = json.load(open(f"{dataset_path}/folds/test_fold_setting1.txt"))
        else:
            raise ValueError("fold_type must be 'train' or 'test'")

        if any(isinstance(el, list) for el in folds):
            folds = [idx for sublist in folds for idx in sublist]

        rows, cols = all_rows[folds], all_cols[folds]
    else:
        rows, cols = all_rows, all_cols

    # Write CSV
    with open(output_csv, 'w') as f:
        f.write("drug_id,compound_iso_smiles,protein_id,target_sequence,affinity\n")
        for i in range(len(rows)):
            row = rows[i]
            col = cols[i]
            smi = drugs[row]
            seq = targets[col]
            aff = affinity[row, col]
            drug_id = drug_ids[row]
            target_id = target_ids[col]
            f.write(f"{drug_id},{smi},{target_id},{seq},{aff}\n")

    print(f"CSV written to {output_csv}")


# Example usage
create_affinity_csv(
    dataset_path="davis",
    output_csv="davis_nonGraph_train.csv",
    use_train_test_split=True,  # set to True if you want only train or test data
    fold_type="train",
    dataset_name="davis"
)


[[4.3e+01 1.0e+04 1.0e+04 ... 2.3e+02 1.0e+04 1.0e+04]
 [1.0e+04 1.0e+04 1.0e+04 ... 2.0e+03 1.0e+04 1.0e+04]
 [1.0e+04 7.5e+01 1.9e+00 ... 1.2e+02 2.3e+00 1.0e+04]
 ...
 [1.0e+04 1.3e+01 7.7e+02 ... 9.8e+02 5.1e+03 1.0e+04]
 [6.3e+01 6.3e+01 6.9e+03 ... 5.2e+00 1.0e+04 3.5e+03]
 [1.0e+04 1.0e+04 1.0e+04 ... 1.9e+03 4.4e+03 1.0e+04]]
[[7.36653154 5.         5.         ... 6.63827216 5.         5.        ]
 [5.         5.         5.         ... 5.69897    5.         5.        ]
 [5.         7.12493874 8.7212464  ... 6.92081875 8.63827216 5.        ]
 ...
 [5.         7.88605665 6.11350927 ... 6.00877392 5.29242982 5.        ]
 [7.20065945 7.20065945 5.16115091 ... 8.28399666 5.         5.45593196]
 [5.         5.         5.         ... 5.7212464  5.35654732 5.        ]]
CSV written to davis_nonGraph_train.csv


In [12]:
import csv
import json
from collections import OrderedDict

def create_affinity_csv_from_pharos_csv(
    pharos_csv: str,
    dataset_path: str,
    output_csv: str
):
    # Load ligands and proteins with order preserved
    ligands = json.load(open(f"{dataset_path}/ligands_can.txt"), object_pairs_hook=OrderedDict)
    proteins = json.load(open(f"{dataset_path}/proteins.txt"), object_pairs_hook=OrderedDict)

    # Create reverse lookup: SMILES → ID, sequence → ID
    smiles_to_id = {v: k for k, v in ligands.items()}
    sequence_to_id = {v: k for k, v in proteins.items()}

    matched = 0
    unmatched = 0

    with open(pharos_csv, newline='') as infile, open(output_csv, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.writer(outfile)
        writer.writerow(["drug_id", "compound_iso_smiles", "protein_id", "target_sequence", "affinity"])

        for row in reader:
            smi = row["compound_iso_smiles"]
            seq = row["target_sequence"]
            aff = row["affinity"]

            drug_id = smiles_to_id.get(smi)
            target_id = sequence_to_id.get(seq)

            if drug_id is not None and target_id is not None:
                writer.writerow([drug_id, smi, target_id, seq, aff])
                matched += 1
            else:
                unmatched += 1

    print(f"CSV written to {output_csv}")
    print(f"Matched: {matched} pairs, Unmatched: {unmatched} rows")

# Example usage:
create_affinity_csv_from_pharos_csv(
    pharos_csv="../pharos.csv",
    dataset_path="pharos",
    output_csv="pharos_with_ids.csv"
)
0

CSV written to pharos_with_ids.csv
Matched: 495 pairs, Unmatched: 0 rows


0

In [24]:
from rdkit import Chem
import csv
import json
from collections import OrderedDict

def canonicalize_smiles(smiles: str) -> str:
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, canonical=True) if mol else None

def create_affinity_csv(input_csv: str, dataset_path: str, output_csv: str):
    ligands = json.load(open(f"{dataset_path}/ligands_can.txt"), object_pairs_hook=OrderedDict)
    proteins = json.load(open(f"{dataset_path}/proteins.txt"), object_pairs_hook=OrderedDict)

    # Use canonical SMILES for lookup keys
    smiles_to_id = {canonicalize_smiles(v): k for k, v in ligands.items()}
    sequence_to_id = {v.strip(): k for k, v in proteins.items()}

    matched = 0
    unmatched = 0

    with open(input_csv, newline='', encoding='utf-8') as infile, open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.writer(outfile)
        writer.writerow(["drug_id", "compound_iso_smiles", "protein_id", "target_sequence", "affinity"])

        for row in reader:
            smi_raw = row["compound_iso_smiles"].strip()
            seq = row["target_sequence"].strip()
            aff = row.get("affinity", "").strip()

            smi = canonicalize_smiles(smi_raw)
            drug_id = smiles_to_id.get(smi)
            target_id = sequence_to_id.get(seq)

            if drug_id is not None and target_id is not None:
                writer.writerow([drug_id, smi_raw, target_id, seq, aff])
                matched += 1
            else:
                unmatched += 1
                print(f"Unmatched row: Canonical SMILES={smi}, Seq={seq}")

    print(f"CSV written to {output_csv}")
    print(f"Matched: {matched} pairs, Unmatched: {unmatched} rows")


# Example usage:
create_affinity_csv(
    input_csv="../davis_blinding/data/davis_b_test.csv",
    dataset_path="davis",
    output_csv="davis_b_test_with_ids.csv"
)

CSV written to davis_b_test_with_ids.csv
Matched: 68 pairs, Unmatched: 0 rows
