# Phase 1: Data Acquisition and Preprocessing

We'll build the code for Phase 1: Data Acquisition and Preprocessing from scratch, focusing on using the ChEMBL database and preparing the core Drug-Target pairs (SMILES and Sequence) for deep learning.

This script uses the official chembl_webresource_client library for data acquisition, pandas for cleaning, RDKit for SMILES standardization, and scikit-learn for the train/validation/test split.

You'll need these Python libraries installed:


In [None]:
pip install chembl_webresource_client pandas numpy scikit-learn rdkit

### Step 1.1: Data Sourcing & Collection (ChEMBL Query)

We'll define a function to query ChEMBL for a specific target protein (e.g., a kinase, a common drug target class) and filter the results for high-quality binding data.

In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
def fetch_chembl_data(target_chembl_id, pchembl_threshold=6.0, assay_type='B'):
    """
    Fetches DTI data for a specific ChEMBL target ID, filtering by pChEMBL value.

    Only activities with ``pchembl_value >= pchembl_threshold`` are requested,
    so the returned frame contains no weaker binders. Pass a lower threshold if
    a downstream step needs negative (inactive) examples.

    Args:
        target_chembl_id (str): The CHEMBL ID of the target protein (e.g., 'CHEMBL203').
        pchembl_threshold (float): Minimum pChEMBL value (e.g., 6.0 = 1 uM affinity).
        assay_type (str): Type of assay to filter for ('B' for binding assays).
    Returns:
        pd.DataFrame: A DataFrame of filtered bioactivities. It always contains
        the columns 'molecule_chembl_id', 'canonical_smiles',
        'target_chembl_id', 'pchembl_value' (numeric), 'standard_type' and
        'target_accession', even when the query returns no rows.
    Raises:
        ValueError: If the target has no components or its first component has
        no accession.
    """
    print(f"1. Fetching data for Target ID: {target_chembl_id}...")

    # Initialize ChEMBL API clients
    activity_client = new_client.activity
    target_client = new_client.target

    # 1. Get the protein accession used later for sequence retrieval
    target = target_client.get(target_chembl_id)
    components = target.get('target_components') if target else None
    if not components:
        raise ValueError(f"Target {target_chembl_id} has no components/accessions.")
    # Assuming a single-protein target: only the first component is used.
    accession = components[0].get('accession')
    if not accession:
        raise ValueError(f"Target {target_chembl_id} has no components/accessions.")

    # 2. Query for bioactivities (DTI pairs)
    fields = [
        'molecule_chembl_id', 'canonical_smiles', 'target_chembl_id',
        'pchembl_value', 'standard_type',
    ]
    activities = activity_client.filter(
        target_chembl_id=target_chembl_id,
        assay_type=assay_type,  # Binding assays
        pchembl_value__gte=pchembl_threshold,  # Filter for high affinity
    ).only(fields)

    df = pd.DataFrame(list(activities))

    # An empty result would otherwise yield a frame with no columns at all,
    # which makes every downstream column lookup fail with a KeyError.
    missing = [col for col in fields if col not in df.columns]
    if missing:
        df = df.reindex(columns=[*df.columns, *missing])

    # The ChEMBL REST API serialises pchembl_value as a string; convert it so
    # numeric comparisons work. Unparseable values become NaN.
    df['pchembl_value'] = pd.to_numeric(df['pchembl_value'], errors='coerce')

    df['target_accession'] = accession  # Store the protein's UniProt accession
    print(f"   -> Found {len(df)} bioactivity entries.")
    return df
# --- Example Usage ---
# Fetch binding data for one kinase target (a common drug target class).
# NOTE(review): CHEMBL203 appears to be ChEMBL's ID for EGFR (epidermal growth
# factor receptor), not the B-Raf proto-oncogene kinase (believed to be
# CHEMBL5145) -- confirm which target is intended before relying on the name.
BRaf_CHEMBL_ID = 'CHEMBL203' 
raw_df = fetch_chembl_data(BRaf_CHEMBL_ID, pchembl_threshold=6.0)


### Step 1.2: Data Cleaning & Filtering
We clean the DataFrame, handle missing values, standardize drug representations, and retrieve the full protein sequence.

In [None]:
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
import numpy as np

# Columns of the frame returned by clean_and_prepare_data, in order.
_CLEANED_COLUMNS = ['standard_smiles', 'target_sequence', 'pchembl_value', 'label']


def _standardize_smiles(smi, normalizer):
    """Return a canonical, non-isomeric SMILES for *smi*, or None if RDKit cannot process it."""
    try:
        # 1. Convert to RDKit mol object
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return None
        # 2. Apply RDKit's normalization transforms
        clean_mol = normalizer.normalize(mol)
        # 3. Return canonical SMILES (stereochemistry is dropped)
        return Chem.MolToSmiles(clean_mol, canonical=True, isomericSmiles=False)
    except Exception:
        # Treat any RDKit failure as "unparseable", but let KeyboardInterrupt
        # and SystemExit propagate (a bare `except:` would swallow them).
        return None


def _fetch_target_sequence(accession):
    """Return the protein sequence ChEMBL stores for a UniProt accession.

    Raises LookupError when no component with a sequence is found.
    """
    # The sequence lives on ChEMBL's `target_component` resource.
    for component in new_client.target_component.filter(accession=accession):
        sequence = component.get('sequence')
        if sequence:
            return sequence
    raise LookupError(f"no sequence found for accession {accession}")


def clean_and_prepare_data(df, classification_threshold=6.0):
    """
    Cleans the raw DTI data and retrieves necessary biological sequences.

    The input frame is not modified. All rows are assumed to belong to one
    target: the accession of the first row is used for the sequence lookup.

    Args:
        df (pd.DataFrame): Raw activities with the columns 'canonical_smiles',
            'pchembl_value', 'target_chembl_id' and 'target_accession'.
        classification_threshold (float): pChEMBL value at or above which a
            pair is labelled 1 (active); below it the label is 0.
    Returns:
        pd.DataFrame: Columns 'standard_smiles', 'target_sequence',
        'pchembl_value' and 'label'. The frame is empty when no rows survive
        cleaning or the protein sequence cannot be retrieved.
    """
    print("2. Cleaning and Preparing Data...")

    if df.empty:
        print(f"   -> Cleaned entries: {len(df)} unique DTI pairs remaining.")
        return pd.DataFrame(columns=_CLEANED_COLUMNS)

    # --- Data Cleaning ---

    # 2.1 Remove entries with missing SMILES or pChEMBL values.
    # ChEMBL delivers pchembl_value as a string, so coerce it to a number
    # first; unparseable values become NaN and are dropped with the rest.
    df = df.assign(pchembl_value=pd.to_numeric(df['pchembl_value'], errors='coerce'))
    df = (
        df.dropna(subset=['canonical_smiles', 'pchembl_value'])
        .drop_duplicates(subset=['canonical_smiles', 'target_chembl_id'])
        .reset_index(drop=True)
    )
    print(f"   -> Cleaned entries: {len(df)} unique DTI pairs remaining.")
    if df.empty:
        return pd.DataFrame(columns=_CLEANED_COLUMNS)

    # 2.2 Standardize SMILES (Critical for consistent graph generation)
    normalizer = rdMolStandardize.Normalizer()
    standardized = df['canonical_smiles'].apply(
        lambda smi: _standardize_smiles(smi, normalizer)
    )
    df = df.assign(standard_smiles=standardized)
    # Different input SMILES can collapse to the same standardized string
    # (e.g. stereoisomers), so de-duplicate again; the first measurement wins.
    df = (
        df.dropna(subset=['standard_smiles'])
        .drop_duplicates(subset=['standard_smiles', 'target_chembl_id'])
        .reset_index(drop=True)
    )
    print(f"   -> After SMILES standardization: {len(df)} pairs remaining.")
    if df.empty:
        return pd.DataFrame(columns=_CLEANED_COLUMNS)

    # --- Sequence Retrieval ---

    # 2.3 Retrieve Protein Sequence (Using the accession from the fetch step)
    accession = df['target_accession'].iloc[0]
    try:
        sequence = _fetch_target_sequence(accession)
    except Exception as e:
        # Best effort: report the failure and return no usable pairs.
        print(f"Error retrieving sequence for {accession}: {e}")
        return pd.DataFrame(columns=_CLEANED_COLUMNS)
    df = df.assign(target_sequence=sequence)
    print(f"   -> Retrieved protein sequence of length: {len(sequence)}")

    # --- Create Target Variable ---

    # 2.4 Create Binary Label (DTI Classification: 1=Interaction, 0=Non-Interaction)
    # pChEMBL >= threshold is 'Active' (1), anything below is 'Inactive' (0).
    # NOTE: To get a robust 'Non-Interaction' set, you would typically need to
    # query lower affinity or random pairs, but this serves as a good initial filter.
    labels = (df['pchembl_value'] >= classification_threshold).astype(int)
    df = df.assign(label=labels)

    return df[_CLEANED_COLUMNS]

# Clean a copy so raw_df stays available unchanged for re-running this cell.
cleaned_df = clean_and_prepare_data(raw_df.copy())


### Step 1.3: Data Splitting
The final step of Phase 1 is to split the prepared data into training, validation, and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

def _validation_count(val_size, test_size, n_total):
    """Translate *val_size* into an absolute number of validation rows.

    A float is read as a fraction of the whole dataset and converted to a row
    count; any other value (e.g. an int row count) is returned unchanged.
    Raises ValueError when the two fractions leave no rows for training.
    """
    if not isinstance(val_size, float):
        return val_size

    if isinstance(test_size, float):
        test_fraction = test_size
    elif isinstance(test_size, int):
        test_fraction = test_size / n_total
    else:
        test_fraction = None  # let train_test_split validate unusual values

    if test_fraction is not None and test_fraction + val_size >= 1.0:
        raise ValueError(
            f"test_size ({test_size}) and val_size ({val_size}) leave no data for training."
        )
    # An absolute count avoids float noise: 0.1 / 0.9 * 945 is fractionally
    # above 105, and train_test_split rounds a fractional test size up.
    return round(val_size * n_total)


def split_data(df, test_size=0.1, val_size=0.1, random_state=42):
    """
    Splits the cleaned DTI DataFrame into Train, Validation, and Test sets.

    Both splits are stratified on the 'label' column.

    Args:
        df (pd.DataFrame): Cleaned data containing a 'label' column.
        test_size (float | int): Fraction of all rows (float) or absolute
            number of rows (int) for the test set.
        val_size (float | int): Fraction of all rows (float) or absolute
            number of rows (int) for the validation set. A fraction refers to
            the full dataset, not to what remains after the test split.
        random_state (int): Seed passed to both splits for reproducibility.
    Returns:
        tuple: (train_df, val_df, test_df).
    Raises:
        ValueError: If df is empty, or if test_size and val_size together
        leave no rows for training.
    """
    print("3. Splitting Data into Train, Validation, and Test Sets...")

    n_total = len(df)
    if n_total == 0:
        raise ValueError("Cannot split an empty DataFrame.")
    # Resolve the validation size up front so bad arguments fail before any split.
    val_count = _validation_count(val_size, test_size, n_total)

    # 1. Initial split for Test set (e.g., 10%)
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label'],  # Stratify to maintain label balance across splits
    )

    # 2. Split the remaining data into Train and Validation sets.
    # Passing val_size directly would take that share of the *remainder*
    # (e.g. 10% of 90% = 9% of the total), so the precomputed count is used.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_count,
        random_state=random_state,
        stratify=train_val_df['label'],
    )

    print(f"   -> Total Pairs: {len(df)}")
    print(f"   -> Training Set Size: {len(train_df)}")
    print(f"   -> Validation Set Size: {len(val_df)}")
    print(f"   -> Test Set Size: {len(test_df)}")

    return train_df, val_df, test_df

# --- Final Execution ---
# Split with split_data's defaults (test_size=0.1, val_size=0.1, random_state=42)
# and print the first rows of the training set as a quick sanity check.
train_data, val_data, test_data = split_data(cleaned_df)
print("\n--- Phase 1 Summary ---")
print("Training Data Head:")
print(train_data.head())


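#### Output Example (illustrative only: counts, sequence, and values depend on the current ChEMBL release; rows with `label` 0 appear only if `fetch_chembl_data` is called with a `pchembl_threshold` below 6.0):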

In [None]:
1. Fetching data for Target ID: CHEMBL203...
   -> Found 1234 bioactivity entries.
2. Cleaning and Preparing Data...
   -> Cleaned entries: 1100 unique DTI pairs remaining.
   -> After SMILES standardization: 1050 pairs remaining.
   -> Retrieved protein sequence of length: 727
   -> Cleaned entries: 1050 unique DTI pairs remaining.
3. Splitting Data into Train, Validation, and Test Sets...
   -> Total Pairs: 1050
   -> Training Set Size: 840
   -> Validation Set Size: 105
   -> Test Set Size: 105

--- Phase 1 Summary ---
Training Data Head:
     standard_smiles target_sequence  pchembl_value  label
359  CCn1c(C)nc2c(N)ncc(=O)n12 MKTWETLLV...      7.580000      1
771  O=C(NCc1cc(F)cc(F)c1)c1nc2cc(N)ccc2s1 MKTWETLLV...      6.520000      1
593  COc1cc(C(=O)N2CCOCC2)ccc1N MKTWETLLV...      5.230000      0
243  CNC(=O)c1ccc(-c2cc(n3c(C)nn3C)nc3c2cccc3)cc1 MKTWETLLV...      8.000000      1
194  CC(C)(C)c1ccc(C(=O)N(C)c2ccc(C)cc2)c(C)c1 MKTWETLLV...      5.950000      0
