# 🧪 Ligand-Based Drug Discovery for HIV Reverse Transcriptase
This notebook shows how to collect bioactivity data from ChEMBL, preprocess it, and build a machine learning model to predict active compounds.

In [None]:
!pip install chembl_webresource_client pandas scikit-learn rdkit-pypi



In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [None]:
# Search for HIV Reverse Transcriptase
selected_target = 'CHEMBL284'

In [16]:
# Download activity data
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')
df = pd.DataFrame(res)
df.to_csv('hiv_bioactivity_data.csv', index=False)
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,105501,[],CHEMBL666573,In vitro inhibition of human Dipeptidylpeptida...,B,,,BAO_0000190,...,Homo sapiens,Dipeptidyl peptidase IV,9606,,,IC50,uM,UO_0000065,,217.0
1,,,106644,[],CHEMBL666573,In vitro inhibition of human Dipeptidylpeptida...,B,,,BAO_0000190,...,Homo sapiens,Dipeptidyl peptidase IV,9606,,,IC50,uM,UO_0000065,,41.0
2,,,106647,[],CHEMBL666573,In vitro inhibition of human Dipeptidylpeptida...,B,,,BAO_0000190,...,Homo sapiens,Dipeptidyl peptidase IV,9606,,,IC50,uM,UO_0000065,,15.0
3,,,106650,[],CHEMBL666573,In vitro inhibition of human Dipeptidylpeptida...,B,,,BAO_0000190,...,Homo sapiens,Dipeptidyl peptidase IV,9606,,,IC50,uM,UO_0000065,,500.0
4,,,108924,[],CHEMBL666573,In vitro inhibition of human Dipeptidylpeptida...,B,,,BAO_0000190,...,Homo sapiens,Dipeptidyl peptidase IV,9606,,,IC50,uM,UO_0000065,,188.0


In [17]:
# Preprocess
df = df[df.standard_value.notna()]
df = df[df.canonical_smiles.notna()]
df = df[['canonical_smiles', 'standard_value']]
df = df.drop_duplicates()
df['standard_value'] = df['standard_value'].astype(float)
df['bioactivity_class'] = df['standard_value'].apply(lambda x: 1 if x < 1000 else 0)
df.to_csv('hiv_bioactivity_clean.csv', index=False)
df.head()

Unnamed: 0,canonical_smiles,standard_value,bioactivity_class
0,N[C@@H](CC1CCCCC1)C(=O)N1CCCCC1,217000.0,0
1,C[C@H](N)C(=O)N1CCCC1,41000.0,0
2,O=C([C@@H]1CCCN1)N1CCCC1,15000.0,0
3,S=C(C1CCCN1)N1CCCC1,500000.0,0
4,NC(=O)CC(N)C(=O)N1CCCC1,188000.0,0


In [18]:
# Feature extraction
from rdkit import Chem
from rdkit.Chem import Descriptors

def calc_features(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol)
    ]

features = df['canonical_smiles'].apply(calc_features)
X = pd.DataFrame(features.tolist(), columns=['MolWt', 'LogP', 'HDonors', 'HAcceptors'])
y = df['bioactivity_class']

In [19]:
# Train Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.775178026449644
              precision    recall  f1-score   support

           0       0.69      0.50      0.58       303
           1       0.80      0.90      0.85       680

    accuracy                           0.78       983
   macro avg       0.74      0.70      0.71       983
weighted avg       0.77      0.78      0.76       983



In [20]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# 🔁 Step 1: Give your test SMILES here
new_smiles_list = [
    "CC1=CN(C(=O)NC1=O)CO",  # Zidovudine
    "COC1=CC2=C(C=C1)N=C(N=C2NC3=CC=C(C=C3)F)OC",  # Gefitinib
    "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin
]

# 🧪 Step 2: Extract descriptors for each
new_features = []
for smi in new_smiles_list:
    mol = Chem.MolFromSmiles(smi)
    if mol:
        feats = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol)
        ]
        new_features.append(feats)
    else:
        new_features.append([None, None, None, None])  # Invalid SMILES

# ❓ Step 3: Predict activity
new_X = pd.DataFrame(new_features, columns=['MolWt', 'LogP', 'HDonors', 'HAcceptors'])
new_X = new_X.dropna()
predictions = model.predict(new_X)

# ✅ Step 4: Show results
for smi, pred in zip(new_smiles_list, predictions):
    status = "Active 👍" if pred == 1 else "Inactive 👎"
    print(f"SMILES: {smi} => Prediction: {status}")


SMILES: CC1=CN(C(=O)NC1=O)CO => Prediction: Inactive 👎
SMILES: COC1=CC2=C(C=C1)N=C(N=C2NC3=CC=C(C=C3)F)OC => Prediction: Inactive 👎
SMILES: CC(=O)OC1=CC=CC=C1C(=O)O => Prediction: Active 👍
