In [1]:
pip install pytdc

Collecting pytdc
  Downloading pytdc-1.1.1.tar.gz (146 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/146.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.8/146.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate==0.33.0 (from pytdc)
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting biopython<2.0,>=1.78 (from pytdc)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting dataclasses<1.0,>=0.6 (from pytdc)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting datasets==2.20.0 (from pytdc)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.2 (from pytdc)
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting fuzzywuzzy<1.0,>=0.18.0 (from pytdc)
  Downloading fuzz

In [1]:
from tdc.single_pred import Tox
from sklearn import svm

# Fetch the 'hERG_Karim' dataset through TDC's single-prediction Tox loader.
data = Tox(name='hERG_Karim')

# Pull the dataset into a table; leaving `df` as the cell's last expression
# lets the notebook render it.
df = data.get_data()
df

Found local copy...
Loading...
Done!


Unnamed: 0,Drug_ID,Drug,Y
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0
...,...,...,...
13440,13440,Cc1csc(NC(=O)c2sc3nc4c(c(C(F)(F)F)c3c2N)CCC4)n1,0
13441,13441,Cc1cccc(-c2n[nH]cc2-c2ccc3ncccc3n2)n1,0
13442,13442,Cc1ccccc1-n1c(Cn2cnc3c(N)ncnc32)nc2cccc(C)c2c1=O,0
13443,13443,Cc1ccccc1-n1c(Cn2ncc3c(N)ncnc32)nc2cccc(C)c2c1=O,0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors

# Request TDC's split with its default settings, then keep the 'Drug' column
# (SMILES strings) and the 'Y' column (labels) of the train and valid parts.
split = data.get_split()
smiles_train, Y_train = split['train']['Drug'], split['train']['Y']
smiles_valid, Y_valid = split['valid']['Drug'], split['valid']['Y']

# Function to compute Morgan fingerprint
def compute_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.
    radius : int, default 2
        Radius handed to RDKit's Morgan fingerprint routine.
    nBits : int, default 1024
        Length of the bit vector requested from RDKit.

    Returns
    -------
    numpy.ndarray
        Array built from the RDKit bit vector.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # RDKit reports an unparsable SMILES by returning None; stop here with a
    # message naming the offending input instead of letting the fingerprint
    # call fail on a None molecule.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fingerprint)

# Function to compute MACCS fingerprint
def compute_maccs_fingerprint(smiles):
    """Return the MACCS keys of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        Array built from the bit vector produced by ``MACCSkeys.GenMACCSKeys``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # RDKit reports an unparsable SMILES by returning None; raise a clear
    # error rather than passing None on to the key generator.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    maccs = MACCSkeys.GenMACCSKeys(mol)
    return np.array(maccs)

# Function to compute additional molecular descriptors
def compute_molecular_descriptors(smiles):
    """Return a fixed-order vector of RDKit descriptors for a molecule.

    The values appear in this order: molecular weight, LogP, topological
    polar surface area, rotatable-bond count, aromatic-ring count,
    hydrogen-bond donor count, hydrogen-bond acceptor count.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the seven descriptor values.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # RDKit reports an unparsable SMILES by returning None; raise a clear
    # error rather than passing None on to the descriptor functions.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # The order here fixes the column order of the resulting feature vector.
    descriptor_functions = (
        Descriptors.MolWt,              # Molecular weight
        Descriptors.MolLogP,            # LogP
        Descriptors.TPSA,               # Topological Polar Surface Area
        Descriptors.NumRotatableBonds,  # Rotatable bonds
        Descriptors.NumAromaticRings,   # Aromatic rings
        Descriptors.NumHDonors,         # Hydrogen bond donors
        Descriptors.NumHAcceptors,      # Hydrogen bond acceptors
    )
    return np.array([describe(mol) for describe in descriptor_functions])

# Function to compute combined features
def compute_combined_fingerprints(smiles):
    """Build one flat feature vector for a SMILES string.

    The result is the Morgan fingerprint, then the MACCS keys, then the
    molecular descriptors, joined end to end with ``np.concatenate``.
    """
    feature_parts = [
        compute_morgan_fingerprint(smiles),
        compute_maccs_fingerprint(smiles),
        compute_molecular_descriptors(smiles),
    ]
    return np.concatenate(feature_parts)

# Featurize each training molecule, then stack the per-molecule vectors into
# a single 2-D feature matrix.
train_features = smiles_train.apply(compute_combined_fingerprints)
X_train_combined = np.stack(train_features.to_numpy())

# Same featurization for the validation molecules.
valid_features = smiles_valid.apply(compute_combined_fingerprints)
X_valid_combined = np.stack(valid_features.to_numpy())

# Standardize the features: the scaler is fitted on the training matrix only
# and then applied unchanged to both training and validation matrices.
scaler = StandardScaler().fit(X_train_combined)
X_train_scaled = scaler.transform(X_train_combined)
X_valid_scaled = scaler.transform(X_valid_combined)

# Fit a 100-tree random forest with a fixed seed on the scaled training data.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, Y_train)

# Score the fitted forest on the validation split.
y_pred_rf = rf_model.predict(X_valid_scaled)
print("Random Forest Accuracy:", accuracy_score(Y_valid, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(Y_valid, y_pred_rf))

Random Forest Accuracy: 0.8638392857142857
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.87       683
           1       0.86      0.87      0.86       661

    accuracy                           0.86      1344
   macro avg       0.86      0.86      0.86      1344
weighted avg       0.86      0.86      0.86      1344



In [6]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors#used
from rdkit.Chem import rdmolops


# Split the dataset with TDC's default settings and unpack the 'Drug' column
# (SMILES strings) and 'Y' column (labels) for the train and valid parts.
split = data.get_split()
smiles_train, Y_train = split['train']['Drug'], split['train']['Y']
smiles_valid, Y_valid = split['valid']['Drug'], split['valid']['Y']

# Compute MACCS keys
def compute_maccs_fingerprint(smiles):
    """Return the MACCS keys of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        Array built from the bit vector produced by ``MACCSkeys.GenMACCSKeys``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An unparsable SMILES comes back from RDKit as None; fail early with a
    # message that names the input instead of handing None to RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    maccs = MACCSkeys.GenMACCSKeys(mol)
    return np.array(maccs)

# Compute additional molecular descriptors
def compute_molecular_descriptors(smiles):
    """Return a fixed-order vector of RDKit descriptors for a molecule.

    The values appear in this order: molecular weight, LogP, topological
    polar surface area, rotatable-bond count, aromatic-ring count,
    hydrogen-bond donor count, hydrogen-bond acceptor count.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the seven descriptor values.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An unparsable SMILES comes back from RDKit as None; fail early with a
    # message that names the input instead of handing None to RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # The order here fixes the column order of the resulting feature vector.
    descriptor_functions = (
        # Molecular weight
        Descriptors.MolWt,
        # LogP (octanol-water partition coefficient)
        Descriptors.MolLogP,
        # Topological Polar Surface Area (TPSA)
        Descriptors.TPSA,
        # Number of rotatable bonds
        Descriptors.NumRotatableBonds,
        # Number of aromatic rings (a count, not a True/False flag)
        Descriptors.NumAromaticRings,
        # Hydrogen bond donors and acceptors
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
    )
    return np.array([describe(mol) for describe in descriptor_functions])

# Compute fingerprints
def compute_combined_fingerprints(smiles):
    """Concatenate all per-molecule features into a single vector.

    Segments appear in this order: Morgan fingerprint, MACCS fingerprint,
    molecular descriptors.
    """
    return np.concatenate([
        compute_morgan_fingerprint(smiles),     # Morgan fingerprint bits
        compute_maccs_fingerprint(smiles),      # MACCS fingerprint bits
        compute_molecular_descriptors(smiles),  # numeric descriptors
    ])

def compute_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.
    radius : int, default 2
        Radius handed to RDKit's Morgan fingerprint routine.
    nBits : int, default 1024
        Length of the bit vector requested from RDKit.

    Returns
    -------
    numpy.ndarray
        Array built from the RDKit bit vector.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An unparsable SMILES comes back from RDKit as None; fail early with a
    # message that names the input instead of handing None to RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fingerprint)

# Featurize each training molecule, then stack the per-molecule vectors into
# a single 2-D feature matrix.
train_features = smiles_train.apply(compute_combined_fingerprints)
X_train_combined = np.stack(train_features.to_numpy())

# Same featurization for the validation molecules.
valid_features = smiles_valid.apply(compute_combined_fingerprints)
X_valid_combined = np.stack(valid_features.to_numpy())

# Standardize the features: the scaler is fitted on the training matrix only
# and then applied unchanged to both training and validation matrices.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train_combined)
X_train_scaled = scaler.transform(X_train_combined)
X_valid_scaled = scaler.transform(X_valid_combined)

# Train the SVM model. Validation accuracies noted for the kernels tried:
#   rbf  -> 0.8288690476190477
#   poly -> 0.828125
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, Y_train)

# Predict on the validation split and report the SVM's scores.
y_pred_svm = svm_model.predict(X_valid_scaled)
print("SVM Accuracy:", accuracy_score(Y_valid, y_pred_svm))
print("SVM Classification Report:\n", classification_report(Y_valid, y_pred_svm))

SVM Accuracy: 0.8288690476190477
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83       683
           1       0.82      0.84      0.83       661

    accuracy                           0.83      1344
   macro avg       0.83      0.83      0.83      1344
weighted avg       0.83      0.83      0.83      1344

