<a href="https://colab.research.google.com/github/alisha-17kakkar/hERG---project/blob/main/hERG---project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit-pypi pandas scikit-learn matplotlib chembl_webresource_client

In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
import os
# Local cache of the raw ChEMBL export, so the slow download only has to happen once.
filename = "herg_activity_data.csv"
if os.path.exists(filename):
    print(f"Loading data from local file: {filename} ...")
    df = pd.read_csv(filename)
    print("Data loaded instantly!")
else:
    print("Local file not found. Downloading data from ChEMBL for the first time...")
    print("(This might take a minute or two, but will only happen once.)")

    # Search the ChEMBL target index and keep the entry with ID CHEMBL240
    # (presumably the human hERG channel, per the variable name -- confirm).
    target = new_client.target
    target_query = target.search('herg')
    targets = pd.DataFrame.from_dict(target_query)
    if targets.empty or 'target_chembl_id' not in targets.columns:
        raise RuntimeError("ChEMBL target search for 'herg' returned no usable results.")
    human_herg_target = targets[targets['target_chembl_id'] == 'CHEMBL240']
    if human_herg_target.empty:
        # Fail with a clear message instead of an IndexError on `.values[0]`.
        raise RuntimeError("Target CHEMBL240 was not found in the ChEMBL search results for 'herg'.")
    target_id = human_herg_target.target_chembl_id.values[0]

    # Pull every IC50 activity record reported against that target.
    activity = new_client.activity
    res = activity.filter(target_chembl_id=target_id).filter(standard_type="IC50")
    df = pd.DataFrame.from_dict(res)
    if df.empty:
        # Never cache an empty result: the file would be picked up on every
        # later run and hide the fact that the download failed.
        raise RuntimeError(f"No IC50 activity records were returned for target {target_id}; nothing was cached.")
    df.to_csv(filename, index=False)
    print(f"Data downloaded and saved to {filename} for fast future access.")
print("-----------------------------------------")
print(f"Shape of the dataset: {df.shape}")
df.head()

In [None]:
# Reduce the raw activity table to the two columns used downstream:
# the molecule's SMILES string and its measured IC50 value.
activity_rows = df
if 'standard_units' in df.columns:
    # The activity cutoff applied later is a bare number, so measurements
    # reported in other units would be mislabelled. Keep nanomolar rows only.
    activity_rows = df[df['standard_units'] == 'nM']

# `.copy()` makes df_clean independent of `df`, so the column assignment below
# cannot trigger SettingWithCopyWarning or write through to the raw frame.
df_clean = activity_rows[['canonical_smiles', 'standard_value']].dropna().copy()

# Values may arrive as strings; anything non-numeric becomes NaN and is dropped.
df_clean['standard_value'] = pd.to_numeric(df_clean['standard_value'], errors='coerce')
df_clean = df_clean.dropna()

# An IC50 of zero or below is not a physically meaningful concentration.
df_clean = df_clean[df_clean['standard_value'] > 0]

print(f"Shape after cleaning: {df_clean.shape}")
df_clean.head()

In [None]:
# Turn the numeric IC50 into a binary label: values at or below 10000 are
# "active", everything above is "inactive".
# NOTE(review): the cutoff presumably assumes nanomolar units (10 uM) -- confirm.
bioactivity_class = [
    "active" if float(ic50) <= 10000 else "inactive"
    for ic50 in df_clean['standard_value']
]
df_clean['bioactivity_class'] = bioactivity_class

# Report the dataset size and how balanced the two classes are.
print(f"Final shape of our dataset: {df_clean.shape}")
print("\nValue counts for each class:")
class_counts = df_clean['bioactivity_class'].value_counts()
print(class_counts)
df_clean.head()

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from tqdm.auto import tqdm
# Register tqdm with pandas so that `progress_apply` (used below when
# fingerprinting every SMILES string) is available and shows a progress bar.
tqdm.pandas()
def generate_fingerprint(smiles_string, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit list.

    Args:
        smiles_string: SMILES representation of one molecule.
        radius: Radius passed to the Morgan fingerprint routine (default 2).
        n_bits: Length of the returned bit vector (default 2048).

    Returns:
        A list of ``n_bits`` integers (one per fingerprint bit), or ``None``
        when the input is not a non-empty string or RDKit cannot parse it.
    """
    # Missing values (None / NaN) and blank strings are not molecules; reject
    # them up front instead of handing them to RDKit.
    if not isinstance(smiles_string, str) or not smiles_string.strip():
        return None

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None.
        return None

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return list(fp)

# Fingerprint every molecule; SMILES that cannot be parsed come back as None.
df_clean['fingerprint'] = df_clean['canonical_smiles'].progress_apply(generate_fingerprint)

# Keep only molecules that were fingerprinted successfully, as an independent copy.
df_final = df_clean.dropna(subset=['fingerprint']).copy()
if df_final.empty:
    # Fail here with a clear message rather than later inside model training.
    raise ValueError("No valid fingerprints were generated; check the 'canonical_smiles' column.")

# Fingerprint bits are only 0/1, so uint8 is sufficient and far smaller than
# NumPy's default integer dtype for a matrix of this width.
X = np.array(df_final['fingerprint'].tolist(), dtype=np.uint8)

# Target labels, aligned row-for-row with X because both come from df_final.
y = df_final['bioactivity_class']

print("\nFingerprint generation complete!")
print("Shape of our features (X):", X.shape)
print("Shape of our target (y):", y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Hold out 20% of the data for evaluation. `stratify=y` keeps the
# active/inactive ratio the same in both splits, so an imbalanced dataset
# cannot produce a test set that under-represents one class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fixed random_state keeps the forest reproducible; n_jobs=-1 trains the
# trees on all available cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete!")


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Score the trained model on the held-out test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")

# Take the label order from the fitted model and pass it to confusion_matrix
# explicitly, so the heatmap tick labels are guaranteed to match the matrix
# rows/columns instead of relying on a hard-coded list.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix (test set)')
plt.show()