IMPORT FILES

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from rdkit import Chem
from rdkit.Chem import Descriptors
import json
import pubchempy as pcp
from pubchempy import get_compounds
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import requests,json

DATASET IMPORTING 

In [3]:
merged = pd.read_csv(r"merged.csv")

# === Feature Extraction (RDKit Descriptors) ===

In [4]:

def extract_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0]*8
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.RingCount(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.FractionCSP3(mol)
    ]

# === K-Fold Cross-Validation ===


In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_metrics = []
final_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)


# === Extracting and adjusting infos from Merged file  ===

In [6]:
# === Extract Descriptors ===
merged[['MolWt', 'MolLogP', 'TPSA', 'HDonors', 'HAcceptors', 'Rings', 'RotBonds', 'CSP3']] = \
    merged['SMILES_x'].apply(lambda x: pd.Series(extract_descriptors(x)))

# === Prepare Input/Output ===
X = merged[['MolWt', 'MolLogP', 'TPSA', 'HDonors', 'HAcceptors', 'Rings', 'RotBonds', 'CSP3']]

# If 'toxicity_label' exists, use it; otherwise, use the 6th column as the target
if 'toxicity_label' in merged.columns:
    y = merged['toxicity_label']
else:
    y = merged.iloc[:, 6]

# === Encode Labels ===
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Convert categorical labels to numbers

# === Normalize Features ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Apply SMOTE for Balancing ===
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y_encoded)

# Print to verify the transformation
print("Resampled Feature Shape:", X_resampled.shape)
print("Resampled Target Shape:", y_resampled.shape)
print("Unique Labels after Encoding:", set(y_resampled))  # Should contain only numeric values

# Save Label Encoder for Decoding in Future (if needed)
import joblib
joblib.dump(le, "label_encoder.pkl")


Resampled Feature Shape: (490, 8)
Resampled Target Shape: (490,)
Unique Labels after Encoding: {0, 1}


[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is greater than permitted
[12:59:50] Explicit valence for atom # 0 P, 7, is g

['label_encoder.pkl']

# ===MODEL TRAIN ===

In [7]:
for fold, (train_idx, test_idx) in enumerate(skf.split(X_resampled, y_resampled)):
    X_train, X_test = X_resampled[train_idx], X_resampled[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]

    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)
    y_prob = final_model.predict_proba(X_test)[:, 1]

    metrics = {
        'Fold': fold+1,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred, average='macro'),
        'ROC_AUC': roc_auc_score(y_test, y_prob)
    }
    all_metrics.append(metrics)

# === Save Model (Final Save After K-Fold Completion) ===
joblib.dump(final_model, "drug_toxicity_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and scaler saved successfully after training.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model and scaler saved successfully after training.


# === prediction Function

In [8]:
def predict_and_format(smiles, compound_name=""):
    desc = extract_descriptors(smiles)
    scaled = scaler.transform([desc])
    prob = final_model.predict_proba(scaled)[0]
    pred = final_model.predict(scaled)[0]

    result = {
        "Name": compound_name,
        "SMILES": smiles,
        "Predicted Properties": {
            "MolWt": desc[0],
            "MolLogP": desc[1],
            "TPSA": desc[2],
            "HDonors": desc[3],
            "HAcceptors": desc[4],
            "RingCount": desc[5],
            "Rotatable Bonds": desc[6],
            "FractionCSP3": desc[7]
        },
        "Toxicity Prediction": int(pred),
        "Toxicity Confidence": float(prob[1])
    }
    return json.dumps(result, indent=2)


# === TEST CASES : 

tesr case 1

In [9]:
print(predict_and_format("c1(c2nc(N=C(N)N)sc2)cn(c(c1)C)C", "CHEMBL153534"))

{
  "Name": "CHEMBL153534",
  "SMILES": "c1(c2nc(N=C(N)N)sc2)cn(c(c1)C)C",
  "Predicted Properties": {
    "MolWt": 235.31600000000003,
    "MolLogP": 1.36192,
    "TPSA": 82.22,
    "HDonors": 2,
    "HAcceptors": 4,
    "RingCount": 2,
    "Rotatable Bonds": 2,
    "FractionCSP3": 0.2
  },
  "Toxicity Prediction": 0,
  "Toxicity Confidence": 0.49974536895751953
}




Actual answer for test case 1 :

Molecular Weight: 235.31600000000003
logP: 1.36192
TPSA: 82.22
Hydrogen Donors: 2
Hydrogen Acceptors: 4
Ring Count: 2
Rotatable Bonds: 2
FractionCSP3:Â 0.2



test case 2 

In [10]:
print(predict_and_format("CCOc1ccc(cc1)C(=O)Nc2ccc(cc2)S(=O)(=O)N", "CHEMBL450221"))


{
  "Name": "CHEMBL450221",
  "SMILES": "CCOc1ccc(cc1)C(=O)Nc2ccc(cc2)S(=O)(=O)N",
  "Predicted Properties": {
    "MolWt": 320.37000000000006,
    "MolLogP": 1.9849999999999999,
    "TPSA": 98.49,
    "HDonors": 2,
    "HAcceptors": 4,
    "RingCount": 2,
    "Rotatable Bonds": 5,
    "FractionCSP3": 0.13333333333333333
  },
  "Toxicity Prediction": 0,
  "Toxicity Confidence": 0.49974536895751953
}




Actual answer for test case 2

"MolWt": 318.36,
"MolLogP": 2.1,
"TPSA": 92.5,
"HDonors": 2,
"HAcceptors": 6,
"RingCount": 2,
"Rotatable Bonds": 4,
"FractionCSP3": 0.15
Toxicity Prediction: 1
Toxicity Confidence: 0.7654

test case 3 

In [11]:
print(predict_and_format("CCN(CC)C(=O)Nc1ccc(cc1)N", "CHEMBL216625"))


{
  "Name": "CHEMBL216625",
  "SMILES": "CCN(CC)C(=O)Nc1ccc(cc1)N",
  "Predicted Properties": {
    "MolWt": 207.27700000000002,
    "MolLogP": 2.1424999999999996,
    "TPSA": 58.36,
    "HDonors": 2,
    "HAcceptors": 2,
    "RingCount": 1,
    "Rotatable Bonds": 3,
    "FractionCSP3": 0.36363636363636365
  },
  "Toxicity Prediction": 0,
  "Toxicity Confidence": 0.49974536895751953
}




Actual answer :

{
  "MolWt": 195.25,
  "MolLogP": 1.5,
  "TPSA": 60.4,
  "HDonors": 2,
  "HAcceptors": 3,
  "RingCount": 1,
  "Rotatable Bonds": 2,
  "FractionCSP3": 0.25
}
Toxicity Prediction: 0

Toxicity Confidence: 0.4321

test case 4

In [12]:
output="What is your analysis of this drug candidate,i want to know if it's safe for humans or animals/n?"
output+=str(predict_and_format("CCOc1ccc(cc1)C(=O)Nc2nccs2", "CHEMBL512003"))
output+="\nI don't want huge answer's just a quick summary ok"



actual answer :

{
  "MolWt": 236.26,
  "MolLogP": 1.9,
  "TPSA": 78.2,
  "HDonors": 2,
  "HAcceptors": 5,
  "RingCount": 2,
  "Rotatable Bonds": 3,
  "FractionCSP3": 0.12
}
Toxicity Prediction: 1

Toxicity Confidence: 0.8532

In [13]:
# Read the API key from api.txt
with open('api.txt', 'r') as file:
    API_KEY = file.read().strip()

def build_final_prompt(user_input, format_text=output):
    if not user_input:
        return format_text.strip()
    final_prompt = f"{format_text}\n{user_input}".strip()
    final_prompt = "\n".join([x for x in final_prompt.splitlines() if not x.strip().startswith("#") and x.strip() != ""])
    final_prompt = final_prompt.replace(" = ", "=").replace("= ", "=").replace(" =", "=")
    return final_prompt

if __name__ == "__main__":
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={API_KEY}"
    headers = {"Content-Type": "application/json"}
    user_input = input("Enter input (or leave blank to use only output variable): ").strip()
    final_prompt = build_final_prompt(user_input)
    data = {"contents": [{"parts": [{"text": final_prompt}]}]}
    
    r = requests.post(url, headers=headers, data=json.dumps(data))
    
    try:
        print(r.json()["candidates"][0]["content"]["parts"][0]["text"])
    except KeyError:
        print("Error:", r.json())


Okay, here's a quick summary of the analysis:

Based on the provided data, the compound CHEMBL512003 looks to be a small, relatively lipophilic molecule with a reasonable number of hydrogen bond donors and acceptors. The **predicted toxicity is low** (0), but the **confidence in that prediction is moderate** (around 50%).

**Therefore, while initial data suggests low toxicity, further and more thorough safety testing is definitely required before considering it safe for humans or animals.** The moderate confidence score highlights the need for caution.

