<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_3_2_Random_Forest_Classification_on_Molecular_Descriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Install RDKit, pandas, and scikit-learn (if not already installed)

In [None]:
!pip install -q rdkit pandas scikit-learn

# Step 2: Import libraries

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Load BBBP dataset from GitHub

In [None]:
# Read the BBBP CSV straight from its raw GitHub URL into a DataFrame,
# then preview the first five rows to confirm the columns loaded as expected.
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
data = pd.read_csv(filepath_or_buffer=url)
data.head(n=5)

Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


# Step 4: Define a function to compute descriptors

In [None]:
# Step 4: Feature Extraction with RDKit (safe handling)
def compute_descriptors(smiles):
    """Compute six RDKit molecular descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Non-string values (for
        example NaN from a missing CSV cell) and blank strings are treated
        as unparseable instead of being passed on to RDKit.

    Returns
    -------
    dict or None
        A dict with the keys 'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors',
        'TPSA' and 'NumRotatableBonds', or None when no molecule could be
        built from the input.
    """
    # Guard before touching RDKit: the parser expects text, so a missing or
    # non-string entry should yield None here rather than abort the .apply()
    # over the whole column. A blank string carries no structure either.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse/sanitization failure by returning None.
    if mol is None:
        return None

    return {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol)
    }

# Run the descriptor calculation on every entry of the 'smiles' column
descriptor_data = data['smiles'].apply(compute_descriptors)

# Keep only the rows where a descriptor dict came back (None marks a failure),
# expand those dicts into columns, and attach the matching 'p_np' labels.
valid_mask = descriptor_data.notna()
df_desc = pd.DataFrame(descriptor_data.loc[valid_mask].tolist()).assign(
    Label=data.loc[valid_mask, 'p_np'].values
)

[22:57:29] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:57:29] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 11 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 12 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:57:30] Explicit valence for atom # 5 N, 4, is greater than permitted


# Step 5: Train/Test Split and Model Training

In [None]:
# Target is the 'Label' column; features are all remaining descriptor columns
y = df_desc['Label']
X = df_desc.drop(columns='Label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Step 6: Model Evaluation

In [None]:
# Predict labels for the held-out test rows
y_pred = rf.predict(X_test)

# Report overall accuracy first, then the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Accuracy: 0.8431372549019608

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.55      0.63        99
           1       0.87      0.94      0.90       309

    accuracy                           0.84       408
   macro avg       0.80      0.74      0.76       408
weighted avg       0.84      0.84      0.83       408

