label
0    7408
1     423
Name: count, dtype: int64
------------------
label
0    7408
1     423
Name: count, dtype: int64


label
0    7408
1     423
Name: count, dtype: int64
label
0    7408
1    7408
Name: count, dtype: int64


In [65]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from minisom import MiniSom

# Load dataset
file_path = r"C:\Users\Sushma\Desktop\Acadamics\4th sem\BIO-2\project\tox21.csv"
data = pd.read_csv(file_path)
print(data['label'].value_counts())

# Remove duplicates
data = data.drop_duplicates(subset='smiles').reset_index(drop=True)

# Optional: Reset index
data = data.reset_index(drop=True)
print("------------------")
print(data['label'].value_counts())
from sklearn.utils import resample

majority = data[data.label == 0]
minority = data[data.label == 1]

minority_oversampled = resample(minority, 
                                replace=True, 
                                n_samples=len(majority), 
                                random_state=42)

data_balanced = pd.concat([majority, minority_oversampled]).reset_index(drop=True)
print(data_balanced['label'].value_counts())


# Preprocessing SMILES strings to molecular descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return {
            "MolecularWeight": Descriptors.MolWt(mol),
            "NumAtoms": Descriptors.HeavyAtomCount(mol),
            "NumRotatableBonds": Descriptors.NumRotatableBonds(mol),
            "LogP": Descriptors.MolLogP(mol),
        }
    else:
        return {"MolecularWeight": np.nan, "NumAtoms": np.nan, "NumRotatableBonds": np.nan, "LogP": np.nan}

# Apply descriptor calculation
descriptors = data_balanced["smiles"].apply(calculate_descriptors)
descriptors_df = pd.DataFrame(descriptors.tolist())

# Combine descriptors with labels and remove NaNs
data_balanced = pd.concat([descriptors_df, data_balanced["label"]], axis=1).dropna()

# Split dataset into features and target
X = data_balanced.drop("label", axis=1)
y = data_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "SVM": SVC(kernel="rbf", probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=20),
    "RandomForest": RandomForestClassifier(n_estimators=1, random_state=42)
}

# Train and evaluate traditional models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    print(f"\nModel: {name}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["Non-Toxic", "Toxic"]))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    if y_proba is not None:
        print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.2f}")

# Self-Organizing Neural Network (SNN) using MiniSom
print("\nModel: Self-Organizing Neural Network (SNN)")

# Define and train the SOM
som = MiniSom(x=10, y=10, input_len=X_train.shape[1], sigma=1.0, learning_rate=0.7, random_seed=42)
som.train_random(X_train, num_iteration=1000)

# Function to extract SOM features
def get_som_features(som, data):
    som_x, som_y = som._weights.shape[:2]  # Get SOM grid dimensions
    feature_vector = np.zeros((data.shape[0], som_x * som_y))
    
    for i, sample in enumerate(data):
        x, y = som.winner(sample)  # Get SOM cluster coordinates
        feature_vector[i, x * som_y + y] = 1  # One-hot encode cluster
    
    return feature_vector

# Transform train and test sets into SOM feature space
X_train_som = get_som_features(som, X_train)
X_test_som = get_som_features(som, X_test)

# Train a classifier on the extracted features
rf_on_snn = RandomForestClassifier(n_estimators=100, random_state=42)
rf_on_snn.fit(X_train_som, y_train)

# Evaluate SNN-based model
y_pred_snn = rf_on_snn.predict(X_test_som)
print("Classification Report:")
print(classification_report(y_test, y_pred_snn, target_names=["Non-Toxic", "Toxic"]))
print(f"Accuracy: {accuracy_score(y_test, y_pred_snn):.2f}")

label
0    7408
1     423
Name: count, dtype: int64
------------------
label
0    7408
1     423
Name: count, dtype: int64
label
0    7408
1    7408
Name: count, dtype: int64


[20:29:41] Explicit valence for atom # 5 Al, 6, is greater than permitted
[20:29:43] Explicit valence for atom # 3 Al, 6, is greater than permitted
[20:29:43] Explicit valence for atom # 4 Al, 6, is greater than permitted
[20:29:43] Explicit valence for atom # 9 Al, 6, is greater than permitted
[20:29:45] Explicit valence for atom # 4 Al, 6, is greater than permitted
[20:29:45] Explicit valence for atom # 14 Al, 6, is greater than permitted
[20:29:49] Explicit valence for atom # 8 Al, 6, is greater than permitted
[20:29:51] Explicit valence for atom # 20 Al, 6, is greater than permitted



Model: SVM
Classification Report:
              precision    recall  f1-score   support

   Non-Toxic       0.72      0.71      0.71      1476
       Toxic       0.71      0.72      0.72      1486

    accuracy                           0.71      2962
   macro avg       0.71      0.71      0.71      2962
weighted avg       0.71      0.71      0.71      2962

Accuracy: 0.71
ROC-AUC Score: 0.78

Model: KNN
Classification Report:
              precision    recall  f1-score   support

   Non-Toxic       0.91      0.65      0.76      1476
       Toxic       0.73      0.94      0.82      1486

    accuracy                           0.80      2962
   macro avg       0.82      0.80      0.79      2962
weighted avg       0.82      0.80      0.79      2962

Accuracy: 0.80
ROC-AUC Score: 0.89

Model: RandomForest
Classification Report:
              precision    recall  f1-score   support

   Non-Toxic       1.00      0.92      0.96      1476
       Toxic       0.92      1.00      0.96      1486

In [29]:
print(data.describe())



       MolecularWeight     NumAtoms  NumRotatableBonds         LogP  \
count      7823.000000  7823.000000        7823.000000  7823.000000   
mean        276.144155    18.566918           4.302953     2.373943   
std         164.732356    11.309542           4.464812     2.304307   
min           9.012000     1.000000           0.000000   -17.406400   
25%         165.236000    11.000000           1.000000     1.149350   
50%         240.302000    16.000000           3.000000     2.365500   
75%         343.044000    23.000000           6.000000     3.653150   
max        1877.664000   132.000000          47.000000    22.611800   

             label  
count  7823.000000  
mean      0.054071  
std       0.226173  
min       0.000000  
25%       0.000000  
50%       0.000000  
75%       0.000000  
max       1.000000  


In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import resample

# Load the dataset
data = pd.read_csv( r"C:\Users\Sushma\Desktop\Acadamics\4th sem\BIO-2\project\tox21.csv")

# Drop duplicates and NaNs if necessary
data = data.drop_duplicates(subset="smiles").dropna(subset=["smiles", "label"]).reset_index(drop=True)

# Balance the dataset using oversampling
majority = data[data['label'] == 0]
minority = data[data['label'] == 1]

minority_oversampled = resample(minority, 
                                replace=True, 
                                n_samples=len(majority), 
                                random_state=42)

data_balanced = pd.concat([majority, minority_oversampled]).sample(frac=1, random_state=42).reset_index(drop=True)

# Encode SMILES strings to fixed-length numeric arrays
def smiles_to_placeholder(smiles_list, max_length=100):
    encoded = []
    for smiles in smiles_list:
        numeric = [ord(char) for char in smiles[:max_length]]
        padded = numeric + [0] * (max_length - len(numeric)) if len(numeric) < max_length else numeric
        encoded.append(padded)
    return np.array(encoded)

# Encode and prepare features and labels
X = smiles_to_placeholder(data_balanced['smiles'])
y = data_balanced['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize individual models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
svm = SVC(probability=True)

# Fit individual models
rf.fit(X_train, y_train)
knn.fit(X_train, y_train)
svm.fit(X_train, y_train)

# Create and train ensemble VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf),
    ('knn', knn),
    ('svm', svm)
], voting='soft', weights=[0.5, 0.3, 0.3])

ensemble_model.fit(X_train, y_train)

# Predict and evaluate
final_preds = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, final_preds)
f1 = f1_score(y_test, final_preds)

print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(f"Ensemble Model F1 Score: {f1:.4f}")
print(classification_report(y_test, final_preds))


Ensemble Model Accuracy: 0.9723
Ensemble Model F1 Score: 0.9725
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      1516
           1       0.95      1.00      0.97      1448

    accuracy                           0.97      2964
   macro avg       0.97      0.97      0.97      2964
weighted avg       0.97      0.97      0.97      2964

