In [1]:
import warnings
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Suppress deprecation warnings (optional)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load dataset
data = pd.read_csv('drug_drug.csv')  # Replace with correct path if needed

# Function to calculate Morgan fingerprints
def calculate_morgan_fingerprint(smiles, radius=2, nBits=1024):
    if isinstance(smiles, str):
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
            return list(fingerprint)
    return [0] * nBits  # Return zeros for invalid SMILES

# Apply fingerprint calculation
data['molecular_fingerprints'] = data['SMILES'].apply(calculate_morgan_fingerprint)




In [2]:
# Extract features and labels
X = pd.DataFrame(data['molecular_fingerprints'].tolist())
y = data['Side Effects'].fillna("unknown").apply(lambda s: 1 if "severe" in str(s).lower() else 0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Convert numpy arrays to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Define a simple feedforward neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Initialize model, loss function, and optimizer
input_size = X_train.shape[1]
model = SimpleNN(input_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 5/20, Loss: 0.4607
Epoch 10/20, Loss: 0.2547
Epoch 15/20, Loss: 0.0993
Epoch 20/20, Loss: 0.0256


In [4]:
# Evaluation
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    predicted_classes = (predictions > 0.5).float()
    acc = accuracy_score(y_test_tensor, predicted_classes)
    print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 1.0000


In [6]:
# Function to predict side effect for a new SMILES
# Helper to get canonical SMILES
def canonicalize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol) if mol else None

# Updated prediction function
def predict_side_effect(smiles, model, scaler, dataset):
    fingerprint = calculate_morgan_fingerprint(smiles)
    X_new = pd.DataFrame([fingerprint])
    X_new_scaled = scaler.transform(X_new)
    X_tensor = torch.tensor(X_new_scaled, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        prediction = model(X_tensor)
        score = prediction.item()
        result = "Severe Side Effect" if score > 0.5 else "Not Severe"

        print(f"\nPrediction for SMILES: {smiles}")
        print(f"Severity Score: {score:.4f}")
        print(f"Model Prediction: {result}")

        # Canonicalize input and dataset SMILES
        input_canonical = canonicalize(smiles)
        dataset['canonical'] = dataset['SMILES'].apply(canonicalize)
        
        # Search by canonical SMILES
        matched = dataset[dataset['canonical'] == input_canonical]
        if not matched.empty:
            side_effect_info = matched['Side Effects'].values[0]
            print(f"Known Side Effects: {side_effect_info}")
        else:
            print("No known side effects found in dataset for this compound.")



# Example usage
new_smiles = "O=C(Cn1c(nnn1)SCC1C(=O)N(C(C2=C1NC(=O)C(C2C)N)=O)C)O"  # Aspirin
predict_side_effect(new_smiles, model, scaler, data)


Prediction for SMILES: O=C(Cn1c(nnn1)SCC1C(=O)N(C(C2=C1NC(=O)C(C2C)N)=O)C)O
Severity Score: 0.0055
Model Prediction: Not Severe




TypeError: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type float