## BBB_Martins

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
# Load your dataset
data1 = pd.read_csv('BBB_Martins_train_ECFP_R2B1024.csv')  # Replace with your file path
data2 = pd.read_csv('BBB_Martins_valid_ECFP_R2B1024.csv')  # Replace with your file path
data = pd.concat([data1, data2], ignore_index=True)
data

Unnamed: 0,Drug_ID,0,1,2,3,4,5,6,7,8,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,Terbutylchlorambucil,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,40730,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2,cloxacillin,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,cefoperazone,0,1,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
4,rolitetracycline,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619,sibopirdine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1620,Sulfadiazine,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1621,2-methylpropanol,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1622,thialbarbital,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [6]:
# Split the dataset into features and target variable

X_train = data.drop(columns=['Drug_ID', 'label'])
y_train = data['label']

test = pd.read_csv('BBB_Martins_test_ECFP_R2B1024.csv')  # Replace with your file path
X_test = test.drop(columns=['Drug_ID', 'label'])
y_test = test['label']

In [7]:
# Split the dataset into features and target variable


## Feature Scaling (Standardization)
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LightGBM classifier
lgb_classifier = lgb.LGBMClassifier()

# Hyperparameter Tuning
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500]
}

grid_search = GridSearchCV(lgb_classifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model after grid search
best_model = grid_search.best_estimator_

# Using Cross-Validation for better validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

# Train the model with the best parameters
best_model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=False)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print("Cross-Validation Scores:", cv_scores)
print("Average CV Score:", cv_scores.mean())
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)




Cross-Validation Scores: [0.8        0.84       0.92923077 0.91692308 0.91049383]
Average CV Score: 0.879329534662868
Accuracy: 0.8497536945812808
Precision: 0.8457142857142858
Recall: 0.976897689768977
F1 Score: 0.9065849923430321
Confusion Matrix:
 [[ 49  54]
 [  7 296]]


In [9]:
from sklearn.metrics import log_loss

y_pred_logits = best_model.predict_proba(X_test)[:, 1]  # get the probability for the positive class

# Compute BCEWithLogits
# Note: log_loss in sklearn computes the cross-entropy loss between true and predicted labels
bce_with_logits = log_loss(y_test, y_pred_logits)

print("BCEWithLogits Score:", bce_with_logits)

BCEWithLogits Score: 0.3797190096835016


## Neural Network

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [16]:
# # Load dataset
# data = pd.read_csv('path_to_your_dataset.csv')
# X = data.drop(columns=['Drug_ID', 'label']).values
# y = data['label'].values
# 
# # Data Preprocessing
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# Convert to PyTorch tensors
X = data.drop(columns=['Drug_ID', 'label']).values
y = data['label'].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# DataLoader
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# Neural Network Model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)  # Removed sigmoid here as BCEWithLogitsLoss applies it internally
        return x

model = Net()

# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training the Model
epochs = 10
for epoch in range(epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
total_loss = 0
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        total_loss += loss.item()
        
        predicted = (outputs.data > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()


bce_with_logits_score = total_loss / len(val_loader)
print(f'BCEWithLogits Score: {bce_with_logits_score}')
accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')

BCEWithLogits Score: 0.8062885212129913
Accuracy: 85.53846153846153%


In [20]:
# DataLoader
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# Enhanced Neural Network Model
class EnhancedNet(nn.Module):
    def __init__(self):
        super(EnhancedNet, self).__init__()
        self.fc1 = nn.Linear(1024, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 1)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = torch.relu(self.bn3(self.fc3(x)))
        x = torch.sigmoid(self.fc4(x))
        return x

model = EnhancedNet()

# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)

# Training the Model
epochs = 50
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
total_loss = 0
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        total_loss += loss.item()
        
        predicted = (outputs.data > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()


bce_with_logits_score = total_loss / len(val_loader)
print(f'BCEWithLogits Score: {bce_with_logits_score}')
accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')


BCEWithLogits Score: 0.4747089097897212
Accuracy: 89.53846153846153%


In [21]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# DataLoader
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# Enhanced Neural Network Model with advanced features
class AdvancedNet(nn.Module):
    def __init__(self):
        super(AdvancedNet, self).__init__()
        self.fc1 = nn.Linear(1024, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 1)

    def forward(self, x):
        x = nn.LeakyReLU(0.01)(self.bn1(self.fc1(x)))
        x = nn.LeakyReLU(0.01)(self.bn2(self.fc2(x)))
        x = nn.LeakyReLU(0.01)(self.bn3(self.fc3(x)))
        x = torch.sigmoid(self.fc4(x))
        return x

model = AdvancedNet()

# Loss and Optimizer with L2 Regularization
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)

# Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# Training the Model with Early Stopping
epochs = 30
best_loss = float('inf')
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

    # Validation loss for scheduler and early stopping
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Reduce LR on plateau
    scheduler.step(val_loss)

    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
    else:
        print(f"Early stopping at epoch {epoch}")
        break

# Final Evaluation
model.eval()
total_loss = 0
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        total_loss += loss.item()
        
        predicted = (outputs.data > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()


bce_with_logits_score = total_loss / len(val_loader)
print(f'BCEWithLogits Score: {bce_with_logits_score}')
accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')


Early stopping at epoch 7
BCEWithLogits Score: 0.4812813252210617
Accuracy: 90.15384615384616%


## CYP3A4_Veith

In [22]:
# Load your dataset
data1 = pd.read_csv('CYP3A4_Veith_train_ECFP_R2B1024.csv')  # Replace with your file path
data2 = pd.read_csv('CYP3A4_Veith_valid_ECFP_R2B1024.csv')  # Replace with your file path
data = pd.concat([data1, data2], ignore_index=True)
data

Unnamed: 0,Drug_ID,0,1,2,3,4,5,6,7,8,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,644510.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,644675.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,645063.0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,645164.0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,6602688.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9857,3233595.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
9858,73397.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9859,5342516.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9860,1484761.0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Split the dataset into features and target variable

X_train = data.drop(columns=['Drug_ID', 'label'])
y_train = data['label']

test = pd.read_csv('CYP3A4_Veith_test_ECFP_R2B1024.csv')  # Replace with your file path
X_test = test.drop(columns=['Drug_ID', 'label'])
y_test = test['label']

In [24]:
# Split the dataset into features and target variable


## Feature Scaling (Standardization)
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LightGBM classifier
lgb_classifier = lgb.LGBMClassifier()

# Hyperparameter Tuning
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500]
}

grid_search = GridSearchCV(lgb_classifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model after grid search
best_model = grid_search.best_estimator_

# Using Cross-Validation for better validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

# Train the model with the best parameters
best_model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=False)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print("Cross-Validation Scores:", cv_scores)
print("Average CV Score:", cv_scores.mean())
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Cross-Validation Scores: [0.82108464 0.80486569 0.76470588 0.69574037 0.77079108]
Average CV Score: 0.771437530392551
Accuracy: 0.8077858880778589
Precision: 0.7941495124593716
Recall: 0.7207472959685349
F1 Score: 0.7556701030927834
Confusion Matrix:
 [[1259  190]
 [ 284  733]]


In [25]:
from sklearn.metrics import log_loss

y_pred_logits = best_model.predict_proba(X_test)[:, 1]  # get the probability for the positive class

# Compute BCEWithLogits
# Note: log_loss in sklearn computes the cross-entropy loss between true and predicted labels
bce_with_logits = log_loss(y_test, y_pred_logits)

print("BCEWithLogits Score:", bce_with_logits)

BCEWithLogits Score: 0.42004587140973415


In [26]:

# # Load dataset
# data = pd.read_csv('path_to_your_dataset.csv')
# X = data.drop(columns=['Drug_ID', 'label']).values
# y = data['label'].values
# 
# # Data Preprocessing
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Reshape for 1D CNN: [batch, channels, length]
X_tensor = X_tensor.unsqueeze(1)

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# DataLoader
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# 1D Convolutional Neural Network
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 1024, 128)
        self.fc2 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

model = CNN1D()

# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the Model
epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
total_loss = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        total_loss += loss.item()

bce_with_logits_score = total_loss / len(val_loader)
print(f'BCEWithLogits Score: {bce_with_logits_score}')


BCEWithLogits Score: 0.6111782888571421
