In [1]:
import pandas as pd
import numpy as np

# Define the number of samples
num_samples = 1000

# Generate random data for each feature
data = {
    'Age': np.random.randint(0, 5, num_samples),  # Discrete, Range [0–4]
    'Sex': np.random.randint(0, 3, num_samples),  # Discrete, Range [0–2]
    'Smoking history': np.random.randint(0, 3, num_samples),  # Discrete, Range [0–2]
    'Response status': np.random.randint(0, 4, num_samples),  # Discrete, Range [0–3]

    'VDW': np.random.uniform(-60, -45, num_samples),  # Continuous, Range [−60 to −45]
    'EEL': np.random.uniform(-23, 11, num_samples),  # Continuous, Range [−23 to 11]
    'ESURF': np.random.uniform(-45, -1, num_samples),  # Continuous, Range [−45 to −1]
    'EPB': np.random.uniform(27, 40, num_samples),  # Continuous, Range [27 to 40]

    'Matching rates': np.random.randint(0, 18, num_samples),  # Discrete, Range [0, 17]
    'Convex atoms': np.random.randint(0, 44, num_samples),  # Discrete, Range [0, 43]
    'Connectivity': np.random.randint(0, 24, num_samples),  # Discrete, Range [0, 23]
    'Euclidean distance': np.random.uniform(30, 39, num_samples),  # Continuous, Range [30 to 39]
    'Hydrogen bonds': np.random.randint(775, 1651, num_samples),  # Discrete, Range [775 to 1650]

    'Response': np.random.randint(0, 2, num_samples)  # Discrete, Binary values [0, 1]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('synthetic_drug_response_data_with_response.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('synthetic_drug_response_data_with_response.csv')

# Separate features and target variable
X = df.drop(columns='Response')
y = df['Response']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.55
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.54      0.55       104
           1       0.52      0.55      0.54        96

    accuracy                           0.55       200
   macro avg       0.55      0.55      0.54       200
weighted avg       0.55      0.55      0.55       200



In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_:.2f}")

# Get the best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.57


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distributions
param_distributions = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                    param_distributions=param_distributions,
                                    n_iter=100,
                                    cv=5,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    verbose=2,
                                    random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X_train_scaled, y_train)

# Best parameters and best score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Score: {random_search.best_score_:.2f}")

# Get the best model
best_model = random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 179}
Best Score: 0.57


In [None]:
# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.53
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.53      0.54       104
           1       0.51      0.53      0.52        96

    accuracy                           0.53       200
   macro avg       0.53      0.53      0.53       200
weighted avg       0.53      0.53      0.53       200



In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.55
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       104
           1       0.53      0.53      0.53        96

    accuracy                           0.55       200
   macro avg       0.54      0.54      0.54       200
weighted avg       0.55      0.55      0.55       200

