In [15]:
!pip install xgboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb



In [16]:
# Read the feature-engineered table from disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
csv_path = "D:\\My Programming\\tox21trainingdata.sdf\\feature_engineered_data.csv"
df = pd.read_csv(csv_path)

In [17]:
# Model inputs are four columns of df; the target is the 'toxicity_score' column.
feature_columns = ['MolWeight', 'NumAtoms', 'NumHeavyAtoms', 'NumRotatableBonds']
X = df[feature_columns]
y = df['toxicity_score']

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# XGBoost regressor configuration: squared-error objective, 1000 boosting
# rounds, learning rate 0.05, trees limited to depth 6.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

In [20]:
# Train the regressor on the training split (the fitted model is the cell's displayed value).
xgb_model.fit(X=X_train, y=y_train)

In [21]:
# Predict the target for the held-out rows.
y_pred = xgb_model.predict(X=X_test)

In [22]:
# Score the held-out predictions: R2, MAE and MSE come straight from sklearn;
# RMSE is the square root of the MSE.
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
rmse = np.sqrt(mse)

In [23]:
# Report the four metrics computed above, one per line.
for label, value in (
    ('R2 score:', r2),
    ('Mean absolute error:', mae),
    ('Mean squared error:', mse),
    ('Root mean squared error:', rmse),
):
    print(label, value)

R2 score: 0.4396948800921058
Mean absolute error: 1.1637416017726152
Mean squared error: 3.5348796393645014
Root mean squared error: 1.880127559333276


In [25]:
#Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Keep the first two principal components.
pca = PCA(n_components=2)

# Learn the components from the training split only, then project it.
X_train_pca = pca.fit_transform(X_train)

# Project the held-out split with the components learned above.
# (Previously this transformed X_train again, so X_test_pca contained the
# training rows instead of the test rows.)
X_test_pca = pca.transform(X_test)


In [33]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# PCA followed by ordinary least squares. The pipeline is scored with 5-fold
# cross-validation on the training split and then on the held-out split.
# NOTE(review): with the four feature columns selected earlier, n_components=4
# keeps every component, so PCA discards nothing here - confirm this is intended.
pca = PCA(n_components=4)
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca', pca), ('linear_reg', linear_reg)])
pipeline.fit(X_train, y_train)

# cross_val_score fits clones of the pipeline, so the fit above is unaffected.
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

y_pred = pipeline.predict(X_test)

# Keep the results under their own names. Assigning them to r2_score,
# mean_absolute_error or mean_squared_error would rebind the imported sklearn
# functions to floats and break every later call to them.
pipeline_r2 = r2_score(y_test, y_pred)
pipeline_mae = mean_absolute_error(y_test, y_pred)
pipeline_mse = mean_squared_error(y_test, y_pred)
pipeline_rmse = np.sqrt(pipeline_mse)
print("R2 score: {}".format(pipeline_r2))
print("Mean absolute error: {}".format(pipeline_mae))
print("Mean squared error: {}".format(pipeline_mse))
print("Root mean squared error: {}".format(pipeline_rmse))



Cross-validation scores: [0.06210689 0.04768568 0.05211181 0.03089278 0.05763929]
Average cross-validation score: 0.05
R2 score: 0.07263698644902705
Mean absolute error: 1.6898195017928184
Mean squared error: 5.850591969318278
Root mean squared error: 2.418799695989372


In [37]:
#Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Linear model whose coefficients RFE uses to rank the features.
lr = LinearRegression()

# Never request more features than the data has; asking for 5 out of fewer
# columns makes RFE keep everything.
# NOTE(review): with the four-column X_train built earlier this still keeps
# all columns, so nothing is eliminated - lower the target if real selection
# is intended.
n_features_to_select = min(5, X_train.shape[1])
rfe = RFE(estimator=lr, n_features_to_select=n_features_to_select)

# Fit RFE on the training split and keep only the selected columns.
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Apply the same column selection to the held-out split.
X_test_rfe = rfe.transform(X_test)

# Names of the columns RFE kept (boolean mask over X_train's columns).
selected_features = X_train.columns[rfe.support_]



In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Rebuild the inputs from the RFE-selected columns; the target is unchanged.
X = df[selected_features]
y = df['toxicity_score']

# Re-split the data. This overwrites the earlier X_train/X_test/y_train/y_test.
# NOTE(review): this split holds out 30% while the first split held out 20%,
# so metrics from here on are computed on a different test set than the
# XGBoost metrics - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit ordinary least squares and predict on both splits.
lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Print the same four metrics for the training split, then the testing split.
for header, y_true, y_hat in (
    ("Training set metrics:", y_train, y_train_pred),
    ("\nTesting set metrics:", y_test, y_test_pred),
):
    print(header)
    print("R2 score:", r2_score(y_true, y_hat))
    print("Mean absolute error:", mean_absolute_error(y_true, y_hat))
    print("Mean squared error:", mean_squared_error(y_true, y_hat))
    print("Root mean squared error:", np.sqrt(mean_squared_error(y_true, y_hat)))


Training set metrics:
R2 score: 0.05085525965058246
Mean absolute error: 1.7311201525987592
Mean squared error: 6.557909754132812
Root mean squared error: 2.560841610512609

Testing set metrics:
R2 score: 0.07387302405161289
Mean absolute error: 1.7088744155070774
Mean squared error: 5.867634024623084
Root mean squared error: 2.422319967432685


In [40]:
#Normalization
from sklearn.preprocessing import MinMaxScaler

# Learn per-column min/max from the training split only, then rescale both
# splits with those same statistics.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)


In [42]:
# Min-max scale the features (statistics learned on the training split only)
# and fit ordinary least squares on the scaled inputs.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict the toxicity scores for the training and testing sets
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Evaluate the model on the training and testing sets.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases, and np.sqrt matches how the rest of the notebook computes RMSE.
print("Training set metrics:")
print("R2 score:", r2_score(y_train, y_train_pred))
print("Mean absolute error:", mean_absolute_error(y_train, y_train_pred))
print("Mean squared error:", mean_squared_error(y_train, y_train_pred))
print("Root mean squared error:", np.sqrt(mean_squared_error(y_train, y_train_pred)))

print("\nTesting set metrics:")
print("R2 score:", r2_score(y_test, y_test_pred))
print("Mean absolute error:", mean_absolute_error(y_test, y_test_pred))
print("Mean squared error:", mean_squared_error(y_test, y_test_pred))
print("Root mean squared error:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Training set metrics:
R2 score: 0.05085525965058235
Mean absolute error: 1.7311201525987592
Mean squared error: 6.557909754132813
Root mean squared error: 2.560841610512609

Testing set metrics:
R2 score: 0.07387302405161311
Mean absolute error: 1.7088744155070774
Mean squared error: 5.867634024623083
Root mean squared error: 2.422319967432685


In [44]:
#hyperparameter tuning using RandomizedSearchCV
from scipy.stats import uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Candidate regularisation strengths are drawn uniformly from [0, 1].
param_dist = {'alpha': uniform(0, 1)}
model = Ridge()

# Seed the sampler so the same alphas are drawn, and the same one is chosen,
# on every run; without random_state the reported best alpha changes between
# executions.
search = RandomizedSearchCV(
    model, param_distributions=param_dist, cv=5, random_state=42
)
search.fit(X_train, y_train)
print(search.best_params_)

# With the default refit=True the search has already refit the best
# configuration on the full training split, so reuse that estimator instead
# of fitting an identical Ridge a second time.
best_model = search.best_estimator_

# Evaluate the model on the testing data
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R2 score: {r2}")
print(f"Mean absolute error: {mae}")
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")



{'alpha': 0.81768981027936}
R2 score: 0.07387297018862349
Mean absolute error: 1.708874602632774
Mean squared error: 5.86763436588116
Root mean squared error: 2.422320037873022


In [48]:
#Ensemble methods
#Bagging with Random Forest
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Instantiate the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Instantiate the Bagging model with Random Forest as the base estimator.
# The estimator is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator`, and the old name is removed in recent
# scikit-learn releases; the first positional argument works on both.
bagging_rf = BaggingRegressor(rf, random_state=42)

# Fit the Bagging model to the training data
bagging_rf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_bagging_rf = bagging_rf.predict(X_test)

# Evaluate the performance of the Bagging model
print("Bagging with Random Forest:")
print("R2 score:", r2_score(y_test, y_pred_bagging_rf))
print("Mean absolute error:", mean_absolute_error(y_test, y_pred_bagging_rf))
print("Mean squared error:", mean_squared_error(y_test, y_pred_bagging_rf))
print("Root mean squared error:", np.sqrt(mean_squared_error(y_test, y_pred_bagging_rf)))






Bagging with Random Forest:
R2 score: 0.42092686729837336
Mean absolute error: 1.152133125845822
Mean squared error: 3.668815728756508
Root mean squared error: 1.9154152888490026


In [49]:
#Boosting with Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Build a seeded gradient-boosting regressor and train it on the training split.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

# Predict the held-out rows.
y_pred_gb = gb.predict(X_test)

# Report the hold-out metrics; the MSE is computed once and reused for the RMSE.
gb_mse = mean_squared_error(y_test, y_pred_gb)
print("Boosting with Gradient Boosting:")
print("R2 score:", r2_score(y_test, y_pred_gb))
print("Mean absolute error:", mean_absolute_error(y_test, y_pred_gb))
print("Mean squared error:", gb_mse)
print("Root mean squared error:", np.sqrt(gb_mse))


Boosting with Gradient Boosting:
R2 score: 0.19821133493722098
Mean absolute error: 1.5441379012507601
Mean squared error: 5.079867635711402
Root mean squared error: 2.253856170147377
