In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [2]:
# Load the Excel workbook named by `file_path` into the working DataFrame.
file_path = "filtered_descriptors1.xlsx"
df = pd.read_excel(io=file_path)

In [3]:
# Separate the regression target from the predictors.
# "name" is dropped together with the target, so X holds every remaining column.
y = df.loc[:, "Comprehensive Toxicity Value"]
X = df.drop(["name", "Comprehensive Toxicity Value"], axis=1)

In [4]:
# Standardise the features, then fit a Lasso whose regularisation strength
# is picked by LassoCV's own 5-fold cross-validation.
lasso_steps = [
    ("scaler", StandardScaler()),
    ("model", LassoCV(max_iter=10000, cv=5, random_state=42)),
]
lasso_pipe = Pipeline(steps=lasso_steps)
lasso_pipe.fit(X, y)

In [5]:
# Collect the names of the columns whose fitted Lasso coefficient is non-zero.
lasso_coef = lasso_pipe.named_steps["model"].coef_
lasso_selected = X.columns[np.flatnonzero(lasso_coef)].tolist()

In [6]:
# 10-fold cross-validation of the whole pipeline (scaler + LassoCV);
# one R² value per fold, then their average.
lasso_scores = cross_val_score(estimator=lasso_pipe, X=X, y=y, scoring="r2", cv=10)
mean_lasso_score = lasso_scores.mean()

In [7]:
# Random forest with a fixed seed, fitted on the full data set;
# n_jobs=-1 lets scikit-learn use every available core.
rf_params = {"n_estimators": 500, "random_state": 42, "n_jobs": -1}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X, y)

In [8]:
# 10-fold cross-validation of the forest; one R² value per fold, then their average.
rf_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring="r2", cv=10)
mean_rf_score = rf_scores.mean()

In [9]:
# Rank the features by the fitted forest's importances and keep the top 20 %.
importances = rf_model.feature_importances_
# The 80th percentile of the importance values is the cut-off for the top fifth.
threshold = np.percentile(importances, 80)
# `>=` alone would keep *every* column when the cut-off itself is 0 (i.e. most
# features were never used by the forest), so zero-importance features are
# excluded explicitly. When the cut-off is positive the result is unchanged.
rf_selected = X.columns[(importances >= threshold) & (importances > 0)].tolist()

In [10]:
# Features chosen by both selectors, in alphabetical order.
common_selected = sorted(set(lasso_selected).intersection(rf_selected))

In [11]:
def _labelled_subset(feature_names):
    """Return the "name" and target columns of `df` followed by the given columns of `X`."""
    id_and_target = df[["name", "Comprehensive Toxicity Value"]]
    return pd.concat([id_and_target, X[feature_names]], axis=1)


# One table per selection: Lasso, random forest, and their intersection.
df_lasso = _labelled_subset(lasso_selected)
df_rf = _labelled_subset(rf_selected)
df_common = _labelled_subset(common_selected)

# Write each table to its own workbook, leaving the row index out.
for frame, out_name in (
    (df_lasso, "descriptors_lasso12.xlsx"),
    (df_rf, "descriptors_rf12.xlsx"),
    (df_common, "descriptors_common12.xlsx"),
):
    frame.to_excel(out_name, index=False)

In [12]:
# Summary of the run: cross-validated scores first, then the selection sizes.
summary_rows = [
    ("✅ Lasso cross-validation R² score:", mean_lasso_score),
    ("✅ Random Forest cross validation R² score:", mean_rf_score),
    ("✅ Lasso select the number of features:", len(lasso_selected)),
    ("✅ Random Forest selects the number of features:", len(rf_selected)),
    ("✅ The number of intersection features of the two:", len(common_selected)),
]
for label, value in summary_rows:
    print(label, value)
print("✅ Exported file: descriptors_lasso12.xlsx / descriptors_rf12.xlsx / descriptors_common12.xlsx")

✅ Lasso cross-validation R² score: 0.8365250640083055
✅ Random Forest cross validation R² score: 0.8666559635736926
✅ Lasso select the number of features: 81
✅ Random Forest selects the number of features: 61
✅ The number of intersection features of the two: 26
✅ Exported file: descriptors_lasso12.xlsx / descriptors_rf12.xlsx / descriptors_common12.xlsx
