In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Модели
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


# Подбор значений
from sklearn.model_selection import GridSearchCV

# **1 ИМПОРТ ДАННЫХ**

In [3]:
df = pd.read_csv('D:\mifi\myvenv\Курсовая работа\Курсовая работа\data\df_cleaned.csv')
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MinPartialCharge,...,fr_thiophene,fr_unbrch_alkane,fr_urea,pca_FpDensityMorgan,pca_PEOE,pca_SMR,pca_EState,pca_Chi,pca_VSA,pca_Kappa
0,6.239374,175.482382,28.125,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,-0.293526,...,0,3,0,-1.439069,46.495347,-76.964054,52.268508,11.369482,-21.239901,3.196119
1,0.771831,5.402819,7.0,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,-0.313407,...,0,3,0,-1.499436,60.06963,-83.878971,68.734212,12.488381,-19.896444,3.851082
2,223.808778,161.14232,0.72,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,-0.325573,...,0,3,0,-1.6635,35.848918,-82.342811,83.648215,21.180989,-20.465637,7.558456
3,1.705624,107.855654,63.235294,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,-0.293526,...,0,4,0,-1.476575,50.551237,-80.868829,56.263481,12.999417,-21.541365,4.352762
4,107.131532,139.270991,1.3,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,-0.257239,...,0,0,0,-1.539143,73.55671,-20.80081,77.569576,19.312102,-19.210464,5.877875


# **2 РАЗДЕЛЕНИЕ ДАННЫХ**

In [4]:
# Шаг 1: отделяем X и y
X=df.drop('SI', axis=1)
y=df['SI']

# Шаг 2: делим с перемешиванием (shuffle=True по умолчанию)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

Подготовлены датасеты для обучения и тестирования

# **3 НОРМАЛИЗАЦИЯ**

In [5]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Данные стандартизированы

# **4 ПОДБОР НАИЛУЧШЕЙ МОДЕЛИ**

In [10]:
# Список моделей
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(random_state=42),
    "Lasso Regression": Lasso(random_state=42),
    "ElasticNet": ElasticNet(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# Тренируем и оцениваем каждую модель
mae_scores = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores[name] = mae
    print(f"{name} MAE: {mae:.4f}")

Linear Regression MAE: 43.3183
Ridge Regression MAE: 38.2794
Lasso Regression MAE: 36.1344
ElasticNet MAE: 39.8196
Decision Tree MAE: 18.9350
Random Forest MAE: 13.6265
Gradient Boosting MAE: 15.0569


Наилучшей образом показала себя модель Random Forest, так как она больше всех устойчива к мультиколлинеарности


# **5 ПОДБОР ПАРАМЕТРОВ**

In [9]:
# Список моделей + их параметров для подбора
models_with_params = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge Regression": {
        "model": Ridge(random_state=42),
        "params": {"alpha": [0.01, 0.1, 1.0, 10.0]}
    },
    "Lasso Regression": {
        "model": Lasso(random_state=42),
        "params": {"alpha": [0.001, 0.01, 0.1, 1.0]}
    },
    "ElasticNet": {
        "model": ElasticNet(random_state=42),
        "params": {
            "alpha": [0.01, 0.1, 1.0],
            "l1_ratio": [0.1, 0.5, 0.7]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": {
            "max_depth": [3, 5, 7, None],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5]
        }
    },
}

mae_scores = {}

for name, mp in models_with_params.items():
    if len(mp["params"]) == 0:
        # Если нет параметров для подбора — просто обучаем
        model = mp["model"]
        model.fit(X_train_scaled, y_train)
    else:
        # Иначе — делаем Grid Search
        grid = GridSearchCV(mp["model"], mp["params"], scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        model = grid.best_estimator_
    
    # Оценка на тестовой выборке
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores[name] = mae
    print(f"{name} MAE: {mae:.4f}")
    if len(mp["params"]) > 0:
        print(f"  Best params: {grid.best_params_}")

Linear Regression MAE: 43.3183
Ridge Regression MAE: 36.8357
  Best params: {'alpha': 10.0}
Lasso Regression MAE: 36.1344
  Best params: {'alpha': 1.0}
ElasticNet MAE: 37.2732
  Best params: {'alpha': 0.1, 'l1_ratio': 0.1}
Decision Tree MAE: 20.7129
  Best params: {'max_depth': 7, 'min_samples_split': 2}
Random Forest MAE: 13.6843
  Best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Gradient Boosting MAE: 15.0458
  Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


Random Forest показал себя лучше всех, но и все остальные алгоритмы заметно лучше справились с задачей, связано это снова с линейностью в данных, так как из условия задачи мы знаем, что SI образуется из CC50 и IC50