In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Подбор значений
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score

# **1 ИМПОРТ ДАННЫХ**

In [2]:
df = pd.read_csv('D:\mifi\myvenv\Курсовая работа\Курсовая работа\data\df_cleaned.csv')
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MinPartialCharge,...,fr_thiophene,fr_unbrch_alkane,fr_urea,pca_FpDensityMorgan,pca_PEOE,pca_SMR,pca_EState,pca_Chi,pca_VSA,pca_Kappa
0,6.239374,175.482382,28.125,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,-0.293526,...,0,3,0,-1.439069,46.495347,-76.964054,52.268508,11.369482,-21.239901,3.196119
1,0.771831,5.402819,7.0,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,-0.313407,...,0,3,0,-1.499436,60.06963,-83.878971,68.734212,12.488381,-19.896444,3.851082
2,223.808778,161.14232,0.72,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,-0.325573,...,0,3,0,-1.6635,35.848918,-82.342811,83.648215,21.180989,-20.465637,7.558456
3,1.705624,107.855654,63.235294,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,-0.293526,...,0,4,0,-1.476575,50.551237,-80.868829,56.263481,12.999417,-21.541365,4.352762
4,107.131532,139.270991,1.3,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,-0.257239,...,0,0,0,-1.539143,73.55671,-20.80081,77.569576,19.312102,-19.210464,5.877875


# **2 РАЗДЕЛЕНИЕ ДАННЫХ**

In [3]:
# Создание целевого столбца, где 1 - это больше медианы, 0 - меньше
df['SI8_Y'] = df['SI'].apply(lambda x: 1 if x > 8 else 0) 

# Шаг 1: отделяем X и y
X=df.drop(columns=['SI', 'SI8_Y'], axis=1)
y=df['SI8_Y']

# Шаг 2: делим с перемешиванием (shuffle=True по умолчанию)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# **3 НОРМАЛИЗАЦИЯ**

In [4]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **4 ПОДБОР НАИЛУЧШЕЙ МОДЕЛИ**

In [5]:
# Список моделей
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Ridge Classifier": RidgeClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(random_state=42),
}

# Обучаем и оцениваем каждую модель
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 40)

Logistic Regression Accuracy: 0.7927
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       131
           1       0.76      0.52      0.62        62

    accuracy                           0.79       193
   macro avg       0.78      0.72      0.74       193
weighted avg       0.79      0.79      0.78       193

----------------------------------------
Ridge Classifier Accuracy: 0.8031
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       131
           1       0.77      0.55      0.64        62

    accuracy                           0.80       193
   macro avg       0.79      0.74      0.75       193
weighted avg       0.80      0.80      0.79       193

----------------------------------------
Decision Tree Accuracy: 0.9378
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       131
           1       0.88      0.94      0.91        62

Gradient Boosting Accuracy: 0.9689, самый лучший результат среди всех моделей.

# **5 ПОДБОР ПАРАМЕТРОВ**

In [6]:
# Список моделей и их гиперпараметров для настройки
models_with_params = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000, random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],           # Обратная сила регуляризации
            "penalty": ['l1', 'l2'],          # Тип регуляризации
            "solver": ['liblinear']           # Совместимый solver для l1 и l2
        }
    },
    "Ridge Classifier": {
        "model": RidgeClassifier(random_state=42),
        "params": {
            "alpha": [0.1, 1, 10]             # Параметр регуляризации
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 7, None],     # Глубина дерева
            "min_samples_split": [2, 5, 10]   # Минимальное число для разделения
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],   # Число деревьев
            "max_depth": [3, 5, 7, None],     # Глубина
            "min_samples_split": [2, 5]      # Минимум для разделения
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5]
        }
    },
    "SVM": {
        "model": SVC(random_state=42),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ['linear', 'rbf']
        }
    },
}

results = {}

for name, mp in models_with_params.items():
    print(f"\n{name}")
    
    # Если нет параметров — просто обучаем
    if not mp["params"]:
        model = mp["model"]
        model.fit(X_train_scaled, y_train)
        best_params = {}
    else:
        # Настройка гиперпараметров через GridSearch
        grid = GridSearchCV(mp["model"], mp["params"], cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        model = grid.best_estimator_
        best_params = grid.best_params_
    
    # Предсказание и оценка
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    
    print(f"Best params: {best_params}")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 40)


Logistic Regression
Best params: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.9637
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       131
           1       0.98      0.90      0.94        62

    accuracy                           0.96       193
   macro avg       0.97      0.95      0.96       193
weighted avg       0.96      0.96      0.96       193

----------------------------------------

Ridge Classifier
Best params: {'alpha': 0.1}
Accuracy: 0.8031
              precision    recall  f1-score   support

           0       0.82      0.92      0.86       131
           1       0.76      0.56      0.65        62

    accuracy                           0.80       193
   macro avg       0.79      0.74      0.76       193
weighted avg       0.80      0.80      0.79       193

----------------------------------------

Decision Tree
Best params: {'max_depth': 7, 'min_samples_split': 5}
Accuracy: 0.9637
              

Gradient Boosting и Logistic Regression — оченб близкие результаты

Gradient Boosting Accuracy = 0.9689, Logistic Regression Accuracy = 0.9637

Модели почти без ошибок определяют оба класса.

Проверим стабильность обоих моделей с помощью кросс-валидации

## 5.1 Проверка с помощью кросс-валидации

In [7]:
model = GradientBoostingClassifier(learning_rate = 0.1, max_depth =  5, n_estimators = 200)
scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
print("Gradient Boosting CV Accuracy:", scores.mean())

model = LogisticRegression(C = 10, penalty = 'l1', solver = 'liblinear')
scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
print("Logistic Regression CV Accuracy:", scores.mean())

Gradient Boosting CV Accuracy: 0.9518207282913165
Logistic Regression CV Accuracy: 0.9297173414820474


Результат Gradient Boosting CV Accuracy: 0.9518207282913165 оказался более стабильным, Gradient Boosting предпочтителен для финальной версии