# Машинное обучение

## Лабораторная №2

## Модели линейной классификации и регрессии

In [98]:
import requests
url = "https://drive.google.com/uc?export=download&id=1haqJjujRq_VLp4j4DXJUcWUmp9VmANqV"
response = requests.get(url)
with open("advertising.csv", "wb") as file:
    file.write(response.content)

In [99]:
url = "https://drive.google.com/uc?export=download&id=1vbbZejzkYh6pooJM5G41CICC3GYIPfUa"
response = requests.get(url)
with open("heart.csv", "wb") as file:
    file.write(response.content)

## Часть 1. Реклама


In [100]:
from sklearn.datasets import load_breast_cancer, make_regression
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv("advertising.csv")
data.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


### 1.1 Извлечение новых признаков

In [101]:
data["total"] = data["TV"] + data["radio"] + data["newspaper"]
data.head()

Unnamed: 0,TV,radio,newspaper,sales,total
0,230.1,37.8,69.2,22.1,337.1
1,44.5,39.3,45.1,10.4,128.9
2,17.2,45.9,69.3,9.3,132.4
3,151.5,41.3,58.5,18.5,251.3
4,180.8,10.8,58.4,12.9,250.0


Ввёдем долю на каждый вид:

In [102]:
for col in ("TV", "radio", "newspaper"):
    data[f"{col}_share"] = data[f"{col}"]/data["total"]
data.head()

Unnamed: 0,TV,radio,newspaper,sales,total,TV_share,radio_share,newspaper_share
0,230.1,37.8,69.2,22.1,337.1,0.682587,0.112133,0.20528
1,44.5,39.3,45.1,10.4,128.9,0.345229,0.304888,0.349884
2,17.2,45.9,69.3,9.3,132.4,0.129909,0.346677,0.523414
3,151.5,41.3,58.5,18.5,251.3,0.602865,0.164345,0.232789
4,180.8,10.8,58.4,12.9,250.0,0.7232,0.0432,0.2336


Введём также полиномиальные признаки, отражающие взаимодействие между `TV`, `radio` и `newspaper`:

In [103]:
spheres = ("TV", "radio", "newspaper")
for i in range(len(spheres)):
    col1 = spheres[i]
    for j in range(len(spheres)):
        col2 = spheres[j]
        data[f"{col1}_{col2}"] = data[f"{col}"]*data[f"{col2}"]
data.head()

Unnamed: 0,TV,radio,newspaper,sales,total,TV_share,radio_share,newspaper_share,TV_TV,TV_radio,TV_newspaper,radio_TV,radio_radio,radio_newspaper,newspaper_TV,newspaper_radio,newspaper_newspaper
0,230.1,37.8,69.2,22.1,337.1,0.682587,0.112133,0.20528,15922.92,2615.76,4788.64,15922.92,2615.76,4788.64,15922.92,2615.76,4788.64
1,44.5,39.3,45.1,10.4,128.9,0.345229,0.304888,0.349884,2006.95,1772.43,2034.01,2006.95,1772.43,2034.01,2006.95,1772.43,2034.01
2,17.2,45.9,69.3,9.3,132.4,0.129909,0.346677,0.523414,1191.96,3180.87,4802.49,1191.96,3180.87,4802.49,1191.96,3180.87,4802.49
3,151.5,41.3,58.5,18.5,251.3,0.602865,0.164345,0.232789,8862.75,2416.05,3422.25,8862.75,2416.05,3422.25,8862.75,2416.05,3422.25
4,180.8,10.8,58.4,12.9,250.0,0.7232,0.0432,0.2336,10558.72,630.72,3410.56,10558.72,630.72,3410.56,10558.72,630.72,3410.56


### 1.2 Преобразование признаков

Применим Z-преобразование:
$$
x_{new} = \frac{x-\mu}{\sigma}
$$

Таким образом переведём каждый признак к нормальному распределению с параметрами $\mu = 0, \  \sigma = 1$, что позволит моделям, чувствительным к высоким значениям, давать лучшие результаты.

In [104]:
data = (data - data.mean())/data.std()
data.head()

Unnamed: 0,TV,radio,newspaper,sales,total,TV_share,radio_share,newspaper_share,TV_TV,TV_radio,TV_newspaper,radio_TV,radio_radio,radio_newspaper,newspaper_TV,newspaper_radio,newspaper_newspaper
0,0.967425,0.979066,1.774493,1.548168,1.465174,0.020627,-0.234639,0.188133,2.325078,1.909778,1.815599,2.325078,1.909778,1.815599,2.325078,1.909778,1.815599
1,-1.194379,1.080097,0.667903,-0.694304,-0.773892,-1.528446,1.223583,1.213809,-0.531991,1.010532,0.337304,-0.531991,1.010532,0.337304,-0.531991,1.010532,0.337304
2,-1.51236,1.524637,1.779084,-0.905135,-0.736252,-2.517146,1.539726,2.444664,-0.699315,2.512356,1.823032,-0.699315,2.512356,1.823032,-0.699315,2.512356,1.823032
3,0.051919,1.214806,1.283185,0.858177,0.542447,-0.345438,0.160358,0.383256,0.875564,1.696826,1.082314,0.875564,1.696826,1.082314,0.875564,1.696826,1.082314
4,0.393196,-0.839507,1.278593,-0.215143,0.528466,0.207114,-0.756128,0.389005,1.223761,-0.206876,1.076041,1.223761,-0.206876,1.076041,1.223761,-0.206876,1.076041


### 1.3 Отбор признаков и объектов

Посмотрим, есть ли у нас сильно зависимые признаки:

In [105]:
corr_matrix = data.corr().abs() > 0.8
corr_matrix

Unnamed: 0,TV,radio,newspaper,sales,total,TV_share,radio_share,newspaper_share,TV_TV,TV_radio,TV_newspaper,radio_TV,radio_radio,radio_newspaper,newspaper_TV,newspaper_radio,newspaper_newspaper
TV,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
radio,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
newspaper,False,False,True,False,False,False,False,False,False,True,True,False,True,True,False,True,True
sales,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False
total,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False
TV_share,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False
radio_share,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
newspaper_share,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False
TV_TV,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False
TV_radio,False,False,True,False,False,False,False,False,False,True,True,False,True,True,False,True,True


In [106]:
corr = data.corr().abs()
upper_triangle = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > 0.8).sum() > 0]
data.drop(columns=to_drop, inplace=True)

### 1.4 Разделение на обучающую и тестовую выборки



In [107]:
RANDOM_STATE = 42
sales = data["sales"]
data.drop(columns=["sales"], inplace=True)

X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(data, sales, test_size=0.2, random_state=RANDOM_STATE)

### 1.5. Регрессия с разными функциями потерь и методами регуляризации

#### 1.5.1 Функции потерь

1. **Squared Error Loss**  
Основная функция для регрессии:  
$$
\text{SquaredError}(y, \hat{y}) = \frac{1}{N} \sum_{i=1}^N (y_i - \hat{y}_i)^2
$$

2. **Huber Loss**  
Устойчива к выбросам:  
$$
\text{Huber}(y, \hat{y}) =
\begin{cases}
\frac{1}{2}(y - \hat{y})^2, & \text{если } |y - \hat{y}| \leq \delta \\
\delta \cdot |y - \hat{y}| - \frac{1}{2}\delta^2, & \text{если } |y - \hat{y}| > \delta
\end{cases}
$$

3. **Epsilon-Insensitive Loss**  
Игнорирует ошибки меньше $\epsilon$:  
$$
\text{EpsilonInsensitive}(y, \hat{y}) =
\begin{cases}
0, & |y - \hat{y}| \leq \epsilon \\
|y - \hat{y}| - \epsilon, & |y - \hat{y}| > \epsilon
\end{cases}
$$

#### 1.5.2 Регуляризация
Регуляризация уменьшает вероятность переобучения.

- **L2-регуляризация (Ridge)**:
$$
\Omega(w) = \frac{1}{2} \|w\|^2
$$
  Штрафует большие значения весов.

- **L1-регуляризация (Lasso)**:
$$
\Omega(w) = \|w\|_1
$$
  Способствует разреженности.

- **ElasticNet**: комбинация L1 и L2:
$$
\Omega(w) = \alpha \|w\|_1 + \frac{1 - \alpha}{2} \|w\|^2
$$

In [108]:
from sklearn.linear_model import SGDRegressor

# Squared Error Loss
reg_squared = SGDRegressor(loss='squared_error', penalty='l2', max_iter=1000, random_state=42)
reg_squared.fit(X_reg_train, y_reg_train)

# Huber Loss
reg_huber = SGDRegressor(loss='huber', penalty='l2', max_iter=1000, random_state=42)
reg_huber.fit(X_reg_train, y_reg_train)

# Epsilon-Insensitive Loss
reg_epsilon = SGDRegressor(loss='epsilon_insensitive', penalty='l2', max_iter=1000, random_state=42)
reg_epsilon.fit(X_reg_train, y_reg_train)

### 1.6 Найдем лучшие параметры модели

In [109]:
from sklearn.model_selection import GridSearchCV

param_grid_reg = {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01]
}

grid_reg = GridSearchCV(SGDRegressor(max_iter=1000, tol=1e-3), param_grid=param_grid_reg, cv=5)
grid_reg.fit(X_reg_train, y_reg_train)
# best_estimator_
print(f"Лучшие параметры регрессии: {grid_reg.best_params_}")

Лучшие параметры регрессии: {'alpha': 0.01, 'loss': 'epsilon_insensitive', 'penalty': 'l2'}


### 1.7 Оценка регрессии

#### **Метрики для регрессии**

1. **Mean Squared Error (MSE)**  
$$
\text{MSE} = \frac{1}{N} \sum_{i=1}^N (y_i - \hat{y}_i)^2
$$
   - Наиболее популярна, но чувствительна к выбросам.

2. **Mean Absolute Error (MAE)**  
$$
\text{MAE} = \frac{1}{N} \sum_{i=1}^N |y_i - \hat{y}_i|
$$
   - Устойчива к выбросам.

3. **$R^2$ (Коэффициент детерминации)**  
$$
R^2 = 1 - \frac{\sum_{i=1}^N (y_i - \hat{y}_i)^2}{\sum_{i=1}^N (y_i - \bar{y})^2}
$$
   - Показывает, насколько хорошо модель объясняет разброс данных.

In [110]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

best_grid_reg = grid_reg.best_estimator_

y_pred = best_grid_reg.predict(X_reg_test)
mse_squared = mean_squared_error(y_reg_test, y_pred)
mae_squared = mean_absolute_error(y_reg_test, y_pred)
r2_squared = r2_score(y_reg_test, y_pred)

print(f"MSE: {mse_squared:.2f}, MAE: {mae_squared:.2f}, R^2: {r2_squared:.2f}")

MSE: 0.04, MAE: 0.16, R^2: 0.96


### Часть 2. Сердечный заболевания




In [111]:
data1 = pd.read_csv("heart.csv")
data1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [112]:
target = data1["target"]
data1.drop(columns=["target"], inplace=True)

### 2.1 Извлечение новых признаков
One-hot кодирование категориальных признаков

In [113]:
numeric_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]
numeric_data = data1[numeric_features]

categorical = np.setdiff1d(data1.columns, numeric_features)
categorical

array(['ca', 'cp', 'exang', 'fbs', 'restecg', 'sex', 'slope', 'thal'],
      dtype=object)

In [114]:
data1_upd = numeric_data
for category in categorical:
    dummies = pd.get_dummies(data1[category], prefix=category, dtype=int)
    data1_upd = pd.concat([data1_upd, dummies], axis=1)
data1_upd.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca_0,ca_1,ca_2,ca_3,ca_4,...,restecg_2,sex_0,sex_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,145,233,150,2.3,1,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
1,37,130,250,187,3.5,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
2,41,130,204,172,1.4,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
3,56,120,236,178,0.8,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,57,120,354,163,0.6,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


### 2.2 Преобразование признаков

In [115]:
data1_upd[numeric_features] = (data1_upd[numeric_features] - data1_upd[numeric_features].mean())/data1_upd[numeric_features].std()
data1_upd

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca_0,ca_1,ca_2,ca_3,ca_4,...,restecg_2,sex_0,sex_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,0.950624,0.762694,-0.255910,0.015417,1.085542,1,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
1,-1.912150,-0.092585,0.072080,1.630774,2.119067,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
2,-1.471723,-0.092585,-0.815424,0.975900,0.310399,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
3,0.179877,-0.662770,-0.198030,1.237849,-0.206364,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,0.289984,-0.662770,2.078611,0.582975,-0.378618,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.289984,0.477601,-0.101562,-1.163356,-0.723126,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
299,-1.031296,-1.232956,0.342190,-0.770432,0.138144,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
300,1.501157,0.705675,-1.027653,-0.377507,2.032940,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
301,0.289984,-0.092585,-2.223854,-1.512623,0.138144,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1


### 2.3 Отбор признаков и объектов

In [116]:
corr_matrix = numeric_data.corr().abs() > 0.8
upper_triangle = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > 0.8).sum() > 0]
data1_upd.drop(columns=to_drop, inplace=True)

### 2.4 Разделение на обучающую и тестовую выборки



In [117]:
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(data1_upd, target, test_size=0.2, random_state=RANDOM_STATE)

### 2.5 Классификация с разными функциями потерь и методами регуляризации

#### 2.5.1 Функции потерь

1. **Perceptron Loss**  
Используется в классическом перцептроне:
$$
\text{Perceptron}(y, \hat{y}) =
\begin{cases}
0, & y \cdot \hat{y} \geq 0 \\
-y \cdot \hat{y}, & y \cdot \hat{y} < 0
\end{cases}
$$

2. **Hinge Loss**  
Популярна для SVM:  
$$
\text{Hinge}(y, \hat{y}) = \max(0, 1 - y \cdot \hat{y})
$$

3. **Squared Hinge Loss**  
Квадратичная версия `Hinge`:  
$$
\text{SquaredHinge}(y, \hat{y}) = \left( \max(0, 1 - y \cdot \hat{y}) \right)^2
$$

In [118]:
from sklearn.linear_model import SGDClassifier

# Perceptron Loss
clf_perceptron = SGDClassifier(loss='perceptron', penalty='l2', max_iter=1000, random_state=42)
clf_perceptron.fit(X_clf_train, y_clf_train)

# Hinge Loss
clf_hinge = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, random_state=42)
clf_hinge.fit(X_clf_train, y_clf_train)

# Squared Hinge Loss
clf_squared_hinge = SGDClassifier(loss='squared_hinge', penalty='l2', max_iter=1000, random_state=42)
clf_squared_hinge.fit(X_clf_train, y_clf_train)

### 2.6 Найдем лучшие параметры модели

In [119]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [120]:
from sklearn.model_selection import GridSearchCV

# Параметры для подбора
param_grid_clf = {
    'loss': ['hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': np.linspace(1e-4, 1, 100)
}

grid_clf = GridSearchCV(SGDClassifier(max_iter=1000, tol=1e-4), param_grid=param_grid_clf, cv=5)
grid_clf.fit(X_clf_train, y_clf_train)

print(f"Лучшие параметры классификации: {grid_clf.best_params_}")

Лучшие параметры классификации: {'alpha': 0.0405, 'loss': 'hinge', 'penalty': 'l1'}


### 2.7 Оценка классификации
####**Метрики для классификации**

#### Основные метрики
1. **Accuracy (Точность)**  
$$
\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
$$
   - Подходит для сбалансированных данных.

2. **Precision (Точность по классу)**  
$$
\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
$$
   - Указывает, насколько точно модель классифицирует положительный класс.

3. **Recall (Полнота)**  
$$
\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
$$
   - Показывает долю найденных положительных объектов из всех положительных.

4. **F1-Score** (Гармоническое среднее Precision и Recall):
$$
\text{F1} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
$$

5. **ROC AUC (Площадь под кривой ROC)**  
$$
\text{AUC} = \int_0^1 \text{TPR}(x) \, dx
$$
   - Удобна для оценки бинарной классификации.

In [121]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

clf = grid_clf.best_estimator_
y_pred = clf.predict(X_clf_test)
acc = accuracy_score(y_clf_test, y_pred)
prec = precision_score(y_clf_test, y_pred)
rec = recall_score(y_clf_test, y_pred)
f1 = f1_score(y_clf_test, y_pred)
auc = roc_auc_score(y_clf_test, clf.decision_function(X_clf_test))

print(f"Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}, AUC: {auc:.2f}")

Accuracy: 0.87, Precision: 0.88, Recall: 0.88, F1: 0.88, AUC: 0.94
