# Линейная и логистическая регрессия
- `dataset/heart_disease_uci.csv` - датасет для задачи классификации: необходимо предсказать, болен ли пациент больезнью сердца;
- `dataset/advertising.csv` - датасет для задачи регрессии: необходимо предсказать затраты на продажи.

## Определим бейзлайн для логистической и линейной регрессии

In [138]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

heart_df = pd.read_csv('dataset/heart_disease_uci.csv')

heart_df['target'] = (heart_df['num'] > 0).astype(int)
heart_df = heart_df.drop(columns=['id', 'dataset', 'num'], axis=1)

In [139]:
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,1
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [140]:
X_class = heart_df.drop(['target'], axis=1)
y_class = heart_df['target']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

In [141]:
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numeric_features = [col for col in X_class.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

logreg = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', LogisticRegression(max_iter=10000))])

logreg.fit(X_train_c, y_train_c)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [142]:
y_pred_c = logreg.predict(X_test_c)
print('Classification Baseline:')
print('Accuracy:', accuracy_score(y_test_c, y_pred_c))
print('Precision:', precision_score(y_test_c, y_pred_c))
print('Recall:', recall_score(y_test_c, y_pred_c))
print('F1:', f1_score(y_test_c, y_pred_c))

Classification Baseline:
Accuracy: 0.7989130434782609
Precision: 0.8529411764705882
Recall: 0.7981651376146789
F1: 0.8246445497630331


In [143]:
adv_df = pd.read_csv('dataset/advertising.csv')

X_reg = adv_df.drop('Sales', axis=1)
y_reg = adv_df['Sales']

lr = LinearRegression()
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
lr.fit(X_train_r, y_train_r)

y_pred_r = lr.predict(X_test_r)
print('Regression Baseline:')
print('MSE:', mean_squared_error(y_test_r, y_pred_r))
print('R2:', r2_score(y_test_r, y_pred_r))

Regression Baseline:
MSE: 2.9077569102710896
R2: 0.9059011844150826


### Попробуем улучшить бейзлайн 
В качестве улучшений можно использовать заполнение пропусков усредненными значениями, балансировку классов через oversample и подбор лучших параметров черзе grid search. 

In [144]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

numeric_transformer_imp = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())
])

categorical_transformer_imp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_imp = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_imp, numeric_features),
        ('cat', categorical_transformer_imp, categorical_features)
    ])

logreg_imp = ImbPipeline(steps=[
    ('preprocessor', preprocessor_imp),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=10000))
])

parameters = {'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'classifier__penalty':['elasticnet', 'l1', 'l2'],
              'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100]}
grid_c = GridSearchCV(logreg_imp, parameters, cv=5, scoring='f1')
grid_c.fit(X_train_c, y_train_c)

240 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/bulat/mai/ai/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/bulat/mai/ai/venv/lib/python3.12/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bulat/mai/ai/venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 526, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/bulat/mai/ai/venv/lib/p

0,1,2
,estimator,Pipeline(step...iter=10000))])
,param_grid,"{'classifier__C': [0.001, 0.01, ...], 'classifier__penalty': ['elasticnet', 'l1', ...], 'classifier__solver': ['newton-cg', 'lbfgs', ...]}"
,scoring,'f1'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,10000


In [145]:
grid_c.best_params_

{'classifier__C': 1,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

In [146]:
y_pred_c_imp = grid_c.predict(X_test_c)
print('Improved Classification:')
print('Accuracy:', accuracy_score(y_test_c, y_pred_c_imp))
print('Precision:', precision_score(y_test_c, y_pred_c_imp))
print('Recall:', recall_score(y_test_c, y_pred_c_imp))
print('F1:', f1_score(y_test_c, y_pred_c_imp))

Improved Classification:
Accuracy: 0.8152173913043478
Precision: 0.8787878787878788
Recall: 0.7981651376146789
F1: 0.8365384615384616


In [147]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_reg_poly = poly.fit_transform(X_reg)

X_train_rp, X_test_rp, y_train_r, y_test_r = train_test_split(X_reg_poly, y_reg, test_size=0.2, random_state=42)

ridge = Ridge()
param_grid_r = {'alpha': [0.1, 1, 10]}
grid_r = GridSearchCV(ridge, param_grid_r, cv=5)
grid_r.fit(X_train_rp, y_train_r)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': [0.1, 1, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,10
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [148]:
grid_r.best_params_

{'alpha': 10}

In [149]:
y_pred_r_imp = grid_r.predict(X_test_rp)
print('Improved Regression:')
print('MSE:', mean_squared_error(y_test_r, y_pred_r_imp))
print('R2:', r2_score(y_test_r, y_pred_r_imp))

Improved Regression:
MSE: 1.4426446100683141
R2: 0.9533141341224645


### Реализуем алгоритмы ML самостоятельно

In [150]:
import numpy as np

class CustomLogisticRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.lr = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = 0.0
        self.cost_history = []

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def cost(self, h, y):
        m = len(y)
        return - (1.0/m) * np.sum(y*np.log(h) + (1.0-y)*np.log(1.0-h))

    def fit(self, X, y):
        m, n = X.shape
        self.weights = np.zeros(n, dtype=np.float128)

        for _ in range(self.iterations):
            z = np.dot(X, self.weights) + self.bias
            h = self.sigmoid(z)

            dw = (1.0/m) * np.dot(X.T, (h - y))
            db = (1.0/m) * np.sum(h - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            self.cost_history.append(self.cost(h, y))

    def predict(self, X):
        return (self.sigmoid(np.dot(X, self.weights) + self.bias) >= 0.5).astype(int)
    

X_train_c_num = preprocessor.fit_transform(X_train_c)
X_test_c_num = preprocessor.transform(X_test_c) 

logreg_scr = CustomLogisticRegression(learning_rate=0.01, iterations=1000)
logreg_scr.fit(X_train_c_num, y_train_c)

  return - (1.0/m) * np.sum(y*np.log(h) + (1.0-y)*np.log(1.0-h))


In [151]:
y_pred_c_scr = logreg_scr.predict(X_test_c_num)
print('Scratch Classification:')
print('Accuracy:', accuracy_score(y_test_c, y_pred_c_scr))
print('Precision:', precision_score(y_test_c, y_pred_c_scr))
print('Recall:', recall_score(y_test_c, y_pred_c_scr))
print('F1:', f1_score(y_test_c, y_pred_c_scr))

Scratch Classification:
Accuracy: 0.5380434782608695
Precision: 0.9
Recall: 0.24770642201834864
F1: 0.38848920863309355


In [152]:
class LinearRegressionCustom:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        self.scaler = StandardScaler()

    def fit(self, X, y):
        X = self.scaler.fit_transform(X)
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iters):
            y_pred = np.dot(X, self.weights) + self.bias
            dw = (1.0 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1.0 / n_samples) * np.sum(y_pred - y)
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        X = self.scaler.transform(X)
        return np.dot(X, self.weights) + self.bias

lr_scr = LinearRegressionCustom(lr=0.01, n_iters=1000)
lr_scr.fit(X_train_r.values, y_train_r)

In [153]:
y_pred_r_scr = lr_scr.predict(X_test_r.values)
print('Scratch Regression:')
print('MSE:', mean_squared_error(y_test_r, y_pred_r_scr))
print('R2:', r2_score(y_test_r, y_pred_r_scr))

Scratch Regression:
MSE: 2.9085298277597373
R2: 0.90587617179454


### Улучшим имплементации техниками, примененными к бейзлайну

In [154]:
X_train_c_imp = preprocessor_imp.fit_transform(X_train_c)
X_test_c_imp = preprocessor_imp.transform(X_test_c)

logreg_scr.fit(X_train_c_imp, y_train_c)

In [155]:
y_pred_c_scr_imp = logreg_scr.predict(X_test_c_imp)
print('Scratch Improved Classification:')
print('Accuracy:', accuracy_score(y_test_c, y_pred_c_scr_imp))
print('Precision:', precision_score(y_test_c, y_pred_c_scr_imp))
print('Recall:', recall_score(y_test_c, y_pred_c_scr_imp))
print('F1:', f1_score(y_test_c, y_pred_c_scr_imp))

Scratch Improved Classification:
Accuracy: 0.8478260869565217
Precision: 0.900990099009901
Recall: 0.8348623853211009
F1: 0.8666666666666667


In [157]:
lr_scr.fit(X_train_rp, y_train_r)

y_pred_r_scr_imp = lr_scr.predict(X_test_rp)
print('Scratch Regression:')
print('MSE:', mean_squared_error(y_test_r, y_pred_r_scr_imp))
print('R2:', r2_score(y_test_r, y_pred_r_scr_imp))

Scratch Regression:
MSE: 2.6406463683714776
R2: 0.9145452308050044


### Выводы
#### По задаче классификации
|Модель            |Accuracy|Precision|Recall|F1-score|
|------------------|--------|---------|------|--------|
|Baseline (sklearn)| 0.8    | 0.85    | 0.8  | 0.82   |
|Improved (sklearn)| 0.82   | 0.88    | 0.8  | 0.84   |
|Baseline (custom) | 0.54   | 0.9     | 0.25 | 0.39   |
|Improved (custom) | 0.85   | 0.9     | 0.83 | 0.87   |

В случае кастомного бейзлайна плохие показатели метрик были следствием дисбаланса классов и числовых признаков, из-за чего происходило переполнение мат. выражений. После разрешения дисбаланса, показатели метрик значительно выросли.

#### По задаче регрессии
|Модель            |MSE |R2  |
|------------------|----|----|
|Baseline (sklearn)|2.91|0.91|
|Improved (sklearn)|1.44|0.95|
|Baseline (custom) |2.91|0.91|
|Improved (custom) |2.64|0.91|

В случае задачи регрессии удалось улучшить показатели метрик за счет использования полиномиальных фичей.
