### Задание

#### 1. Получите данные и загрузите их в рабочую среду.

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
import warnings
# Suppress every warning for the rest of the session; note that this also hides
# any warnings emitted by the model fits and parameter searches further down.
warnings.filterwarnings('ignore')

In [3]:
# Read the dataset from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


#### 2. Подготовьте датасет к обучению моделей: категориальные переменные переведите в цифровые значения (можно использовать `pd.get_dummies`, `preprocessing.LabelEncoder`).

In [4]:
# Encode the two binary categorical columns as integer codes.
# Each column is fitted and transformed in a single step, so the encoder state
# always matches the column being transformed and the cell can be re-run safely
# (previously the fit and the transform lived in separate cells sharing one
# encoder, so re-running a transform after the encoder had been refitted on the
# other column failed on unseen labels).
le = LabelEncoder()
for binary_col in ('Sex', 'ExerciseAngina'):
    df[binary_col] = le.fit_transform(df[binary_col])

In [9]:
# Check that Sex and ExerciseAngina now hold integer codes
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [10]:
# One-hot encode the remaining multi-valued categorical columns
df = pd.get_dummies(
    data=df,
    columns=['ChestPainType', 'RestingECG', 'ST_Slope'],
)

In [11]:
# Check the indicator columns produced by the one-hot encoding
df.head(n=5)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1


#### 3. Разделите выборку на обучающее и тестовое подмножество. 80% данных оставить на обучающее множество, 20% на тестовое.

In [14]:
# Hold out 20% of the rows for testing; the target column is HeartDisease
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='HeartDisease'),
    df['HeartDisease'],
    test_size=0.2,
    random_state=42,
)

#### 4. Обучите модель логистической регрессии с параметрами по умолчанию.

In [15]:
# Baseline: logistic regression with all hyperparameters left at their defaults
model_dflt = LogisticRegression()

In [16]:
# Fit the baseline model on the training split
model_dflt.fit(X=X_train, y=y_train)

#### 5. Подсчитайте основные метрики модели. Используйте следующие метрики и функцию: `cross_validate` (…, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1']).

In [18]:
# 10-fold cross-validation of the baseline model on the training split
cv_results_dflt = cross_validate(
    estimator=model_dflt,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)

In [19]:
# Print the fold-averaged value of each metric
for caption, score_key in (
    ('Accuracy:', 'test_accuracy'),
    ('Recall:', 'test_recall'),
    ('Precision:', 'test_precision'),
    ('F1 Score:', 'test_f1'),
):
    print(caption, cv_results_dflt[score_key].mean())

Accuracy: 0.8649944465012958
Recall: 0.8927439024390245
Precision: 0.8666120052341206
F1 Score: 0.8790306118551247


#### 6. Оптимизируйте 3-4 параметра модели:

##### a) Используйте `GridSearchCV`.

In [20]:
# Search space for LogisticRegression, split into two sub-grids so that every
# candidate is a valid solver/penalty pair. A single combined grid pairs
# penalty='l1' with solvers that reject it; those fits fail, and a randomized
# search that happens to draw only such candidates aborts.
params = [
    # Solvers that support both the L1 and the L2 penalty
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'max_iter': [30, 50, 100, 150],
        'solver': ['liblinear', 'saga'],
    },
    # Solvers that support the L2 penalty only
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l2'],
        'max_iter': [30, 50, 100, 150],
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
    },
]

In [22]:
# Exhaustive search over the parameter grid with 10-fold cross-validation
grid_search = GridSearchCV(estimator=model_dflt, param_grid=params, cv=10)

In [23]:
# Run the grid search on the training split
grid_search.fit(X=X_train, y=y_train)

In [24]:
# Parameter combination selected by the grid search
best_params = grid_search.best_params_
print('Best Parameters:', best_params)

Best Parameters: {'C': 10, 'max_iter': 30, 'penalty': 'l1', 'solver': 'liblinear'}


In [26]:
# Build the tuned model from the parameters the grid search actually selected,
# rather than copying them by hand from a printed output that can go stale
model_gs = LogisticRegression(**grid_search.best_params_)

In [27]:
# Fit the grid-search-tuned model on the training split
model_gs.fit(X=X_train, y=y_train)

In [28]:
# 10-fold cross-validation of the grid-search-tuned model
cv_results_gs = cross_validate(
    estimator=model_gs,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)

In [29]:
# Print the fold-averaged value of each metric
for caption, score_key in (
    ('Accuracy:', 'test_accuracy'),
    ('Recall:', 'test_recall'),
    ('Precision:', 'test_precision'),
    ('F1 Score:', 'test_f1'),
):
    print(caption, cv_results_gs[score_key].mean())

Accuracy: 0.8677156608663458
Recall: 0.8952439024390243
Precision: 0.8689618344557593
F1 Score: 0.8813791273216502


##### b) Используйте `RandomizedSearchCV`.

In [30]:
# Evaluate 3 randomly drawn candidates from the search space with 10-fold CV.
# The fixed seed makes the draw, and therefore the reported best parameters,
# repeatable across kernel restarts.
random_search = RandomizedSearchCV(model_dflt, params, cv=10, n_iter=3, random_state=42)

In [31]:
# Run the randomized search on the training split
random_search.fit(X=X_train, y=y_train)

In [32]:
# Parameter combination selected by the randomized search
best_params = random_search.best_params_
print('Best Parameters:', best_params)

Best Parameters: {'solver': 'newton-cholesky', 'penalty': 'l2', 'max_iter': 30, 'C': 0.1}


In [33]:
# Build the tuned model from the parameters the randomized search actually
# selected; values copied by hand from an earlier run would not match a rerun
model_rs = LogisticRegression(**random_search.best_params_)

In [34]:
# Fit the randomized-search-tuned model on the training split
model_rs.fit(X=X_train, y=y_train)

In [35]:
# 10-fold cross-validation of the randomized-search-tuned model
cv_results_rs = cross_validate(
    estimator=model_rs,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)

In [36]:
# Print the fold-averaged value of each metric
for caption, score_key in (
    ('Accuracy:', 'test_accuracy'),
    ('Recall:', 'test_recall'),
    ('Precision:', 'test_precision'),
    ('F1 Score:', 'test_f1'),
):
    print(caption, cv_results_rs[score_key].mean())

Accuracy: 0.8609033691225472
Recall: 0.8927439024390245
Precision: 0.8602471069761425
F1 Score: 0.8756730397182837


##### c) Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.

In [38]:
# Candidate model families for the randomized search, each paired with its search space
models = [
    # LogisticRegression uses two sub-grids so that every sampled candidate is a
    # valid solver/penalty pair: only liblinear and saga accept the L1 penalty
    (LogisticRegression(), [
        {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'max_iter': [30, 50, 100, 150],
         'solver': ['liblinear', 'saga']},
        {'C': [0.1, 1, 10, 100], 'penalty': ['l2'], 'max_iter': [30, 50, 100, 150],
         'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']},
    ]),

    (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'criterion': ['gini', 'entropy'],
                                'max_depth': [None, 5, 10], 'random_state': [42]}),

    # random_state is pinned here as well, matching the random forest, so the
    # fitted tree does not vary between runs
    (DecisionTreeClassifier(), {'max_depth': [None, 5, 10], 'criterion': ['gini', 'entropy'],
                                'min_samples_split': [2, 5, 10], 'random_state': [42]}),

    (KNeighborsClassifier(), {'n_neighbors': list(range(1, 20)), 'weights': ['uniform', 'distance'],
                              'p': [1, 2, 3]}),
]

In [39]:
# Run a small randomized search for every model family and keep the overall winner
best_model = None
best_parameters = {}
best_score = 0

# The loop variables have their own names so they do not overwrite the `params`
# grid and the `random_search` object created in the earlier sections.
for candidate_model, candidate_grid in models:
    # Fixed seed: the three sampled candidates are the same on every run
    model_search = RandomizedSearchCV(candidate_model, candidate_grid, cv=10, n_iter=3,
                                      random_state=42)
    model_search.fit(X_train, y_train)
    current_score = model_search.best_score_

    # Keep the estimator whose best mean CV score is the highest seen so far
    if current_score > best_score:
        best_score = current_score
        best_model = model_search.best_estimator_
        best_parameters = model_search.best_params_

In [40]:
# Apply the winning parameters to the winning estimator and fit it on the training split
best_model.set_params(**best_parameters)
model_with_best_score = best_model
model_with_best_score.fit(X_train, y_train)

In [41]:
# Compute the required metrics with 10-fold cross-validation
cv_results_best_model = cross_validate(
    estimator=model_with_best_score,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)

# Print the fold-averaged value of each metric
for caption, score_key in (
    ('Accuracy:', 'test_accuracy'),
    ('Recall:', 'test_recall'),
    ('Precision:', 'test_precision'),
    ('F1 Score:', 'test_f1'),
):
    print(caption, cv_results_best_model[score_key].mean())

Accuracy: 0.8691040355423917
Recall: 0.9201829268292684
Precision: 0.8549863916911971
F1 Score: 0.8854446368216685


#### 7. Сформулируйте выводы по проделанной работе.

In [45]:
# Collect the mean cross-validation metrics of every model into one comparison table.
# Row label -> cross_validate result for that model
cv_results_by_model = {
    'LR-model default parameters': cv_results_dflt,
    'LR-model GSCV optimized': cv_results_gs,
    'LR-model RSCV optimized': cv_results_rs,
    'Best model RSCV optimized': cv_results_best_model,
}
# Table column -> key in the cross_validate result
metric_columns = {
    'Accuracy': 'test_accuracy',
    'Recall': 'test_recall',
    'Precision': 'test_precision',
    'F1 Score': 'test_f1',
}

data = {'Model': list(cv_results_by_model)}
for column, score_key in metric_columns.items():
    data[column] = [results[score_key].mean() for results in cv_results_by_model.values()]

# The table gets its own name: `df` holds the encoded dataset, and overwriting it
# would break any re-run of the train/test split cell.
results_summary = pd.DataFrame(data).sort_values(by='Accuracy', ascending=False, ignore_index=True)

print(results_summary)

                         Model  Accuracy    Recall  Precision  F1 Score
0    Best model RSCV optimized  0.869104  0.920183   0.854986  0.885445
1      LR-model GSCV optimized  0.867716  0.895244   0.868962  0.881379
2  LR-model default parameters  0.864994  0.892744   0.866612  0.879031
3      LR-model RSCV optimized  0.860903  0.892744   0.860247  0.875673


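Видим, что оптимизация параметров моделей машинного обучения работает, но прирост качества невелик: по accuracy лучшая модель из п. 6c (0.869) и логистическая регрессия после `GridSearchCV` (0.868) лишь немного превосходят модель с параметрами по умолчанию (0.865), тогда как `RandomizedSearchCV` с `n_iter=3` дал результат чуть хуже базового (0.861), поскольку перебирает всего три случайные комбинации параметров.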