In [82]:
import pandas as pd
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Преобразуем категориальные переменные

In [83]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ChestPainType'] = le.fit_transform(df['ChestPainType'])
df['RestingECG'] = le.fit_transform(df['RestingECG'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(5), int64(6)
memory usage: 68.3 KB


In [84]:
from sklearn.model_selection import train_test_split
y = df['HeartDisease']
X = df.drop('HeartDisease',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

Обучаем логистическую регрессию

In [86]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train_transformed, y_train)

In [87]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X_test_transformed, y_test, cv=10, scoring=['accuracy','recall','precision','f1'])
for key, value in scores.items():
    a1 = round(value.mean(),3)
    print(key, a1)

fit_time 0.006
score_time 0.003
test_accuracy 0.81
test_recall 0.841
test_precision 0.837
test_f1 0.837


Подбираем параметры для логистической регрессии

In [88]:
from sklearn.model_selection import GridSearchCV
grid_params = {'C': [0.25,0.5,0.75,1], 'intercept_scaling': [0,1,2,3]}
grid = GridSearchCV(LogisticRegression(random_state=0), grid_params, cv= 10, n_jobs=-1, verbose=True)
grid.fit(X_train_transformed, y_train)
grid.best_params_

Fitting 10 folds for each of 16 candidates, totalling 160 fits


{'C': 0.25, 'intercept_scaling': 0}

In [89]:
clf = LogisticRegression(C=0.25, intercept_scaling=0, random_state=0).fit(X_train_transformed, y_train)
scores1 = cross_validate(clf, X_test_transformed, y_test, cv=10, scoring=['accuracy','recall','precision','f1'])
for key, value in scores1.items():
    print(key, round(value.mean(),3))

fit_time 0.003
score_time 0.006
test_accuracy 0.82
test_recall 0.841
test_precision 0.853
test_f1 0.845


In [90]:
from sklearn.model_selection import RandomizedSearchCV
rand = RandomizedSearchCV(LogisticRegression(random_state=0), grid_params, cv= 10, n_jobs=-1, verbose=True)
rand.fit(X_train_transformed, y_train)
rand.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


{'intercept_scaling': 1, 'C': 0.25}

In [91]:
clf = LogisticRegression(C=0.25, intercept_scaling=1, random_state=0).fit(X_train_transformed, y_train)
scores2 = cross_validate(clf, X_test_transformed, y_test, cv=10, scoring=['accuracy','recall','precision','f1'])
for key, value in scores2.items():
    a2 = round(value.mean(),3)
    print(key, a2)

fit_time 0.004
score_time 0.004
test_accuracy 0.82
test_recall 0.841
test_precision 0.853
test_f1 0.845


Подбираем параметры и обучаем SVC

In [92]:
from sklearn.svm import SVC
svc_params = {'C': [1, 2, 3], 'kernel':['linear']}
svc_grid = RandomizedSearchCV(SVC(), svc_params, cv= 5, n_jobs=-1, verbose=True)
svc_grid.fit(X_train_transformed, y_train)
svc_grid.best_params_



Fitting 5 folds for each of 3 candidates, totalling 15 fits


{'kernel': 'linear', 'C': 2}

In [93]:
svc = SVC(C=2, kernel='linear')
scores3 = cross_validate(svc, X_test_transformed, y_test, cv=10, scoring=['accuracy','recall','precision','f1'])
for key, value in scores3.items():
    a3 = round(value.mean(),3)
    print(key, a3)

fit_time 0.005
score_time 0.002
test_accuracy 0.842
test_recall 0.841
test_precision 0.889
test_f1 0.861


Подбираем параметры и обучаем случайный лес

In [94]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=17)
forest_params = {'max_depth': range(1,15),'max_features': range(4,11)}
forest_grid = RandomizedSearchCV(forest, forest_params, cv=5, n_jobs=-1, verbose=True)
forest_grid.fit(X_train_transformed, y_train)
forest_grid.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'max_features': 5, 'max_depth': 5}

In [100]:
forest = RandomForestClassifier(n_estimators=100, max_depth=16, max_features=6, n_jobs=-1, random_state=17)
scores4 = cross_validate(forest, X_test_transformed, y_test, cv=10, scoring=['accuracy','recall','precision','f1'])
for key, value in scores4.items():
    a4 = round(value.mean(),3)
    print(key, a4)

fit_time 0.127
score_time 0.019
test_accuracy 0.908
test_recall 0.925
test_precision 0.923
test_f1 0.922


In [101]:
f1 = pd.DataFrame([['Логистическая регрессия без оптимизации',a1],
                  ['Логистическая регрессия с оптимизацией',a2],
                  ['SVM с оптимизацией',a3],
                  ['Случайный лес с оптимизацией',a4]],
columns=['Модель','f1-score'])
f1

Unnamed: 0,Модель,f1-score
0,Логистическая регрессия без оптимизации,0.837
1,Логистическая регрессия с оптимизацией,0.845
2,SVM с оптимизацией,0.861
3,Случайный лес с оптимизацией,0.922


Лучший результат показал случайный лес. Поскольку в дз "Ансамблирование" аналогично подбирались оптимальные параметры алгоритмов, результат там был такой же.