In [154]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

# Importing data

In [155]:
data = pd.read_csv('./heart.csv')

In [156]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Training various models

In [157]:
# Setting cross-validation and train-test-split
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)
X_train, X_valid, y_train, y_valid = train_test_split(data[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal']], data['target'], test_size=0.3)

In [158]:
# KNN
knn = KNeighborsClassifier(n_neighbors=100, n_jobs=1)
cv_score_knn = cross_val_score(knn, X_train, y_train, cv=skf, n_jobs=-1)
print('CV score:', np.round(np.mean(cv_score_knn), decimals=3))
#knn_valid_score = knn.score(X_valid, y_valid)

CV score: 0.566


In [159]:
#LogisticRegression
logreg = LogisticRegression(penalty = 'l2')
cv_score_logreg = cross_val_score(logreg, X_train, y_train, cv=skf, n_jobs=-1)
print('CV score:', np.round(np.mean(cv_score_logreg), decimals=3))

CV score: 0.816


In [160]:
# Логистическая регрессия с использованием стохастического градиентного спуска
sgd_logit = SGDClassifier(loss = 'log',n_jobs = -1, max_iter=3)
cv_score_sgd_logit = cross_val_score(sgd_logit, X_train, y_train, cv=skf, n_jobs=-1)
print('CV score:', np.round(np.mean(knn_cv_score), decimals=3))

CV score: 0.594


In [161]:
# Линейный SVM c C = 1
svm = LinearSVC(C = 1.0, random_state=17)
cv_score_svm = cross_val_score(svm, X_train, y_train, cv=skf, n_jobs=-1)
print('CV Score:', np.round(np.mean(cv_score_svm), decimals = 3))

CV Score: 0.661


In [162]:
# Decision Tree
clf = DecisionTreeClassifier()
#clf = clf.fit(X_train, y_train)
cv_score_clf = cross_val_score(clf, X_train, y_train, cv=skf, n_jobs=-1)
print('CV Score:', np.round(np.mean(cv_score_clf), decimals = 3))

CV Score: 0.712


In [163]:
# RandomForest
forest = RandomForestClassifier(n_estimators=50)
cv_scores_rf = cross_val_score(forest, X_train, y_train, cv=skf)
print('CV Score:', np.round(np.mean(cv_scores_rf), decimals = 3))

CV Score: 0.831


# Accessing best model's quality on test data

In [164]:
forest.fit(X_train, y_train)
y_pred = forest.predict(X_valid)
accuracy_score(y_valid, y_pred)

0.8131868131868132

In [165]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_valid)
accuracy_score(y_valid, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8571428571428571

In [166]:
# На тестовых данных лучше работает логистическая регрессия

# Как связаны болезни сердца и содержание сахара в крови?

In [168]:
logreg.coef_

array([[ 0.02405191, -1.31001585,  0.65654825, -0.00990938, -0.00586545,
        -0.15330677,  0.13677533,  0.02766371, -0.77730605, -0.3518421 ,
         0.67757279, -0.8384074 , -0.9383366 ]])

In [169]:
logreg.coef_[0][5]

-0.15330677246836505

In [170]:
# Cудя по значению коэффициента в модели логистической регрессии, 
# наличие повышенного blood sugar > 120 mg/dl немного снижает вероятность диагноза заболевания сердца.

In [171]:
# Если посмотреть на данные:
data.groupby('fbs').target.mean()

fbs
0    0.550388
1    0.511111
Name: target, dtype: float64

In [None]:
# У пациентов без заболевания сердца наличие повышенного blood sugar > 120 mg/dl встречается чаще