In [91]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used below can be checked by eye.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


In [92]:
# Read the heart-attack CSV from the Kaggle input mount and preview the first rows.
heart_csv_path = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
heart_csv_data = pd.read_csv(heart_csv_path)

heart_csv_data.head(5)

In [93]:
# Overview of the raw frame: its size, the distribution of the `output`
# column, and the per-column summary printed by DataFrame.info().
n_rows, n_cols = heart_csv_data.shape
print("The data set has {} instances and {} features".format(n_rows, n_cols))

# Header and counts are printed separately: passing both to a single print()
# inserted the separator space after the newline and indented the first row.
print("The count of each value equals")
print(heart_csv_data["output"].value_counts())

heart_csv_data.info()

In [94]:
# Descriptive statistics, transposed so that each column of the data is one row.
heart_csv_data.describe().transpose()

In [95]:
# Count the distinct values of every column, then split the column names into
# "discrete" (fewer than 6 distinct values) and "continuous" (6 or more).
cols_dict = {name: heart_csv_data[name].nunique() for name in heart_csv_data.columns}

# One row per column of the data, with a single `unique_count` column.
cols_pd = pd.DataFrame(cols_dict, index=["unique_count"]).T

is_discrete = cols_pd["unique_count"] < 6
disc_cols = cols_pd.index[is_discrete].tolist()
cont_cols = cols_pd.index[~is_discrete].tolist()

In [96]:
# Pairwise plots of the continuous columns, coloured by the `output` column.
cont_with_target = heart_csv_data[cont_cols + ["output"]]
sns.pairplot(cont_with_target, hue="output")

In [97]:
# Pairwise plots of the discrete columns with histograms on the diagonal.
# `output` is used as the hue, so it must be one of the columns in `disc_cols`.
disc_frame = heart_csv_data[disc_cols]
sns.pairplot(disc_frame, hue="output", diag_kind="hist")

In [98]:
# Annotated correlation heatmap of the continuous columns plus `output`.
cont_corr = heart_csv_data[cont_cols + ["output"]].corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cont_corr, annot=True, ax=ax)

In [99]:
# Annotated correlation heatmap of the columns listed in `disc_cols`.
disc_corr = heart_csv_data[disc_cols].corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(disc_corr, annot=True, ax=ax)

In [100]:
# Separate the target (`output`) from the predictor columns.
y = heart_csv_data["output"]
X = heart_csv_data.drop(columns=["output"])

# Keep only predictor names in the discrete-column list. The list is rebuilt
# rather than mutated with .remove(), so running this cell a second time does
# not raise ValueError once "output" is already gone.
disc_cols = [col for col in disc_cols if col != "output"]

In [101]:
# One-hot encode the discrete predictors; the first level of each is dropped.
X_dummy = pd.get_dummies(
    X,
    columns=disc_cols,
    drop_first=True,
)

In [102]:
# Hold out 20% of the rows for testing, stratified on the target and seeded
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummy,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Show the class counts of both splits.
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print(train_class_counts, "\n", test_class_counts)

In [103]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
cols = X_train.columns
scaler = StandardScaler()

# The scaler returns plain arrays, so the frames are rebuilt with the original
# column names AND the original row index. Without `index=` the frames would
# get a fresh 0..n-1 index while y_train / y_test keep their original labels,
# and any label-aligned operation between them would silently misalign.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)


In [104]:
def best_five(grid):
    """Return the five best-ranked candidates of a fitted grid search.

    Parameters
    ----------
    grid : object with a ``cv_results_`` mapping
        The mapping must contain the keys ``params``, ``mean_train_score``,
        ``mean_test_score`` and ``rank_test_score``.

    Returns
    -------
    pandas.DataFrame
        The first five rows after sorting by ``rank_test_score`` (ascending),
        restricted to the parameter dict and the mean train / test scores.
    """
    summary_columns = ["params", "mean_train_score", "mean_test_score"]
    ranked = pd.DataFrame(grid.cv_results_).sort_values(["rank_test_score"])
    return ranked.loc[:, summary_columns].head(5)

In [105]:
def make_grid(estimator, param_grid, cv=2):
    """Run an exhaustive grid search for ``estimator`` on the training split.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model whose hyper-parameters are searched.
    param_grid : dict or list of dict
        Candidate hyper-parameter values, passed unchanged to ``GridSearchCV``.
    cv : int or cross-validation splitter, default=2
        Cross-validation strategy forwarded to ``GridSearchCV``. The default
        keeps the 2-fold setting this notebook has always used.

    Returns
    -------
    GridSearchCV
        The search object after fitting on the notebook-level ``X_train`` and
        ``y_train``. ``refit=True`` and ``return_train_score=True`` are set,
        so the object can predict directly and its ``cv_results_`` includes
        the ``mean_train_score`` column.
    """
    grid = GridSearchCV(
        estimator,
        param_grid=param_grid,
        cv=cv,
        refit=True,
        return_train_score=True,
    )

    # Reads the training split from the notebook's global namespace.
    grid.fit(X_train, y_train)
    return grid



In [106]:
# Hyper-parameter search for the support vector classifier.
svc_c_values = [0.001, 0.01, 0.1, 1, 10, 100]
svc_gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

# `gamma` only affects the rbf / poly / sigmoid kernels. Crossing it with the
# linear kernel fitted five identical models per C value and filled the
# top-five table with duplicates, so the linear kernel gets its own sub-grid
# without `gamma`.
svc_param_grid = [
    {"kernel": ["linear"], "C": svc_c_values},
    {"kernel": ["rbf", "sigmoid"], "C": svc_c_values, "gamma": svc_gamma_values},
]

svc_grid = make_grid(SVC(random_state=42), svc_param_grid)

best_five(svc_grid)

In [107]:
# Hyper-parameter search for logistic regression over `C` and the
# iteration limit.
lr_param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=[100, 200, 500],
)

lr_estimator = LogisticRegression(random_state=42)
lr_grid = make_grid(lr_estimator, lr_param_grid)

best_five(lr_grid)

In [108]:
# Hyper-parameter search for a single decision tree over depth, the number of
# features considered per split, and the minimum leaf size.
dt_param_grid = dict(
    max_depth=[2, 3, 4, 5, 6, 7],
    max_features=["sqrt", "log2"],
    min_samples_leaf=[1, 2, 3],
)

dt_estimator = DecisionTreeClassifier(random_state=42)
dt_grid = make_grid(dt_estimator, dt_param_grid)

best_five(dt_grid)

In [109]:
# Hyper-parameter search for the random forest over tree depth, minimum leaf
# size and the number of trees.
rf_param_grid = {
    "max_depth": [2, 3, 4, 5, 6, 7],
    "min_samples_leaf":[1, 2, 3],
    "n_estimators":[100, 200, 500]
}

rf_grid = make_grid(RandomForestClassifier(random_state=42), rf_param_grid)

# Show the forest's own results. This cell previously displayed `dt_grid`,
# so the random forest scores were never shown.
best_five(rf_grid)


In [110]:
# Final evaluation on the held-out test split. The model named below is
# written by hand; it is not derived from the grids' scores in this cell.
lr_test_predictions = lr_grid.predict(X_test)

print("Best model: Logistic Regression")
print("Best parameters", lr_grid.best_params_)
print(classification_report(y_test, lr_test_predictions))