In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from scipy import stats

In [None]:
pip install researchpy

In [None]:
import researchpy as rp

In [None]:
# Read the heart-failure dataset into the frame used by every later cell.
csv_path = '../input/heart-failure-prediction/heart.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# One histogram per column of df, laid out on a grid with 4 plots per row.
# The row count is derived from the number of columns so that no column is
# silently skipped; 4 rows are kept as the minimum to preserve the 4x4 layout.
n_grid_cols = 4
n_grid_rows = max(4, -(-len(df.columns) // n_grid_cols))  # ceiling division

plt.figure(figsize=(15,10))
for i, col in enumerate(df.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    plt.hist(x=df[col])
    plt.title(col)
# Extra spacing so the titles do not overlap the neighbouring axes.
plt.subplots_adjust(wspace=0.3, hspace=0.5)

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# df also holds string columns (e.g. Sex, ChestPainType); recent pandas
# versions raise on those in DataFrame.corr(), so they are excluded explicitly.
plt.figure(figsize=(10,5))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

## Does age influence heart disease?

In [None]:
# Age samples of the two outcome groups:
# g1 -> rows with HeartDisease == 0, g2 -> rows with HeartDisease == 1.
g1 = df.loc[df.HeartDisease == 0, 'Age']
g2 = df.loc[df.HeartDisease == 1, 'Age']


In [None]:
# Compare Age between the two outcome groups with researchpy's t-test.
# The call returns two objects: a per-group summary and the test results.
summary, results = rp.ttest(
    group1=g1,
    group1_name='HD 0',
    group2=g2,
    group2_name='HD 1',
)

summary

In [None]:
# Interval plot: one horizontal segment per outcome group, drawn at y = group.
# Columns 5:7 of the summary presumably hold the lower/upper confidence
# bounds -- TODO confirm against the layout of researchpy's summary table.
plt.ylabel('Heart Disease')
for group in (0, 1):
    bounds = summary.iloc[group, 5:7].values
    plt.plot(bounds, [group, group], marker='o')
plt.title('Interval plot')
plt.show()


In [None]:
# Display the second object returned by rp.ttest (the test results).
results

## Which gender has the higher heart disease (HD) rate?

In [None]:
# Heart-disease (HD) rate per sex, drawn as one pie chart per group.
has_hd = df.HeartDisease == 1
is_female = df.Sex == 'F'
is_male = df.Sex == 'M'

# HD-positive counts and group sizes.
fhd1 = (has_hd & is_female).sum()
amt_f = df[is_female].HeartDisease.count()
mhd1 = (has_hd & is_male).sum()
amt_m = df[is_male].HeartDisease.count()

plt.figure(figsize=(10,10))
for pos, (sex_label, hd_cases, total) in enumerate(
        [('Female', fhd1, amt_f), ('Male', mhd1, amt_m)], start=1):
    # The slices must be disjoint for the wedge to show the real rate:
    # the group total already contains the HD cases, so plot the patients
    # without HD against the patients with HD.
    no_hd = total - hd_cases
    plt.subplot(1, 2, pos,
                title=f'{sex_label} HD rate = {hd_cases/total:.2} (Total = {total})')
    plt.pie(x=[no_hd, hd_cases],
            labels=[f'No Heart Disease = {no_hd}', f'Heart Disease = {hd_cases}'])
plt.show()

## Which chest pain type has the most HD cases?

In [None]:
# Number of heart-disease cases per chest pain type.
# Counting with a groupby keeps every bar paired with its own label instead of
# relying on the data happening to list the types in a hard-coded order.
# sort=False keeps the types in order of first appearance in the data.
hd_by_pain = (df.HeartDisease == 1).groupby(df.ChestPainType, sort=False).sum()

# Per-type counts under their original names (0 when a type is absent).
ata, nap, asy, ta = (hd_by_pain.get(kind, 0) for kind in ('ATA', 'NAP', 'ASY', 'TA'))

plt.figure(figsize=(8,8))
plt.title('Chest pain')
sns.barplot(x=hd_by_pain.index.to_numpy(), y=hd_by_pain.to_numpy())
plt.xlabel('Chest pain type')
plt.ylabel('Heart disease cases')

In [None]:
# Encode df with pd.get_dummies (indicator columns for the string-valued
# columns such as Sex and ChestPainType) and display the result.
df1 = pd.get_dummies(df)
df1

In [None]:
# Min-max scale every column of the encoded frame and wrap the resulting
# array in a DataFrame that keeps df1's column names.
# NOTE: the scaler is fit on all rows of df1, target column included.
min_max_scaler = MinMaxScaler().fit(df1)
transformed = min_max_scaler.transform(df1)
df2 = pd.DataFrame(data=transformed, columns=df1.columns)
df2

## Splitting data

In [None]:
# Separate the scaled frame into the feature matrix and the target column.
x = df2.drop(columns='HeartDisease')
y = df2['HeartDisease']

In [None]:
# Hold out a test set (train_test_split's default split size, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# x was scaled with the min/max of every row, including rows that have just
# been assigned to the test set, which leaks test information into training.
# Min-max scaling is unaffected by an earlier linear rescaling, so re-fitting
# on the training rows alone yields exactly the values that scaling the raw
# features with training-only statistics would give.
split_scaler = MinMaxScaler().fit(x_train)
x_train = pd.DataFrame(split_scaler.transform(x_train),
                       columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(split_scaler.transform(x_test),
                      columns=x_test.columns, index=x_test.index)

## Creating model

In [None]:
# Fit a logistic-regression classifier on the training split.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression displays the fitted model as the cell output.
model = LogisticRegression(random_state=0).fit(x_train, y_train)
model

In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(x_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Text report of the per-class metrics on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)