In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
df.head()

In [None]:
# Report the dataset dimensions; df.shape unpacks as (rows, columns).
print('Data has {} rows and {} columns.'.format(*df.shape))

In [None]:
# Annotated heatmap of the pairwise correlations between the DataFrame's columns.
plt.figure(figsize=(12, 10))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap='coolwarm',
    linecolor='white',
    linewidths=0.2,
)

#### `time` and `DEATH_EVENT` are negatively correlated: as the follow-up period (`time`) increases, a death event is less likely to occur.

#### Age and serum creatinine are positively related. With increasing age the chance of heart failure increases, and the same holds for serum creatinine.

#### Let us explore the relationship between sex and smoking in more detail.

In [None]:
# Head-count per sex; the labels printed below treat sex == 1 as male and sex == 0 as female.
sex_counts = df['sex'].value_counts()
print(' Total males:',sex_counts[1],'\n','Total females in data:',sex_counts[0],'\n','-'*60,'\n')

# Three side-by-side count plots sharing one y-axis so bar heights are directly comparable.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(12, 6))

# The plotted column is passed as the keyword `x`: newer seaborn releases treat the
# first positional argument as `data`, so a positional Series is not read as the x variable.
sns.countplot(x=df['smoking'], hue=df['sex'], ax=ax[0], color='yellow', saturation=0.5)
sns.countplot(x=df['DEATH_EVENT'], hue=df['smoking'], ax=ax[1], color='red', saturation=0.3)
sns.countplot(x=df['sex'], hue=df['DEATH_EVENT'], ax=ax[2], color='blue', saturation=0.1)
ax[0].title.set_text('Smoking and Sex Relation')
ax[1].title.set_text('Smoking and Heart Failure Relation')
ax[2].title.set_text('Heart Failure and Sex Relation')

plt.show()

### From the 1st graph we can see that smokers are significantly more often male than female. However, smoking does not appear to have much impact on heart failure, as more non-smokers are heart failure patients.

### Also, males are more likely to suffer from heart failure than females (3rd graph), but the number of deaths in both sexes can be considered the same given that the dataset is imbalanced: it has about twice as many records of males as of females.

In [None]:
# Compare the distribution of follow-up time between the two DEATH_EVENT groups.
# Both Series come from the same rows of df, so x and y are aligned row by row.
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['DEATH_EVENT'], y=df['time'])
plt.title('Follow-up time by death event')
plt.show()

### This shows an inverse relation between time and death event: as the time increases, a death event is less likely to occur.

In [None]:
# Number of patients at each age, split by outcome (DEATH_EVENT).
plt.figure(figsize=(14, 8))
sns.countplot(data=df, x='age', hue='DEATH_EVENT')
plt.xticks(rotation=45)  # tilt the age labels so they do not overlap
plt.title('Age effect on Heart Failure')
plt.show()

### The graph shows that people suffering from heart failure in their 70s and 80s are more likely to die than people in their 40s and 50s, who recovered from it.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Draw the same box plot on both panels; only the right-hand one is zoomed afterwards.
for panel, panel_title in zip(ax, ('original Image', 'zoomed In')):
    sns.boxplot(data=df, x='DEATH_EVENT', y='ejection_fraction', ax=panel)
    panel.title.set_text(panel_title)

# Restrict the right panel's y-range to 20-60.
ax[1].set_ylim([20, 60])
plt.show()

### We can see that the median ejection fraction (the centre line of each box) is lower for patients who died than for those who recovered.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: platelets as a boxen plot, split by outcome.
sns.boxenplot(data=df, x='DEATH_EVENT', y='platelets', ax=ax[0])
ax[0].title.set_text('platelets')

# Right panel: creatinine_phosphokinase as a box plot, split by outcome.
sns.boxplot(data=df, x='DEATH_EVENT', y='creatinine_phosphokinase', ax=ax[1])
ax[1].title.set_text('creatinine_phosphokinase')

plt.show()

## Training Models

In [None]:
def predict(data, model, target='DEATH_EVENT', test_size=0.10, random_state=101):
    """Fit ``model`` on a train split of ``data`` and report its hold-out accuracy.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature columns together with the ``target`` column.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    target : str, default 'DEATH_EVENT'
        Name of the label column; every other column is used as a feature.
    test_size : float or int, default 0.10
        Forwarded to ``train_test_split`` to size the hold-out set.
    random_state : int, default 101
        Seed for the train/test split so the same rows are held out on every call.

    Returns
    -------
    float
        Accuracy of ``model`` on the hold-out set. The same value is also printed.
        It is a single-split estimate, not a cross-validated one.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    # Separate features from the label column.
    X = data.drop(columns=target)
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training rows only; the hold-out rows are used solely for scoring.
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print('{} results :-'.format(model),'\n','-'*60,'\n','Accuracy_Score: ',accuracy)

    # Return the score so callers can compare models programmatically.
    return accuracy


In [None]:
# Seed the forest so the reported accuracy is the same on every run.
predict(data=df, model=RandomForestClassifier(random_state=101))

In [None]:
# SVMs are sensitive to feature scale, so standardise the features first.
# The scaler sits inside the pipeline and is therefore fitted on the training split only.
predict(data=df, model=make_pipeline(StandardScaler(), SVC()))

In [None]:
# Logistic regression with the solver's iteration cap set to 1000.
predict(model=LogisticRegression(max_iter=1000), data=df)

#### Random Forest is the clear winner here, with an accuracy of 0.98.

#### It is followed by Logistic Regression, with an accuracy of 0.87.