In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the heart-disease indicators CSV from the Kaggle input directory,
# then preview the first rows.
csv_path = '/kaggle/input/personal-key-indicators-of-heart-disease/heart_2020_cleaned.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df.head()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Keep an independent copy of the frame as loaded; later cells modify `df` itself
# (duplicate removal, label encoding), which does not affect this copy.
data = df.copy()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows from `df` in place (the frame itself is modified; the
# original row index labels are kept), then show the remaining (rows, columns).
df.drop_duplicates(inplace=True)
df.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every non-numeric column with integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so each
# column's code -> original-label mapping stays available afterwards
# (e.g. label_encoders[col].classes_ or .inverse_transform(...)). Reusing one
# encoder for all columns would only retain the mapping of the last column fitted.
# Selecting by "not numeric" rather than `dtype == 'object'` also covers pandas'
# dedicated string dtype, which does not compare equal to 'object'.
# Re-running this cell is a no-op: once encoded, no non-numeric columns remain.
label_encoders = {}
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [None]:
# Class distribution of the target column (values are integer codes if the
# column was label-encoded by the preceding cell).
df['HeartDisease'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Pairwise column correlations drawn as an annotated heatmap on a wide figure.
corr_matrix = df.corr()
plt.figure(figsize=(20, 7))
sns.heatmap(corr_matrix, annot=True, cmap="Blues")

In [None]:
# One box plot per column, arranged on a 5x5 grid of subplots.
df.plot.box(subplots=True, figsize=(10, 10), layout=(5, 5))

## Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

sc = StandardScaler()

# Features are every column except the target; the target is 'HeartDisease'.
x = df.drop(['HeartDisease'], axis = 1)
y = df['HeartDisease']

# 80/20 train/test split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Calling fit_transform on the test split would
# refit the scaler on test data, so train and test would be standardised with
# different means/variances and the evaluation would use test-set statistics.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: default logistic regression trained on the scaled training split
# and scored by accuracy on the test split.
logreg = LogisticRegression()
y_pred = logreg.fit(x_train, y_train).predict(x_test)

accuracy_score(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest baseline (5 shallow trees).
# random_state is fixed so bootstrap sampling and feature selection - and
# therefore the reported accuracy - are the same on every run.
# NOTE(review): max_features=17 presumably equals the number of feature columns
# (i.e. every split considers all features) - confirm against x.shape[1].
rf = RandomForestClassifier(n_estimators=5, max_depth=5, max_features=17, random_state=42)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

accuracy_score(y_test, rf_pred)

# Handling Techniques

## Resampling data

![image.png](attachment:2e78a38a-7de0-4a1d-acd3-65781c232caa.png)

### Random Undersampling

**Randomly removing samples from the majority class to balance the dataset.**

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split until the classes
# are balanced. The seed is fixed so the same rows are kept on every run and the
# downstream accuracy is reproducible.
# (sampling_strategy is left at its default; sampling_strategy=1 is an option to try.)
rus = RandomUnderSampler(random_state=42)
x_res, y_res = rus.fit_resample(x_train, y_train)

In [None]:
# Class counts in the training split before random undersampling
y_train.value_counts()

In [None]:
# Class counts after random undersampling
y_res.value_counts()

In [None]:
# Refit the shared logistic-regression model on the undersampled training data
# (this replaces its previous fit) and score it on the unchanged test split.
y_pred = logreg.fit(x_res, y_res).predict(x_test)
accuracy_score(y_test, y_pred)

### Random Oversampling

**Randomly duplicating samples from the minority class to balance the dataset.**

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training split until the classes
# are balanced. The sampler is named `ros` rather than `os` so it does not
# overwrite the standard-library `os` module imported in the first cell
# (re-running that cell would otherwise fail on os.walk / os.path).
# The seed is fixed so the duplicated rows are the same on every run.
# (sampling_strategy is left at its default; 'minority' is an option to try.)
ros = RandomOverSampler(random_state=42)
x_ran, y_ran = ros.fit_resample(x_train, y_train)

In [None]:
# Class counts in the training split before random oversampling
y_train.value_counts()

In [None]:
# Class counts after random oversampling
y_ran.value_counts()

In [None]:
# Refit the shared logistic-regression model on the oversampled training data
# (this replaces its previous fit) and score it on the unchanged test split.
y_pred = logreg.fit(x_ran, y_ran).predict(x_test)
accuracy_score(y_test, y_pred)

### Undersampling using Tomek Links

**Tomek links are pairs of examples of opposite classes in close vicinity.
In this algorithm, we end up removing the majority element from the Tomek link, which provides a better decision boundary for a classifier.**

![image.png](attachment:65821e2d-bf4d-4437-aa32-ec7a33c32c90.png)

In [None]:
from imblearn.under_sampling import TomekLinks
# Resample the scaled training split with TomekLinks using its default settings;
# `tl` is kept so its parameters can be inspected in the next cell.
tl = TomekLinks()
x_res_tl, y_res_tl = tl.fit_resample(x_train, y_train)

In [None]:
# Show the parameter values the TomekLinks sampler was configured with.
tl.get_params()

In [None]:
# Class counts in the training split before Tomek Links undersampling
y_train.value_counts()

In [None]:
# Class counts after Tomek Links undersampling
y_res_tl.value_counts()

In [None]:
# Refit the shared logistic-regression model on the Tomek-Links-cleaned training
# data (this replaces its previous fit) and score it on the unchanged test split.
y_pred = logreg.fit(x_res_tl, y_res_tl).predict(x_test)
accuracy_score(y_test, y_pred)

### Generate Synthetic Samples

**SMOTE (Synthetic Minority Over-sampling Technique) is a popular algorithm that creates synthetic observations of the minority class.**
**It generates synthetic samples for the minority class based on the characteristics of existing samples.**

![image.png](attachment:f77d76cc-777a-4467-9dfb-1ae1ba772e6f.png)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the scaled training split with SMOTE; random_state is fixed so the
# generated samples are the same on every run.
sm = SMOTE(random_state = 2)
x_res_sm, y_res_sm = sm.fit_resample(x_train, y_train)

In [None]:
# Class counts in the training split before SMOTE
y_train.value_counts()

In [None]:
# Class counts after SMOTE
y_res_sm.value_counts()

In [None]:
# Refit the shared logistic-regression model on the SMOTE-resampled training
# data (this replaces its previous fit) and score it on the unchanged test split.
y_pred = logreg.fit(x_res_sm, y_res_sm).predict(x_test)
accuracy_score(y_test, y_pred)

### SMOTE-ENN

**Over-sampling using SMOTE and cleaning using ENN.
Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.**

In [None]:
from imblearn.combine import SMOTEENN

# Combined over- and under-sampling of the training split (SMOTE followed by
# Edited Nearest Neighbours cleaning). The seed is fixed so the synthetic
# samples - and the accuracy reported below - are the same on every run.
sme = SMOTEENN(random_state=42)
x_sme, y_sme = sme.fit_resample(x_train, y_train)

In [None]:
# Class counts in the training split before SMOTE-ENN
y_train.value_counts()

In [None]:
# Class counts after SMOTE-ENN resampling. This must read y_sme (the SMOTE-ENN
# output); y_res_sm holds the plain-SMOTE result from the previous section.
y_sme.value_counts()

In [None]:
# Refit the shared logistic-regression model on the SMOTE-ENN-resampled training
# data (this replaces its previous fit) and score it on the unchanged test split.
y_pred = logreg.fit(x_sme, y_sme).predict(x_test)
accuracy_score(y_test, y_pred)