In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix

In [None]:
# Load the cleaned 2020 heart-disease indicators CSV directly from GitHub
# and preview the first three rows.
data_url = (
    "https://raw.githubusercontent.com/benvictoria17/DataAnalytics/master/dataset/"
    "Personal%20Key%20Indicators%20of%20Heart%20Disease/heart_2020_cleaned.csv"
)
dataset = pd.read_csv(data_url)
dataset.head(n=3)

In [None]:
# Summary statistics for every column; include="all" covers non-numeric columns too.
dataset.describe(include="all")

In [None]:
# Data type of each column in the loaded frame.
dataset.dtypes

In [None]:
# Print pandas' concise summary of the loaded frame.
dataset.info()

In [None]:
# Count of distinct values per column.
dataset.nunique()

In [None]:
# Build a class-balanced working sample: 10,000 rows per HeartDisease label,
# each drawn with a fixed seed so the subset is reproducible.
is_positive = dataset['HeartDisease'] == 'Yes'
is_negative = dataset['HeartDisease'] == 'No'
with_hd = dataset.loc[is_positive].sample(n=10000, random_state=42)
without_hd = dataset.loc[is_negative].sample(n=10000, random_state=42)
sample = pd.concat([with_hd, without_hd])
sample.shape

In [None]:
# Preview the first five rows of the balanced sample (head() defaults to n=5).
sample.head()

In [None]:
# Strip plot of BMI for each heart-disease class in the balanced sample.
sns.set_palette('rocket_r')
fig, ax = plt.subplots(figsize=(15, 4))
sns.stripplot(data=sample, x='HeartDisease', y='BMI', ax=ax)
ax.set_title('BMI vs Heart Disease')
ax.set_xlabel('HeartDisease')
ax.set_ylabel('BMI')
plt.show()

In [None]:
# Box plot of BMI per heart-disease class. catplot is figure-level, so it
# creates its own figure; labels are applied to the grid's single axes.
bmi_grid = sns.catplot(data=sample, x='HeartDisease', y='BMI', kind='box')
bmi_grid.ax.set_title('BMI vs Heart Disease')
bmi_grid.ax.set_ylabel('BMI')
bmi_grid.ax.set_xlabel('Heart Disease')
plt.show()

In [None]:
# Case counts by smoking status, split by heart-disease label.
sns.set_palette("Set2")
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='Smoking', hue='HeartDisease', ax=ax)
ax.set(title='Smoking vs Heart Disease', xlabel='Smoking', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by alcohol-drinking status, split by heart-disease label.
sns.set_palette('Accent')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='AlcoholDrinking', hue='HeartDisease', ax=ax)
ax.set(title='Alcohol vs Heart Disease', xlabel='Alcohol Drinking', ylabel='Number of Cases')
plt.show()

In [None]:
# Heart-disease counts among 10,000 respondents who report alcohol drinking,
# drawn from the full (unbalanced) dataset rather than the balanced sample.
# random_state is fixed (same seed as the balanced sample) so the figure is
# identical on every Restart & Run All; previously the draw was unseeded.
# No palette is set here, so the plot uses whichever palette was set last.
drink_alcohol = dataset[dataset['AlcoholDrinking'] == 'Yes'].sample(n=10000, random_state=42)
plt.figure(figsize=(15,7))
sns.countplot(data=drink_alcohol, x='AlcoholDrinking', hue='HeartDisease')
plt.title('AlcoholDrinking vs Heart Disease')
plt.xlabel('Alcohol Drinking')
plt.ylabel('Number of Cases')
plt.show()

In [None]:
# Case counts for each PhysicalHealth value, split by heart-disease label.
sns.set_palette('viridis')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='PhysicalHealth', hue='HeartDisease', ax=ax)
ax.set(title='PhysicalHealth vs Heart Disease', xlabel='PhysicalHealth', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts for each MentalHealth value, split by heart-disease label.
sns.set_palette('crest')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='MentalHealth', hue='HeartDisease', ax=ax)
ax.set(title='MentalHealth vs Heart Disease', xlabel='MentalHealth', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by DiffWalking value, split by heart-disease label.
sns.set_palette('rocket')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='DiffWalking', hue='HeartDisease', ax=ax)
ax.set(title='Diff Walking vs Heart Disease', xlabel='Diff Walking', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by sex, split by heart-disease label.
sns.set_palette('Set1')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='Sex', hue='HeartDisease', ax=ax)
ax.set(title='Sex vs Heart Disease', xlabel='Sex', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts per age bracket, split by heart-disease label. The brackets are
# passed explicitly so the bars appear youngest-to-oldest.
sns.set_palette('Pastel2')
age_order = [
    '18-24', '25-29', '30-34', '35-39', '40-44',
    '45-49', '50-54', '55-59', '60-64', '65-69',
    '70-74', '75-79', '80 or older',
]
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='AgeCategory', hue='HeartDisease', order=age_order, ax=ax)
ax.set(title='Age Category vs Heart Disease', xlabel='Age Category', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by race, split by heart-disease label.
sns.set_palette('Pastel1')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='Race', hue='HeartDisease', ax=ax)
ax.set(title='Race vs Heart Disease', xlabel='Race', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by diabetic status, split by heart-disease label.
sns.set_palette('PRGn_r')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='Diabetic', hue='HeartDisease', ax=ax)
ax.set(title='Diabetic vs Heart Disease', xlabel='Diabetic', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by physical-activity status, split by heart-disease label.
sns.set_palette('Set3')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='PhysicalActivity', hue='HeartDisease', ax=ax)
ax.set(title='Physical Activity vs Heart Disease', xlabel='Physical Activity', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by self-reported general health, split by heart-disease label.
# GenHealth is ordinal, so the bars are given an explicit worst-to-best order
# (as the AgeCategory plot does) instead of the order of first appearance.
# NOTE(review): the label spellings below are assumed to match the CSV exactly
# (including the lowercase "g" in 'Very good') - confirm with
# sample['GenHealth'].unique(); a mismatched label yields an empty bar.
sns.set_palette('Set3_r')
plt.figure(figsize=(15,7))
sns.countplot(data=sample, x='GenHealth', hue='HeartDisease',
              order=['Poor', 'Fair', 'Good', 'Very good', 'Excellent'])
plt.title('Gen Health vs Heart Disease')
plt.xlabel('Gen Health')
plt.ylabel('Number of Cases')
plt.show()

In [None]:
# Case counts for each SleepTime value, split by heart-disease label.
sns.set_palette('Set3_r')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='SleepTime', hue='HeartDisease', ax=ax)
ax.set(title='Sleep Time vs Heart Disease', xlabel='Sleep Time', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by asthma status, split by heart-disease label.
sns.set_palette('Spectral')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='Asthma', hue='HeartDisease', ax=ax)
ax.set(title='Asthma vs Heart Disease', xlabel='Asthma', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by kidney-disease status, split by heart-disease label.
sns.set_palette('Spectral_r')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='KidneyDisease', hue='HeartDisease', ax=ax)
ax.set(title='Kidney Disease vs Heart Disease', xlabel='Kidney Disease', ylabel='Number of Cases')
plt.show()

In [None]:
# Case counts by skin-cancer status, split by heart-disease label.
sns.set_palette('RdPu')
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=sample, x='SkinCancer', hue='HeartDisease', ax=ax)
ax.set(title='Skin Cancer vs Heart Disease', xlabel='Skin Cancer', ylabel='Number of Cases')
plt.show()

In [None]:
# Split the balanced sample into features (X) and target (y).
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of the CSV changes.
# NOTE(review): equals the former positional split only while 'HeartDisease'
# is the first column of the frame - confirm against dataset.columns.
X = sample.drop(columns=['HeartDisease'])
y = sample['HeartDisease']
X.head(3)

In [None]:
# Peek at the first three target values.
y.head(3)

In [None]:
# Feature columns that will be one-hot encoded (13 in total).
categorical_cols = [
    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory',
    'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease',
    'SkinCancer',
]
X_cat = X[categorical_cols]
X_cat.head(n=2)

In [None]:
# Feature columns that will be standardized.
numeric_cols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
X_num = X[numeric_cols]
X_num.head(n=2)

In [None]:
# One-hot encode all 13 columns of X_cat (positions 0-12). Since every column
# is listed, remainder='passthrough' has nothing left to pass through.
onehot_positions = list(range(13))
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), onehot_positions)],
    remainder='passthrough',
)
X_cat_encoded = ct.fit_transform(X_cat)
X_cat_encoded.shape

In [None]:
# Check which container type the column transformer returned.
type(X_cat_encoded)

In [None]:
# Convert the one-hot matrix to a dense array.
# Guarded so the cell is idempotent: on a second run X_cat_encoded is already
# an ndarray (which has no .toarray()), and the unguarded call would raise
# AttributeError. The same guard covers a transformer that returns dense output.
if hasattr(X_cat_encoded, 'toarray'):
    X_cat_encoded = X_cat_encoded.toarray()
X_cat_encoded.shape

In [None]:
# Inspect the encoded vector of the second row (index 1).
X_cat_encoded[1]

In [None]:
# Wrap the encoded matrix in a DataFrame; columns get default integer labels.
X_cat_df = pd.DataFrame(data=X_cat_encoded)
X_cat_df.head(n=3)

In [None]:
# Encode the target labels as integers. LabelEncoder sorts the classes, so
# 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y.shape

In [None]:
# Show the integer-encoded target array.
y

In [None]:
# Standardize the numeric features.
# NOTE(review): the scaler is fit on every row of the sample before
# cross-validation, so each validation fold's statistics leak into training.
# Wrapping the scaler in a Pipeline evaluated inside cross_val_score would avoid this.
sc = StandardScaler()
sc.fit(X_num)
X_num_scaled = sc.transform(X_num)
X_num_scaled.shape

In [None]:
# Check which container type the scaler returned.
type(X_num_scaled)

In [None]:
# Wrap the scaled values in a DataFrame labelled with the original feature
# names. The previous hard-coded labels '46'..'49' were magic strings that
# silently depended on the one-hot width; taking them from X_num keeps the
# frame self-describing and adapts if the numeric feature list changes.
X_num_scaled_df = pd.DataFrame(X_num_scaled, columns=list(X_num.columns))
X_num_scaled_df.head(3)

In [None]:
# Combine the encoded categorical and scaled numeric features column-wise.
# Indexes are reset on copies inside the expression rather than with
# inplace=True, so frames displayed by earlier cells are not mutated; both
# inputs are then aligned row-by-row on a fresh 0..n-1 index.
X = pd.concat(
    [X_cat_df.reset_index(drop=True), X_num_scaled_df.reset_index(drop=True)],
    axis=1,
)
X.head(3)

In [None]:
# Dimensions of the combined feature table.
X.shape

In [None]:
# Convert the feature table to a plain NumPy array for scikit-learn.
# np.asarray is used instead of `.values` so the cell is idempotent: on a
# second run X is already an ndarray, which has no `.values` attribute.
X = np.asarray(X)
X

In [None]:
# Target array that is passed to the cross-validation runs below.
y

In [None]:
# Shape of the target array.
y.shape

In [None]:
# Logistic regression baseline: 10-fold cross-validation with the estimator's
# default scorer, reporting the mean and spread of the fold scores.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
scores_log_reg = cross_val_score(estimator=log_reg, X=X, y=y, cv=10)
print(f'Mean: {scores_log_reg.mean()}')
print(f'Standard Deviation: {scores_log_reg.std()}')

In [None]:
# k-nearest neighbours (k=23): 10-fold cross-validation, mean and spread of fold scores.
knn = KNeighborsClassifier(n_neighbors=23)
scores_knn = cross_val_score(estimator=knn, X=X, y=y, cv=10)
print(f'Mean: {scores_knn.mean()}')
print(f'Standard Deviation: {scores_knn.std()}')

In [None]:
# Linear-kernel SVM: 10-fold cross-validation, mean and spread of fold scores.
svc = SVC(kernel='linear')
scores_svc = cross_val_score(estimator=svc, X=X, y=y, cv=10)
print(f'Mean: {scores_svc.mean()}')
print(f'Standard Deviation: {scores_svc.std()}')

In [None]:
# RBF-kernel SVM: 10-fold cross-validation, mean and spread of fold scores.
kernel_svc = SVC(kernel='rbf')
scores_kernel_svc = cross_val_score(estimator=kernel_svc, X=X, y=y, cv=10)
print(f'Mean: {scores_kernel_svc.mean()}')
print(f'Standard Deviation: {scores_kernel_svc.std()}')

In [None]:
# Gaussian naive Bayes: 10-fold cross-validation, mean and spread of fold scores.
naive_bayes = GaussianNB()
scores_naive_bayes = cross_val_score(estimator=naive_bayes, X=X, y=y, cv=10)
print(f'Mean: {scores_naive_bayes.mean()}')
print(f'Standard Deviation: {scores_naive_bayes.std()}')

In [None]:
# Decision tree: 10-fold cross-validation, mean and spread of fold scores.
# random_state is fixed (same seed as the sampling step) because tree
# construction is randomized; without it the reported scores change between runs.
dtc = DecisionTreeClassifier(random_state=42)
scores_dtc = cross_val_score(dtc, X, y, cv=10)
print(f'Mean: {np.mean(scores_dtc)}')
print(f'Standard Deviation: {np.std(scores_dtc)}')

In [None]:
# Random forest (70 trees): 10-fold cross-validation, mean and spread of fold scores.
# random_state is fixed (same seed as the sampling step) because bootstrapping
# and feature sub-sampling are randomized; without it the reported scores are
# not reproducible.
rfc = RandomForestClassifier(n_estimators=70, random_state=42)
scores_rfc = cross_val_score(rfc, X, y, cv=10)
print(f'Mean: {np.mean(scores_rfc)}')
print(f'Standard Deviation: {np.std(scores_rfc)}')