MODEL 2

Cleaning the data before modelling and understanding contribution of each column to the model

DATA PRE-PROCESSING

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

file_path = 'downloads/heart/heart_2020_cleaned.csv'
df = pd.read_csv(file_path)

In [2]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [3]:
df.count().T

HeartDisease        319795
BMI                 319795
Smoking             319795
AlcoholDrinking     319795
Stroke              319795
PhysicalHealth      319795
MentalHealth        319795
DiffWalking         319795
Sex                 319795
AgeCategory         319795
Race                319795
Diabetic            319795
PhysicalActivity    319795
GenHealth           319795
SleepTime           319795
Asthma              319795
KidneyDisease       319795
SkinCancer          319795
dtype: int64

In [4]:
row_duplicate_counts = df[df.duplicated].shape[0]
print(row_duplicate_counts)


18078


In [5]:
df_clean = df.drop_duplicates()

MODEL PREPARATION

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301717 entries, 0 to 319794
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   BMI               301717 non-null  float64
 1   Smoking           301717 non-null  object 
 2   AlcoholDrinking   301717 non-null  object 
 3   Stroke            301717 non-null  object 
 4   PhysicalHealth    301717 non-null  int64  
 5   MentalHealth      301717 non-null  int64  
 6   DiffWalking       301717 non-null  object 
 7   Sex               301717 non-null  object 
 8   AgeCategory       301717 non-null  object 
 9   Race              301717 non-null  object 
 10  Diabetic          301717 non-null  object 
 11  PhysicalActivity  301717 non-null  object 
 12  GenHealth         301717 non-null  object 
 13  SleepTime         301717 non-null  int64  
 14  Asthma            301717 non-null  object 
 15  KidneyDisease     301717 non-null  object 
 16  SkinCancer        30

In [6]:
num_rows = len(df_clean)
print(num_rows)

301717


In [7]:
heart_disease_proportions = df_clean['HeartDisease'].value_counts()
print(heart_disease_proportions)

No     274456
Yes     27261
Name: HeartDisease, dtype: int64


In [8]:
#Get the counts of each 'BMI' value
bmi_proportions = df_clean['BMI'].value_counts()

#Identify values with counts less than 100
values_to_drop = bmi_proportions[bmi_proportions < 100].index

#Drop rows with those values
df_clean = df_clean[~df_clean['BMI'].isin(values_to_drop)]


In [9]:
num_rows = len(df_clean)
print(num_rows)

252350


In [10]:
smoking_proportions = df_clean['Smoking'].value_counts()
print(smoking_proportions)

No     146007
Yes    106343
Name: Smoking, dtype: int64


In [11]:
Alcohol_drinking_proportions = df_clean['AlcoholDrinking'].value_counts()
print(Alcohol_drinking_proportions)

No     233598
Yes     18752
Name: AlcoholDrinking, dtype: int64


In [12]:
# Drop the 'AlcoholDrinking' column
df_clean = df_clean.drop(columns=['AlcoholDrinking'])

In [13]:
Stroke_proportions = df_clean['Stroke'].value_counts()
print(Stroke_proportions)

No     242654
Yes      9696
Name: Stroke, dtype: int64


In [14]:
df_clean = df_clean.drop(columns=['Stroke'])

In [15]:
physical_health_proportions = df_clean['PhysicalHealth'].value_counts()
values_to_drop = physical_health_proportions[physical_health_proportions < 100].index
df_clean = df_clean[~df_clean['PhysicalHealth'].isin(values_to_drop)]


In [16]:
PhysicalHealth_proportions = df_clean['PhysicalHealth'].value_counts()
print(PhysicalHealth_proportions)

0     177341
30     14896
2      12506
1       8919
3       7114
5       6258
10      4314
15      3903
7       3794
4       3625
20      2441
14      2276
6       1015
25       895
8        725
21       472
12       463
28       339
29       151
9        139
18       134
16       100
Name: PhysicalHealth, dtype: int64


In [17]:
mental_health_proportions = df_clean['MentalHealth'].value_counts()
values_to_drop = mental_health_proportions[mental_health_proportions < 100].index
df_clean = df_clean[~df_clean['MentalHealth'].isin(values_to_drop)]

In [18]:
MentalHealth_proportions = df_clean['MentalHealth'].value_counts()
print(MentalHealth_proportions)

0     157627
2      13761
30     13650
5      11911
3       8750
10      8715
15      8118
1       7835
7       4626
4       4500
20      4444
14      1630
25      1556
6       1238
8        873
12       596
28       415
21       286
29       264
18       180
9        166
16       115
Name: MentalHealth, dtype: int64


In [19]:
num_rows = len(df_clean)
print(num_rows)

251256


In [20]:
DiffWalking_proportions = df_clean['DiffWalking'].value_counts()
print(DiffWalking_proportions)

No     218328
Yes     32928
Name: DiffWalking, dtype: int64


In [21]:
df_clean = df_clean.drop(columns=['DiffWalking'])

In [22]:
Sex_proportions = df_clean['Sex'].value_counts()
print(Sex_proportions)

Female    131259
Male      119997
Name: Sex, dtype: int64


In [23]:
AgeCategory_proportions = df_clean['AgeCategory'].value_counts()
print(AgeCategory_proportions)

65-69          26043
60-64          25711
70-74          24303
55-59          22850
50-54          19583
80 or older    19446
75-79          17213
45-49          17085
18-24          16885
40-44          16669
35-39          16445
30-34          15112
25-29          13911
Name: AgeCategory, dtype: int64


In [24]:
Race_proportions = df_clean['Race'].value_counts()
print(Race_proportions)

White                             191968
Hispanic                           21745
Black                              17762
Other                               8904
Asian                               6652
American Indian/Alaskan Native      4225
Name: Race, dtype: int64


In [25]:
# Define the races to keep
races_to_keep = ['White', 'Black']

# Filter the DataFrame to keep only the specified races
df_clean = df_clean[df_clean['Race'].isin(races_to_keep)]

In [26]:
Race_proportions = df_clean['Race'].value_counts()
print(Race_proportions)

White    191968
Black     17762
Name: Race, dtype: int64


In [27]:
Diabetic_proportions = df_clean['Diabetic'].value_counts()
print(Diabetic_proportions)

No                         178352
Yes                         25722
No, borderline diabetes      4028
Yes (during pregnancy)       1628
Name: Diabetic, dtype: int64


In [28]:
# Replace 'No, borderline diabetes' with 'No'
df_clean['Diabetic'].replace('No, borderline diabetes', 'No', inplace=True)

# Drop rows where 'Diabetic' is 'Yes (during pregnancy)'
df_clean = df_clean[df_clean['Diabetic'] != 'Yes (during pregnancy)']

In [29]:
Diabetic_proportions = df_clean['Diabetic'].value_counts()
print(Diabetic_proportions)

No     182380
Yes     25722
Name: Diabetic, dtype: int64


In [30]:
df_clean = df_clean.drop(columns=['Diabetic'])

In [31]:
num_rows = len(df_clean)
print(num_rows)

208102


In [32]:
PhysicalActivity_proportions = df_clean['PhysicalActivity'].value_counts()
print(PhysicalActivity_proportions)

Yes    162628
No      45474
Name: PhysicalActivity, dtype: int64


In [33]:
GenHealth_proportions = df_clean['GenHealth'].value_counts()
print(GenHealth_proportions)

Very good    77259
Good         61294
Excellent    41685
Fair         21058
Poor          6806
Name: GenHealth, dtype: int64


In [34]:
SleepTime_proportions = df_clean['SleepTime'].value_counts()
print(SleepTime_proportions)

7     63618
8     63276
6     43405
5     12064
9     11373
10     5355
4      4693
12     1416
3      1193
2       497
1       345
11      270
14      146
16      144
15      114
18       69
13       52
20       38
24       14
17       11
22        6
23        2
19        1
Name: SleepTime, dtype: int64


In [35]:
# Define the threshold
threshold = 1000

# Get the sleep times with less than 100 people
sleep_times_to_drop = SleepTime_proportions[SleepTime_proportions < threshold].index

# Drop rows where 'SleepTime' is in sleep_times_to_drop
df_clean = df_clean[~df_clean['SleepTime'].isin(sleep_times_to_drop)]

SleepTime_proportions = df_clean['SleepTime'].value_counts()
print(SleepTime_proportions)

7     63618
8     63276
6     43405
5     12064
9     11373
10     5355
4      4693
12     1416
3      1193
Name: SleepTime, dtype: int64


In [36]:
Asthma_proportions = df_clean['Asthma'].value_counts()
print(Asthma_proportions)

No     178769
Yes     27624
Name: Asthma, dtype: int64


In [37]:
KidneyDisease_proportions = df_clean['KidneyDisease'].value_counts()
df_clean['KidneyDisease'] = 'No'
print(KidneyDisease_proportions)

No     198566
Yes      7827
Name: KidneyDisease, dtype: int64


In [38]:
df_clean['SkinCancer'] = 'No'
SkinCancer_proportions = df_clean['SkinCancer'].value_counts()
print(SkinCancer_proportions)

No    206393
Name: SkinCancer, dtype: int64


In [39]:
num_rows = len(df_clean)
print(num_rows)

206393


In [40]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206393 entries, 1 to 319603
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      206393 non-null  object 
 1   BMI               206393 non-null  float64
 2   Smoking           206393 non-null  object 
 3   PhysicalHealth    206393 non-null  int64  
 4   MentalHealth      206393 non-null  int64  
 5   Sex               206393 non-null  object 
 6   AgeCategory       206393 non-null  object 
 7   Race              206393 non-null  object 
 8   PhysicalActivity  206393 non-null  object 
 9   GenHealth         206393 non-null  object 
 10  SleepTime         206393 non-null  int64  
 11  Asthma            206393 non-null  object 
 12  KidneyDisease     206393 non-null  object 
 13  SkinCancer        206393 non-null  object 
dtypes: float64(1), int64(3), object(10)
memory usage: 23.6+ MB


In [41]:
df_clean.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
PhysicalHealth      0
MentalHealth        0
Sex                 0
AgeCategory         0
Race                0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [42]:
df_clean.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'PhysicalHealth', 'MentalHealth',
       'Sex', 'AgeCategory', 'Race', 'PhysicalActivity', 'GenHealth',
       'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [43]:
X = df_clean.drop(columns=['HeartDisease'],axis =1)
y = df_clean['HeartDisease']

In [44]:
X['AgeCategory'] = X['AgeCategory'].str.extract('(\d+)').astype(int)

In [45]:
binary_cols = ['Smoking','PhysicalActivity','Asthma','KidneyDisease','SkinCancer']

for col in binary_cols:
    X[col] =LabelEncoder().fit_transform(X[col])

In [46]:
categorical_columns = ['Sex', 'Race', 'GenHealth']
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
X = X_encoded

In [47]:
X.head()

Unnamed: 0,BMI,Smoking,PhysicalHealth,MentalHealth,AgeCategory,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer,Sex_Male,Race_White,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
1,20.34,0,0,0,80,1,7,0,0,0,0,1,0,0,0,1
2,26.58,1,20,30,65,1,8,1,0,0,1,1,1,0,0,0
3,24.21,0,0,0,75,0,6,0,0,0,0,1,0,1,0,0
4,23.71,0,28,0,40,1,8,0,0,0,0,1,0,0,0,1
5,28.87,1,6,0,75,0,12,0,0,0,0,0,1,0,0,0


In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)

In [49]:
from imblearn.under_sampling import RandomUnderSampler

# Initialize RandomUnderSampler
undersampler = RandomUnderSampler(random_state=42)

# Apply undersampling to the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

In [50]:
# Convert y_train_resampled to a pandas Series (if it's not already)
y_train_resampled_series = pd.Series(y_train_resampled)

# Count the proportions of 'Yes' and 'No'
proportions = y_train_resampled_series.value_counts()

# Print the proportions
print(proportions)

No     15154
Yes    15154
Name: HeartDisease, dtype: int64


In [51]:
model_performance = {}

In [52]:
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train the model on the resampled data
decision_tree_model.fit(X_train_resampled, y_train_resampled)

# Predict using the test set
y_pred = decision_tree_model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Store the performance metrics in a dictionary
model_performance['Decision Tree'] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


In [53]:
# Method 1 of visualizing metrices
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

          No       0.95      0.66      0.77     37476
         Yes       0.16      0.63      0.25      3803

    accuracy                           0.65     41279
   macro avg       0.55      0.64      0.51     41279
weighted avg       0.87      0.65      0.73     41279



In [54]:
# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model on the resampled data
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Store performance metrics in the model_performance dictionary
model_performance['Random Forest'] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


In [55]:
# Initialize Random forest model
model = RandomForestClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Store performance metrics in the model_performance dictionary
model_performance['Logistic Regression'] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


In [56]:
# Initialize Gradient Boosting model
model = GradientBoostingClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Store performance metrics in the model_performance dictionary
model_performance['Gradient Boosting'] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


In [57]:
# Initialize neural network model!
model = MLPClassifier(max_iter=1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Store performance metrics in the model_performance dictionary
model_performance['Neural Network'] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


In [58]:
for model_name, metrics in model_performance.items():
    print(f'Model: {model_name}')
    print(f'Accuracy: {metrics["Accuracy"]}')
    print(f'Precision: {metrics["Precision"]}')
    print(f'Recall: {metrics["Recall"]}')
    print(f'F1 Score: {metrics["F1 Score"]}')
    print('-' * 30)

Model: Decision Tree
Accuracy: 0.6531165968167834
Precision: 0.1565643370346179
Recall: 0.6302918748356561
F1 Score: 0.25082404646052425
------------------------------
Model: Random Forest
Accuracy: 0.720802345018048
Precision: 0.21895472412287087
Recall: 0.790954509597686
F1 Score: 0.3429679037683142
------------------------------
Model: Logistic Regression
Accuracy: 0.886867414423799
Precision: 0.24811156304474144
Recall: 0.11227977912174598
F1 Score: 0.15459811730629977
------------------------------
Model: Gradient Boosting
Accuracy: 0.9085733666028731
Precision: 0.5700483091787439
Recall: 0.031028135682356035
F1 Score: 0.05885286783042394
------------------------------
Model: Neural Network
Accuracy: 0.907095617626396
Precision: 0.4800498753117207
Recall: 0.10123586642124638
F1 Score: 0.16720955483170466
------------------------------


CONcLUSION:

Compared to model 1 most models are promising except for gradient boosting.

Most columns that were termed insignificant/not much weight were either dropped or shifted to the weighty side.

Climax not achieved but can be achieved.