In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score
from sklearn.metrics import recall_score

In [2]:
# Load the cleaned heart-disease CSV (path is relative to the working
# directory) and preview the first five rows.
df = pd.read_csv('heart_2020_cleaned.csv')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# Report (rows, columns), then each column's dtype and non-null count.
print("Shape of dataset:", df.shape)
df.info()

Shape of dataset: (319795, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  ob

In [None]:
# Bar chart of how many rows fall into each HeartDisease class.
sns.countplot(data=df, x='HeartDisease')
plt.show()

In [None]:
print("Ratio of count of class 'No' to class 'Yes':", len(df[df['HeartDisease']=='No'])/len(df[df['HeartDisease']=='Yes']))

In [None]:
# Rows per Race category, as a plain dict (category -> count).
counts = dict(df['Race'].value_counts())
counts

In [None]:
sns.countplot(x = df["Race"])

In [None]:
# Fraction of rows whose Race is anything other than "White".
total = sum(counts.values())
nonwhite = sum(n for label, n in counts.items() if label != "White")

percent_nonwhite = float(nonwhite) / total
print (percent_nonwhite)

In [None]:
num_races = len(counts)

# First slot keeps the complement of the non-white share; the non-white share
# is then divided evenly over the remaining (num_races - 1) slots.
# NOTE(review): this assumes the first key of `counts` is "White" - confirm.
first_share = 1 - (percent_nonwhite)
other_shares = [percent_nonwhite / (num_races - 1) for _ in range(num_races - 1)]
target_dist_1 = np.array([first_share] + other_shares)
target_dist_1

In [None]:
# Uniform target: every race gets the same share.
target_dist_4 = np.ones(num_races) / num_races

# Two intermediate targets, one third and two thirds of the way from
# target_dist_1 to the uniform target_dist_4.
target_dist_2 = (2 * target_dist_1 + target_dist_4) / 3
target_dist_3 = (target_dist_1 + 2 * target_dist_4) / 3

In [None]:
counts.keys()

In [None]:
# One panel per target distribution on a 2x2 grid, in order 1..4
# (left to right, top to bottom).
fig, axes = plt.subplots(2, 2)
(ax1, ax2), (ax3, ax4) = axes

panels = (ax1, ax2, ax3, ax4)
distributions = (target_dist_1, target_dist_2, target_dist_3, target_dist_4)
for panel, distribution in zip(panels, distributions):
    panel.bar(counts.keys(), distribution)

plt.show()

# Remove Duplicates

In [None]:
# Report the current shape and how many rows are exact duplicates of an
# earlier row.
print("Initial shape:", df.shape)
print("Duplicates in dataset:", df.duplicated().sum())

In [None]:
# Remove duplicate rows. This mutates `df` in place, so every later cell
# sees the de-duplicated frame.
df.drop_duplicates(inplace=True)

# Confirm the new shape and that no duplicates remain.
print("New shape:", df.shape)
print("Duplicates in dataset:", df.duplicated().sum())

In [None]:
# Recompute rows per Race category now that duplicates have been dropped.
counts = dict(df['Race'].value_counts())
counts

In [None]:
# Fraction of rows whose Race is anything other than "White", using the
# current `counts`.
total = sum(counts.values())
nonwhite = sum(n for label, n in counts.items() if label != "White")

percent_nonwhite = float(nonwhite) / total
print (percent_nonwhite)

In [None]:
num_races = len(counts)

# Target 1: first slot keeps the complement of the non-white share, the rest
# is split evenly. Target 4: fully uniform.
# NOTE(review): this assumes the first key of `counts` is "White" - confirm.
other_shares = [percent_nonwhite / (num_races - 1) for _ in range(num_races - 1)]
target_dist_1 = np.array([1 - (percent_nonwhite)] + other_shares)
target_dist_4 = np.ones(num_races) / num_races

# Targets 2 and 3 sit one third and two thirds of the way from 1 to 4.
target_dist_2 = (2 * target_dist_1 + target_dist_4) / 3
target_dist_3 = (target_dist_1 + 2 * target_dist_4) / 3

# One panel per target distribution, in order 1..4.
fig, axes = plt.subplots(2, 2)
(ax1, ax2), (ax3, ax4) = axes

panels = (ax1, ax2, ax3, ax4)
distributions = (target_dist_1, target_dist_2, target_dist_3, target_dist_4)
for panel, distribution in zip(panels, distributions):
    panel.bar(counts.keys(), distribution)

plt.show()

In [None]:
df[df["Race"] == "Asian"]

In [None]:
df.columns

In [None]:
# Scratch list of column names; it is displayed but not assigned, so no later
# cell can reference it.
['Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 
       'Asthma', 'KidneyDisease', 'SkinCancer']

# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the string-valued `df` stays available.
data = df.copy()
le = LabelEncoder()

# Columns to convert from string labels to integer codes.
categorical_columns = [
    'HeartDisease', 'Smoking', 'AlcoholDrinking', 'AgeCategory', 'Stroke',
    'DiffWalking', 'Race', 'Sex', 'PhysicalActivity', 'Asthma',
    'KidneyDisease', 'SkinCancer', 'GenHealth', 'Diabetic',
]
col = data[categorical_columns]

# The single encoder is refit for every column, so after the loop `le` only
# remembers the mapping of the last column.
for column_name in col.columns:
    data[column_name] = le.fit_transform(data[column_name])
data.head()

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test-row statistics influence the
# scaling - confirm this is acceptable.
std_scaler = StandardScaler()

# The float64 columns of the original frame are treated as the continuous
# features and standardized in `data`.
continuous_vars = df.select_dtypes('float64').columns
cols = continuous_vars.tolist()
data[cols] = std_scaler.fit_transform(data[cols])

data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state is fixed so that
# Restart & Run All reproduces the same split (and therefore the same
# downstream counts, resampling sizes and metrics).
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
training_data.size, testing_data.size

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state is fixed so that
# Restart & Run All reproduces the same split (and therefore the same
# downstream counts, resampling sizes and metrics).
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
# Renumber the training rows 0..n-1, discarding the pre-split index.
training_data = training_data.reset_index(drop=True)
training_data

In [None]:
# Renumber the test rows 0..n-1, discarding the pre-split index.
testing_data = testing_data.reset_index(drop=True)
testing_data

In [None]:
# Rows per encoded Race value in the training split.
class_counts = training_data['Race'].value_counts()

# Partition the training rows on the encoded value 5, which the prints below
# label as "White".
is_code_5 = training_data['Race'] == 5
class_0 = training_data[is_code_5]
class_1 = training_data[~is_code_5]

print('Class 0 [Race = White]:', class_0.shape)
print('Class 1 [Race != White]:', class_1.shape)

In [None]:
training_data["Race"]

In [None]:
# Rows per encoded Race value in the training split (code -> count).
# Author's note from the original cell: total = 241103.
counts = dict(training_data['Race'].value_counts())
counts

In [None]:
race = {5: "White", 3: "Hispanic", 2: "Black", 4: "Other", 1: "Asian", 0: "American Indian/Alaskan Native"}

## Random oversampling

In [None]:
# Total number of training rows that have a Race value.
n_train_rows = sum(training_data['Race'].value_counts())

# Share of code 5 (White) before any resampling.
orig_white = counts[5] / n_train_rows
print("Orignal % white:", orig_white)

# Share of every race code before any resampling.
orig_propors = {code: counts[code] / n_train_rows for code in race.keys()}

orig_propors

### Stage 1

In [None]:
# Stage 1 target: code 5 keeps its original share and the remainder is split
# evenly across the five other race codes.
propor_1 = (1 - orig_white) / 5
race_proportions_1 = {5: orig_white}
for other_code in (3, 2, 4, 1, 0):
    race_proportions_1[other_code] = propor_1
race_proportions_1

In [None]:
# Split the training rows by encoded race (per the `race` lookup:
# 5 = White, 3 = Hispanic, 2 = Black, 4 = Other, 1 = Asian,
# 0 = American Indian/Alaskan Native).
class_5 = training_data[training_data['Race'] == 5]
class_3 = training_data[training_data['Race'] == 3]
class_2 = training_data[training_data['Race'] == 2]
class_4 = training_data[training_data['Race'] == 4]
class_1 = training_data[training_data['Race'] == 1]
class_0 = training_data[training_data['Race'] == 0]

# Resample codes 2, 4, 1 and 0 (with replacement) to the row count of code 3,
# so all five non-White groups end up the same size. random_state is fixed so
# the resampled training set is reproducible across runs.
class_2_over = class_2.sample(counts[3], replace=True, random_state=0)
class_4_over = class_4.sample(counts[3], replace=True, random_state=0)
class_1_over = class_1.sample(counts[3], replace=True, random_state=0)
class_0_over = class_0.sample(counts[3], replace=True, random_state=0)

In [None]:
# Code 3 stays at counts[3] rows, so the number of code-5 rows needed to hit
# the Stage 1 target is (target share of 5 / target share of 3) * counts[3].
class_5_over_count = race_proportions_1[5]/race_proportions_1[3] * counts[3]
print(class_5_over_count)
# Sample with replacement (the target may exceed the available rows);
# random_state is fixed so the draw is reproducible across runs.
class_5_over = class_5.sample(int(class_5_over_count), replace=True, random_state=0)

In [None]:
# Stack the per-race pieces into the Stage 1 training set and check the
# resulting rows per race code.
stage_1_parts = [class_5_over, class_3, class_2_over, class_4_over, class_1_over, class_0_over]
training_data_1 = pd.concat(stage_1_parts)
training_data_1['Race'].value_counts()

In [None]:
# Bar chart of rows per race code in the Stage 1 training set.
stage_1_counts = training_data_1['Race'].value_counts()
stage_1_counts.plot.bar(title='count (target)')
plt.show()

In [None]:
# Held-out test features / label (shared by every model).
y_test = testing_data['HeartDisease']
X_test = testing_data.drop(columns=['HeartDisease'])

# Original (imbalanced) training split.
y_train = training_data['HeartDisease']
X_train = training_data.drop(columns=['HeartDisease'])

# Stage 1 resampled training set.
y_train_1 = training_data_1['HeartDisease']
X_train_1 = training_data_1.drop(columns=['HeartDisease'])

# Parallel lists: position i of each list describes the same training set.
X_train_list = [X_train, X_train_1]
y_train_list = [y_train, y_train_1]
data_desc = ['Imbalanced', 'Random Over-sampling']

### Stage 2

In [None]:
# Stage 2 target: a uniform distribution over all race codes.
# There are six codes, so each share must be 1/6; the previous 1/5 made the
# shares sum to 1.2 and skewed the Stage 3/4 interpolations built from it.
race_codes = (5, 3, 2, 4, 1, 0)
propor_2 = 1 / len(race_codes)
race_proportions_2 = {code: propor_2 for code in race_codes}
race_proportions_2

In [None]:
# Resample every non-White group (with replacement) up to the row count of
# code 5, so all six groups end up the same size. random_state is fixed so
# the resampled training set is reproducible across runs.
class_3_over = class_3.sample(counts[5], replace=True, random_state=0)
class_2_over = class_2.sample(counts[5], replace=True, random_state=0)
class_4_over = class_4.sample(counts[5], replace=True, random_state=0)
class_1_over = class_1.sample(counts[5], replace=True, random_state=0)
class_0_over = class_0.sample(counts[5], replace=True, random_state=0)

In [None]:
# Stack the per-race pieces into the Stage 2 training set and check the
# resulting rows per race code.
stage_2_parts = [class_5, class_3_over, class_2_over, class_4_over, class_1_over, class_0_over]
training_data_2 = pd.concat(stage_2_parts)
training_data_2['Race'].value_counts()

In [None]:
# Print and chart rows per race code in the Stage 2 training set.
stage_2_counts = training_data_2['Race'].value_counts()
print("Total counts of classes:")
print(stage_2_counts)
stage_2_counts.plot.bar(title='count (target)')
plt.show()

In [None]:
# Stage 2 resampled training set.
y_train_2 = training_data_2['HeartDisease']
X_train_2 = training_data_2.drop(columns=['HeartDisease'])

# Parallel lists: position i of each list describes the same training set.
X_train_list = [X_train, X_train_1, X_train_2]
y_train_list = [y_train, y_train_1, y_train_2]

data_desc = [
    'Imbalanced',
    'Random Over-sampling Stage 1',
    'Random Over-sampling Stage 2',
]

### Stage 3, Stage 4

In [None]:
race_proportions_1

In [None]:
race_proportions_2

In [None]:
# Stage 3 and Stage 4 targets: per race code, one third and two thirds of the
# way from the Stage 1 share to the Stage 2 share.
race_proportions_3 = {}
race_proportions_4 = {}

# The loop previously read `race_proportions`, a name that is never defined
# (NameError on a fresh kernel); the Stage 1 dict is the intended endpoint.
for i in race_proportions_1.keys():
    race_proportions_3[i] = (race_proportions_1[i]*2 + race_proportions_2[i]) / 3
    race_proportions_4[i] = (race_proportions_1[i] + race_proportions_2[i]*2) / 3

In [None]:
race_proportions_3

In [None]:
race_proportions_4

In [None]:
# Resample codes 2, 4, 1 and 0 (with replacement) back to the row count of
# code 3; this overwrites the larger Stage 2 samples held in the same names.
# random_state is fixed so the draw is reproducible across runs.
class_2_over = class_2.sample(counts[3], replace=True, random_state=0)
class_4_over = class_4.sample(counts[3], replace=True, random_state=0)
class_1_over = class_1.sample(counts[3], replace=True, random_state=0)
class_0_over = class_0.sample(counts[3], replace=True, random_state=0)

In [None]:
# Code 3 stays at counts[3] rows, so the number of code-5 rows needed to hit
# the Stage 3 target is (target share of 5 / target share of 3) * counts[3].
class_5_over_count = race_proportions_3[5]/race_proportions_3[3] * counts[3]
print(class_5_over_count)
# Sample with replacement; random_state is fixed so the draw is reproducible
# across runs.
class_5_over = class_5.sample(int(class_5_over_count), replace=True, random_state=0)

In [None]:
# Stack the per-race pieces into the Stage 3 training set and check the
# resulting rows per race code.
stage_3_parts = [class_5_over, class_3, class_2_over, class_4_over, class_1_over, class_0_over]
training_data_3 = pd.concat(stage_3_parts)
training_data_3['Race'].value_counts()

In [None]:
# Print and chart rows per race code in the Stage 3 training set.
stage_3_counts = training_data_3['Race'].value_counts()
print("Total counts of classes:")
print(stage_3_counts)
stage_3_counts.plot.bar(title='count (target)')
plt.show()

In [None]:
# Stage 3 resampled training set.
y_train_3 = training_data_3['HeartDisease']
X_train_3 = training_data_3.drop(columns=['HeartDisease'])

# Parallel lists: position i of each list describes the same training set.
X_train_list = [X_train, X_train_1, X_train_2, X_train_3]
y_train_list = [y_train, y_train_1, y_train_2, y_train_3]

data_desc = [
    'Imbalanced',
    'Random Over-sampling Stage 1',
    'Random Over-sampling Stage 2',
    'Random Over-sampling Stage 3',
]

In [None]:
# Code 3 stays at counts[3] rows, so the number of code-5 rows needed to hit
# the Stage 4 target is (target share of 5 / target share of 3) * counts[3].
class_5_over_count = race_proportions_4[5]/race_proportions_4[3] * counts[3]
print(class_5_over_count)
# Sample with replacement; random_state is fixed so the draw is reproducible
# across runs.
class_5_over = class_5.sample(int(class_5_over_count), replace=True, random_state=0)

In [None]:
# Stack the per-race pieces into the Stage 4 training set and check the
# resulting rows per race code.
stage_4_parts = [class_5_over, class_3, class_2_over, class_4_over, class_1_over, class_0_over]
training_data_4 = pd.concat(stage_4_parts)
training_data_4['Race'].value_counts()

In [None]:
# Print and chart rows per race code in the Stage 4 training set.
# (This cell previously reported training_data_3 again, so the Stage 4
# distribution was never actually shown.)
print("Total counts of classes:")
print(training_data_4['Race'].value_counts())
training_data_4['Race'].value_counts().plot(kind='bar', title='count (target)')
plt.show()

In [None]:
# Stage 4 resampled training set.
y_train_4 = training_data_4['HeartDisease']
X_train_4 = training_data_4.drop(columns=['HeartDisease'])

# Parallel lists: position i of each list describes the same training set.
X_train_list = [X_train, X_train_1, X_train_2, X_train_3, X_train_4]
y_train_list = [y_train, y_train_1, y_train_2, y_train_3, y_train_4]

data_desc = [
    'Imbalanced',
    'Random Over-sampling Stage 1',
    'Random Over-sampling Stage 2',
    'Random Over-sampling Stage 3',
    'Random Over-sampling Stage 4',
]

### Model training, evaluation, and plotting

In [None]:
# For every training set: fit a logistic regression, predict on the shared
# test set, print the classification report, and chart recall per race code.
for i in range(len(X_train_list)):
    print("---------------------------------------------------------------------------")
    print(f"Model with training data {i} ({data_desc[i]}):\n".upper())
    print (str(X_train_list[i].shape[0]) + " training examples")

    # making model for logistic regression
    clf_LR = LogisticRegression(random_state=0).fit(X_train_list[i], y_train_list[i])
    pred = clf_LR.predict(X_test)
    print (pred)

    # Kept in its own name: assigning to `df` here would overwrite the dataset
    # loaded at the top of the notebook and break any re-run of earlier cells.
    results = pd.DataFrame({
        'race': X_test["Race"],
        'predictions': pred,
        'ground_truth': y_test
    })

    print("Classification report:\n")
    print(classification_report(y_test, pred))

    # Recall (true positive rate) within each race group. Selecting the two
    # value columns keeps the grouping column out of the applied frame.
    tp_rate_by_race = results.groupby('race')[['ground_truth', 'predictions']].apply(
        lambda group: recall_score(group['ground_truth'], group['predictions'])
    )

    # The plotted metric is recall, so the labels say recall, not accuracy.
    plt.bar(tp_rate_by_race.index, tp_rate_by_race)
    plt.xlabel('Race')
    plt.ylabel('Recall (true positive rate)')
    plt.title('Recall Within Each Race')
    plt.show()
    print("---------------------------------------------------------------------------")

In [None]:
# For every training set: fit a logistic regression, then report overall test
# accuracy, the classification report, and a confusion matrix.
for i, X_fit in enumerate(X_train_list):
    print("---------------------------------------------------------------------------")
    print(f"Model with training data {i} ({data_desc[i]}):\n".upper())

    # Same estimator settings for every training set; only the data differs.
    y_fit = y_train_list[i]
    clf_LR = LogisticRegression(random_state=0)
    clf_LR.fit(X_fit, y_fit)
    pred = clf_LR.predict(X_test)

    test_accuracy = accuracy_score(y_test, pred)
    print('Model accuracy score: {0:0.4f}'. format(test_accuracy))
    print("Classification report:\n")
    print(classification_report(y_test, pred))

    print("Confusion Matrix:")
    ConfusionMatrixDisplay.from_predictions(y_test, pred, cmap='YlOrRd')
    plt.show()
    print("---------------------------------------------------------------------------")