In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [6]:
df=pd.read_csv('../data/data2020.csv')
df.head()

columns_to_keep = [
    'HeartDisease','BMI', 'Smoking', 'Stroke', 'PhysicalHealth',
    'DiffWalking', 'Sex', 'AgeCategory', 'Diabetic',
    'Asthma', 'KidneyDisease'
]

df = df[columns_to_keep]


In [7]:
df1 = df.copy()
le=LabelEncoder()
categorical = df.select_dtypes(include = 'O')
categorical.columns
for feature in categorical:
    df1[feature]=le.fit_transform(df1[feature])
    print(feature)

HeartDisease
Smoking
Stroke
DiffWalking
Sex
AgeCategory
Diabetic
Asthma
KidneyDisease


In [8]:
def split_data(X, Y):
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,stratify=Y)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,stratify=df1['HeartDisease'])
    return X_train, X_test, y_train, y_test
def over_sample_date(X_train, y_train, data_rate = 1):
    minority_class = y_train.value_counts().idxmin()    # 1
    majority_class = y_train.value_counts().idxmax()    # 0

    minority_data = X_train[y_train == minority_class]
    majority_data = X_train[y_train == majority_class]
    minority_labels = y_train[y_train == minority_class]
    majority_labels = y_train[y_train == majority_class]

    oversampled_minority_data = minority_data.sample(int(len(majority_data)*data_rate), replace=True)
    oversampled_minority_labels = minority_labels.sample(int(len(majority_data)*data_rate), replace=True)
    X_train_oversampled = pd.concat([majority_data, oversampled_minority_data])
    y_train_oversampled = pd.concat([majority_labels, oversampled_minority_labels])
    return X_train_oversampled, y_train_oversampled

## Change data_rate and other things to improve the data

In [9]:
# get the training dataset and test dataset
X=df1.drop('HeartDisease',axis=1)
Y=df1['HeartDisease']
X_train, X_test, y_train, y_test = split_data(X,Y)
X_train_oversampled, Y_train_oversampled = over_sample_date(X_train, y_train, data_rate = 0.4)

In [10]:
# transform the training data
scaler = StandardScaler()
X_train_oversampled_T = scaler.fit_transform(X_train_oversampled)
X_test_T = scaler.transform(X_test)

In [11]:
log_reg = LogisticRegression()
log_reg.fit(X_train_oversampled_T, Y_train_oversampled)

y_pred = log_reg.predict(X_test_T)
#acc=log_reg.score(y_test,y_pred)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93     58484
           1       0.31      0.46      0.37      5475

    accuracy                           0.87     63959
   macro avg       0.63      0.68      0.65     63959
weighted avg       0.89      0.87      0.88     63959



array([[52970,  5514],
       [ 2956,  2519]])

In [12]:
log_reg.coef_,log_reg.intercept_

(array([[0.09889642, 0.22637016, 0.30912101, 0.2488255 , 0.18334112,
         0.37937364, 1.03940385, 0.25440507, 0.13933909, 0.17579232]]),
 array([-1.30315598]))

In [15]:
from sklearn.preprocessing import StandardScaler
import pickle
def normalize_row(row, scaler):
    # Reshape row to match expected input of scaler
    row_reshaped = row.values.reshape(1, -1)
    # Apply the same transformation as training data
    normalized_row = scaler.transform(row_reshaped)
    return normalized_row  # Flatten to match the shape of the original row

# Fit scaler
scaler = StandardScaler().fit(X_train_oversampled.values)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
df_new = X_train_oversampled.iloc[0]

df_new_normalized = normalize_row(df_new, scaler)

print(df_new_normalized)

# If true, this proof the correctness of normalization
print(X_train_oversampled_T[0] == df_new_normalized)


[[-0.30036189  1.10526784 -0.26307841 -0.48092632 -0.48201241  0.99725274
   0.81917094 -0.49229319 -0.41130464 -0.24539354]]
[[ True  True  True  True  True  True  True  True  True  True]]


In [16]:
print(list(X_train_oversampled.iloc[0]))

[26.63, 1.0, 0.0, 0.0, 0.0, 1.0, 10.0, 0.0, 0.0, 0.0]


In [19]:
# Use user input to get 
import numpy as np

user_input = [29.76, 0.0, 0.0, 0.0, 0.0, 1.0, 7.0, 0.0, 0.0, 0.0]
with open('scaler.pkl', 'rb') as f:
    scaler2 = pickle.load(f)
user_input_T = normalize_row(pd.DataFrame([user_input]),scaler2)

coef = np.array([0.09889642, 0.22637016, 0.30912101, 0.2488255 , 0.18334112,
         0.37937364, 1.03940385, 0.25440507, 0.13933909, 0.17579232]).flatten()
intercept = -1.29850855

# Compute the linear combination of inputs and weights
z = np.dot(user_input_T, coef) + intercept

# Apply the logistic function
p = 1 / (1 + np.exp(-z))

print("The predicted probability is:", p)


The predicted probability is: [0.16067047]


In [158]:
col = (list(df.columns[1:]))
coef = (list(log_reg.coef_[0]))
print(len(col), len(coef))
num = 0;
for i in range(len(col)):
    if(coef[i] > 0 or coef[i] < 0):
        print(col[i], coef[i])
        num+=1
print(num)

10 10
BMI 0.10047822315025126
Smoking 0.21347318488944275
Stroke 0.3164181650611676
PhysicalHealth 0.24013116255913974
DiffWalking 0.19651085250814868
Sex 0.3685282125874082
AgeCategory 1.0367771512672388
Diabetic 0.247550756220331
Asthma 0.1321086802246646
KidneyDisease 0.17420223782225586
10
