In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
import pandas as pd 
import numpy as np 

In [3]:
import os

# Every location is derived from the parent of the current working directory.
project_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_dir, "data")
raw_data_dir = os.path.join(project_dir, "data", "raw")
data_path = os.path.join(project_dir, "data", "raw", "CVD_cleaned.csv")

# Load the raw CSV; later cells work on a copy of this frame.
df = pd.read_csv(data_path)

In [4]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_cleaned = df.copy(deep=True)

In [5]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder 
from sklearn.linear_model import SGDClassifier 

In [6]:
# Numeric features: a single standardisation step.
num_steps = [("scale", StandardScaler())]
num_transformer = Pipeline(steps=num_steps)

In [35]:
# One-hot encode to a dense array, dropping the first level of each feature.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
cat_transformer = Pipeline(steps=[("OHE", one_hot)])

In [8]:
# Sanity check: print the age bands in sorted order to confirm that the
# sorted order is also the natural (youngest -> oldest) order.
age_cats = df_cleaned['Age_Category'].unique().tolist()
age_cats.sort()
print(age_cats)

['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']


In [9]:
# Age bands use OrdinalEncoder's default category handling (no explicit order given).
age_encoder = OrdinalEncoder()
age_ord_transformer = Pipeline(steps=[("ordinal_encode", age_encoder)])

In [10]:
# List the distinct General_Health labels (in order of first appearance).
df_cleaned["General_Health"].unique()

array(['Poor', 'Very Good', 'Good', 'Fair', 'Excellent'], dtype=object)

In [42]:
# Explicit worst -> best ordering for self-reported general health.
health_order = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
health_ord_transformer = Pipeline(
    steps=[("ordinal_encode", OrdinalEncoder(categories=[health_order]))]
)

In [34]:
# Inspect the Checkup column: its distinct labels, then how many there are.
checkup_values = df_cleaned['Checkup']
print(checkup_values.unique())
print(checkup_values.nunique())

['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']
5


In [None]:
# Checkup recency labels, listed from least recent ('Never') to most recent.
checkup_order = [
    'Never',
    '5 or more years ago',
    'Within the past 5 years',
    'Within the past 2 years',
    'Within the past year',
]

In [41]:
# Ordinal-encode Checkup using the explicit ordering in `checkup_order`.
checkup_encoder = OrdinalEncoder(categories=[checkup_order])
checkup_ord_transformer = Pipeline(steps=[("ordinal_encode", checkup_encoder)])

In [14]:
# Numeric features are the columns whose dtype is float64.
num_cols = [
    name for name, dtype in df_cleaned.dtypes.items() if dtype == 'float64'
]
print(num_cols)

['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption']


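## [3.8] Create a list called `cat_cols` containing the categorical columns (excluding the ordinal columns `Age_Category`, `General_Health`, `Checkup` and the target `Heart_Disease`)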

In [15]:
# Object columns that are handled separately (ordinal features and the target).
non_nominal = {'Age_Category', 'General_Health', 'Checkup', 'Heart_Disease'}

obj_cols = df_cleaned.select_dtypes(include='object').columns.tolist()
cat_cols = [name for name in obj_cols if name not in non_nominal]
print(cat_cols)

['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Smoking_History']


In [16]:
# Preview the first rows of the categorical columns only.
df_cleaned.loc[:, cat_cols].head()

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Smoking_History
0,No,No,No,No,No,Yes,Female,Yes
1,No,No,No,No,Yes,No,Female,No
2,Yes,No,No,No,Yes,No,Female,No
3,Yes,No,No,No,Yes,No,Male,No
4,No,No,No,No,No,No,Male,Yes


In [17]:
# Report every categorical column that is not binary (more than two levels).
level_counts = df_cleaned[cat_cols].nunique()
for col, n_levels in level_counts.items():
    if n_levels > 2:
        print(f"{col} has more than 2 unique values")

Diabetes has more than 2 unique values


In [18]:
# Frequency of each Diabetes label.
df_cleaned["Diabetes"].value_counts()

Diabetes
No                                            259141
Yes                                            40171
No, pre-diabetes or borderline diabetes         6896
Yes, but female told only during pregnancy      2646
Name: count, dtype: int64

`Diabetes` has four levels rather than two, so it is treated as an ordinal feature as well, using the following ordering:

{
    'No': 0, 
    'No, pre-diabetes or borderline diabetes': 1, 
    'Yes, but female told only during pregnancy': 2, 
    'Yes': 3 
}

In [40]:
# Ordering used to ordinal-encode Diabetes (index in the list = encoded value).
diabetes_order = [
    'No',
    'No, pre-diabetes or borderline diabetes',
    'Yes, but female told only during pregnancy',
    'Yes',
]

diabetes_encoder = OrdinalEncoder(categories=[diabetes_order])
diabetes_ord_transformer = Pipeline(steps=[("transform", diabetes_encoder)])

In [20]:
# Diabetes gets its own ordinal encoder, so remove it from the one-hot group.
binary_cols = [name for name in cat_cols if name != 'Diabetes']
print(binary_cols)

['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex', 'Smoking_History']


In [21]:
# Confirm the remaining columns are binary: prints nothing when all have <= 2 levels.
binary_level_counts = df_cleaned[binary_cols].nunique()
for col, n_levels in binary_level_counts.items():
    if n_levels > 2:
        print(f"{col} has more than 2 unique values")

In [36]:
# Show the columns that are passed to the one-hot `cat_transformer`.
print(binary_cols)

['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex', 'Smoking_History']


In [43]:
from sklearn.compose import ColumnTransformer

# (name, transformer, columns) triples: one entry per feature group.
column_steps = [
    ("num_cols", num_transformer, num_cols),
    ("binary_cols", cat_transformer, binary_cols),
    ("age_col", age_ord_transformer, ['Age_Category']),
    ("health_col", health_ord_transformer, ['General_Health']),
    ("checkup_col", checkup_ord_transformer, ['Checkup']),
    ("diabetes_col", diabetes_ord_transformer, ['Diabetes']),
]

preprocessor = ColumnTransformer(transformers=column_steps)

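<i> Note: a separate encoder per column is not strictly necessary. `OrdinalEncoder` accepts a `categories` list with one ordered list per input column, so a single encoder could handle all of the ordinal columns at once. </i>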

In [44]:
# Linear classifier trained with SGD: log loss, elastic-net penalty, fixed seed.
sgd_clf = SGDClassifier(loss='log_loss', penalty='elasticnet', random_state=42)

# Full pipeline: column-wise preprocessing followed by the classifier.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("sgd", sgd_clf),
]
sgd_pipe = Pipeline(steps=pipeline_steps)

In [26]:
# Preview the first five rows of the working frame.
df_cleaned.head(n=5)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [48]:
# Features: every column except the target.
X = df_cleaned.drop(columns='Heart_Disease')
# Binary target: 1 where Heart_Disease is 'Yes', otherwise 0.
y = df_cleaned['Heart_Disease'].eq('Yes').astype(int)

# Fit on the full frame (no hold-out split is made in this cell).
sgd_pipe.fit(X, y)

In [49]:
# Predict on X, the same feature frame the pipeline was fitted on.
preds = sgd_pipe.predict(X)

In [51]:
# Take the first row as a one-row DataFrame.
# Indexing with a list (`iloc[[0]]`) returns a DataFrame directly and keeps
# each column's dtype. Going through `X.iloc[0]` instead produces a Series
# holding mixed values, so every column of the transposed frame ends up as
# `object`, including the float features.
obs = X.iloc[[0]]
obs

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0


In [52]:
# Predict the class for the single-row frame `obs`.
sgd_pipe.predict(obs)

array([0])

In [53]:
from joblib import dump

# Persist the fitted pipeline under <project_dir>/models.
# Create the directory first so the write does not fail with
# FileNotFoundError when it does not exist yet.
models_dir = os.path.join(project_dir, "models")
os.makedirs(models_dir, exist_ok=True)

with open(os.path.join(models_dir, "sgd_pipe.joblib"), 'wb') as f:
    dump(sgd_pipe, f)