In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Clean/preprocess the data

In [11]:
df = pd.read_csv("Framingham Dataset.csv")

# first we drop columns with more than 50% missing values
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
columns_to_drop = missing_percentage[missing_percentage > 50].index.tolist()
cleaned_df = df.drop(columns=columns_to_drop)

# next we replace numerical columns with the median
numerical_columns = ['TOTCHOL', 'GLUCOSE', 'BMI']
for col in numerical_columns:
    median_value = cleaned_df[col].median()
    cleaned_df[col] = cleaned_df[col].fillna(median_value)

# we replace binary/categorical columns with the mode
categorical_columns = ['BPMEDS']
for col in categorical_columns:
    mode_value = cleaned_df[col].mode()[0]
    cleaned_df[col] = cleaned_df[col].fillna(mode_value)

# modify CIGPDAY (number of cigarettes smoked per day)
condition = (cleaned_df['CIGPDAY'].isnull()) & (cleaned_df['CURSMOKE'] == 0)
cleaned_df.loc[condition, 'CIGPDAY'] = 0

median_cigpday = cleaned_df['CIGPDAY'].median()
cleaned_df['CIGPDAY'] = cleaned_df['CIGPDAY'].fillna(median_cigpday)


# we drop the 'educ' column because it it mostly a socioeconomic feature with low predictive power
# won't affect our outcome too much
cleaned_df.drop(columns=['educ'], inplace=True)

# drop rows with missing HEARTRTE (rather just drop these rows than computing the median, because its only 6 rows)
cleaned_df.dropna(subset=['HEARTRTE'], inplace=True)

print(f"Final shape of dataset: {cleaned_df.shape}")
print(f"Total missing values: {cleaned_df.isnull().sum().sum()}")


Final shape of dataset: (11621, 36)
Total missing values: 0


In [12]:
print(cleaned_df.head(5))

   RANDID  SEX  TOTCHOL  AGE  SYSBP  DIABP  CURSMOKE  CIGPDAY    BMI  \
0    2448    1    195.0   39  106.0   70.0         0      0.0  26.97   
1    2448    1    209.0   52  121.0   66.0         0      0.0  25.48   
2    6238    2    250.0   46  121.0   81.0         0      0.0  28.73   
3    6238    2    260.0   52  105.0   69.5         0      0.0  29.43   
4    6238    2    237.0   58  108.0   66.0         0      0.0  28.50   

   DIABETES  ...  CVD  HYPERTEN  TIMEAP  TIMEMI  TIMEMIFC  TIMECHD  TIMESTRK  \
0         0  ...    1         0    8766    6438      6438     6438      8766   
1         0  ...    1         0    8766    6438      6438     6438      8766   
2         0  ...    0         0    8766    8766      8766     8766      8766   
3         0  ...    0         0    8766    8766      8766     8766      8766   
4         0  ...    0         0    8766    8766      8766     8766      8766   

   TIMECVD  TIMEDTH  TIMEHYP  
0     6438     8766     8766  
1     6438     8766     

In [13]:
# now we drop more columns that give outcome leakage; i.e. columns that give the model future predictions that 
# we wouldn't realistically have at the time of an actual prediction, such as whether a patient died during the study
# also drop additional irrelevant columns
target_col = 'ANYCHD'

cols_to_drop = [
    'RANDID',  # ID column
    'TIME', 'PERIOD',  # timing variables
    'DEATH', 'STROKE', 'CVD', 'HYPERTEN', 'MI_FCHD', 'HOSPMI', 'ANGINA',  # future outcomes
    'TIMEAP', 'TIMEMI', 'TIMEMIFC', 'TIMECHD', 'TIMESTRK', 'TIMECVD', 'TIMEDTH', 'TIMEHYP'  # event times
]

cleaned_df_final = cleaned_df.drop(columns=cols_to_drop)

In [14]:
X = cleaned_df_final.drop(columns=[target_col])
y = cleaned_df_final[target_col]

# split into train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test labels shape: {y_test.shape}")

Training set shape: (8134, 17)
Test set shape: (3487, 17)
Training labels shape: (8134,)
Test labels shape: (3487,)


## Build the MLP