In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Data Preparation
Farmingham risk function will be used to identify the risk likelihood.  
Farmingham risk needs the following parameters: age, sex, blood pressure, currently taking blood pressure meds, total cholestrol levels, hdl cholestrol levels, diabetic and smoker.  
These columns will be kept and the rest dropped.

In [2]:
df_merged = pd.read_csv('../data/df_merged.csv')
df = df_merged.copy()

In [3]:
# Rename the columns to be used
df = df.rename(columns=
               {
                   'RIDAGEYR' : 'age',
                   'RIAGENDR' : 'sex',
                   'LBXTC' : 'total_chol',
                   'LBDHDD' : 'hdl_chol',
                   'BPQ050A' : 'bp_treated',
                   'DIQ010' : 'diabetes',
                   'SMQ020' : 'ever_smoked',
                   'SMQ040' : 'current_smoke'
               })

In [4]:
# Calculate mean blood pressure (across 3 readings)
df['sbp'] = df[['BPXSY1', 'BPXSY2', 'BPXSY3']].mean(axis=1)

# Convert sex to string
df['sex'] = df['sex'].map({1: 'M', 2: 'F'})

# Convert bp_treated, diabetes to binary
df['bp_treated'] = df['bp_treated'].map({1: 1, 2: 0})
df['diabetes'] = df['diabetes'].map({1: 1, 2: 0})

# Define smoker: 1 if current smoker
df['smoker'] = df['current_smoke'].map({1: 1, 2: 0, 7: 0, 9: 0})

In [5]:
# Final columns to be used
columns_needed = [
    'SEQN', 'age', 'sex', 'total_chol', 'hdl_chol',
    'sbp', 'bp_treated', 'smoker', 'diabetes'
]

df = df[columns_needed]
df.head()

Unnamed: 0,SEQN,age,sex,total_chol,hdl_chol,sbp,bp_treated,smoker,diabetes
0,93708.0,66.0,F,209.0,88.0,141.0,1.0,,
1,93711.0,56.0,M,238.0,72.0,101.333333,,,0.0
2,93717.0,22.0,M,213.0,53.0,118.666667,,1.0,0.0
3,93718.0,45.0,M,152.0,63.0,131.333333,1.0,,0.0
4,93721.0,60.0,F,122.0,45.0,136.0,,,0.0


In [6]:
# Check for number of missing values
df.isnull().sum()

SEQN             0
age              0
sex              0
total_chol     167
hdl_chol       167
sbp            148
bp_treated    1889
smoker        2308
diabetes        88
dtype: int64

In [7]:
df.loc[df['diabetes'].isin([7,9]), 'diabetes'] = np.nan #Clean up some stuff I forgot earlier

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2783 entries, 0 to 2782
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEQN        2783 non-null   float64
 1   age         2783 non-null   float64
 2   sex         2783 non-null   object 
 3   total_chol  2616 non-null   float64
 4   hdl_chol    2616 non-null   float64
 5   sbp         2635 non-null   float64
 6   bp_treated  894 non-null    float64
 7   smoker      475 non-null    float64
 8   diabetes    2695 non-null   float64
dtypes: float64(8), object(1)
memory usage: 195.8+ KB


In [10]:
# Very few missing values in toal_chol,hdl_chol, sbp, diabetes columns
# bp_treated too critical to impute
# Just drop the missing values
print(f"Original rows: {df.shape[0]}")
df = df.dropna(subset=[
    'age', 'sex', 'total_chol', 'hdl_chol',
    'sbp', 'bp_treated', 'diabetes'
])
print(f"Rows after cleaning: {df.shape[0]}")

Original rows: 761
Rows after cleaning: 761


In [11]:
#Losts of missing values for smoker. Impute using the mode.
#Before imputation
print("Missing before:")
print(df.isnull().mean() * 100)
# Imputation using mode
df['smoker'] = df['smoker'].fillna(df['smoker'].mode()[0])
# After Imputation
print("Missing after:")
print(df.isnull().mean() * 100)

Missing before:
SEQN           0.000000
age            0.000000
sex            0.000000
total_chol     0.000000
hdl_chol       0.000000
sbp            0.000000
bp_treated     0.000000
smoker        83.837057
diabetes       0.000000
dtype: float64
Missing after:
SEQN          0.0
age           0.0
sex           0.0
total_chol    0.0
hdl_chol      0.0
sbp           0.0
bp_treated    0.0
smoker        0.0
diabetes      0.0
dtype: float64


In [13]:
df = df.reset_index(drop=True)

### Farmingham Risk

In [14]:
def framingham_risk_score(age, total_chol, hdl_chol, sbp, is_treated_bp, smoker, diabetes, sex):
    """
    Compute Framingham 10-year risk of CVD.
    
    Parameters:
    - age: years
    - total_chol: mg/dL
    - hdl_chol: mg/dL
    - sbp: systolic BP (mm Hg)
    - is_treated_bp: 1 if on BP meds, else 0
    - smoker: 1 if current smoker, else 0
    - diabetes: 1 if diabetic, else 0
    - sex: 'M' or 'F'

    Returns:
    - 10-year CVD risk (probability between 0 and 1)
    """
    
    if sex == 'M':
        # Coefficients for Men
        ln_age = np.log(age)
        ln_total_chol = np.log(total_chol)
        ln_hdl_chol = np.log(hdl_chol)
        ln_sbp = np.log(sbp)
        
        coeffs = {
            'age': 3.06117,
            'total_chol': 1.12370,
            'hdl_chol': -0.93263,
            'sbp_treated': 1.99881,
            'sbp_untreated': 1.93303,
            'smoker': 0.65451,
            'diabetes': 0.57367,
            'mean': 23.9802,
            'baseline_survival': 0.88936
        }
        
    else:
        # Coefficients for Women
        ln_age = np.log(age)
        ln_total_chol = np.log(total_chol)
        ln_hdl_chol = np.log(hdl_chol)
        ln_sbp = np.log(sbp)
        
        coeffs = {
            'age': 2.32888,
            'total_chol': 1.20904,
            'hdl_chol': -0.70833,
            'sbp_treated': 2.82263,
            'sbp_untreated': 2.76157,
            'smoker': 0.52873,
            'diabetes': 0.69154,
            'mean': 26.1931,
            'baseline_survival': 0.95012
        }
    
    sbp_coeff = coeffs['sbp_treated'] if is_treated_bp else coeffs['sbp_untreated']
    
    # Linear combination of risk factors
    risk_score = (
        coeffs['age'] * ln_age
        + coeffs['total_chol'] * ln_total_chol
        + coeffs['hdl_chol'] * ln_hdl_chol
        + sbp_coeff * ln_sbp
        + coeffs['smoker'] * smoker
        + coeffs['diabetes'] * diabetes
    )
    
    # Compute 10-year risk
    risk = 1 - coeffs['baseline_survival'] ** np.exp(risk_score - coeffs['mean'])
    return risk


In [15]:
df['framingham_risk'] = df.apply(
    lambda row: framingham_risk_score(
        row['age'],
        row['total_chol'],
        row['hdl_chol'],
        row['sbp'],
        row['bp_treated'],
        row['smoker'],
        row['diabetes'],
        row['sex']
    ),
    axis=1
)

df[['SEQN', 'framingham_risk']].head()

Unnamed: 0,SEQN,framingham_risk
0,93718.0,0.096741
1,93735.0,0.158174
2,93742.0,0.812324
3,93758.0,0.68219
4,93759.0,0.244564


In [16]:
# Define high risk > 20% 10-year risk
df['high_risk'] = (df['framingham_risk'] > 0.20).astype(int)

df[['framingham_risk', 'high_risk']].head()

Unnamed: 0,framingham_risk,high_risk
0,0.096741,0
1,0.158174,0
2,0.812324,1
3,0.68219,1
4,0.244564,1


In [17]:
df.to_csv('..\data\df.csv', index=False)