## 1. Data Loading and Preparation

In [1]:
import pandas as pd

**Data Dictionary**

This section clarifies the meaning of our columns, especially the coded features.

* **age_years**: Age of the patient (in years).
* **gender**: 1: Male, 2: Female
* **height_cm**: Height in centimeters.
* **weight_kg**: Weight in kilograms.
* **systolic_bp**: Systolic blood pressure (the "upper" number).
* **diastolic_bp**: Diastolic blood pressure (the "lower" number).
* **cholesterol**: 1: Normal, 2: Above Normal, 3: Well Above Normal
* **glucose**: 1: Normal, 2: Above Normal, 3: Well Above Normal
* **is_smoker**: 0: No, 1: Yes
* **is_alcoholic**: 0: No, 1: Yes
* **is_active**: 0: No, 1: Yes
* **target**: 0: No cardiovascular disease, 1: Presence of cardiovascular disease

In [2]:
df = pd.read_csv('../data/cardio_train.csv', sep=';')
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# rename columns for better clarity
column_mapping = {
    'age': 'age_days',
    'height': 'height_cm',
    'weight': 'weight_kg',
    'ap_hi': 'systolic_bp',
    'ap_lo': 'diastolic_bp',
    'gluc': 'glucose',
    'smoke': 'is_smoker',
    'alco': 'is_alcoholic',
    'active': 'is_active',
    'cardio': 'target'
}

df = df.rename(columns=column_mapping)

In [4]:
# convert age from days to years
df['age_years'] = (df['age_days'] / 365.25).round().astype(int)

df = df.drop(columns=['id', 'age_days'])

In [5]:
final_columns = [
    'age_years',
    'gender',
    'height_cm',
    'weight_kg',
    'systolic_bp',
    'diastolic_bp',
    'cholesterol',
    'glucose',
    'is_smoker',
    'is_alcoholic',
    'is_active',
    'target'
]

df = df[final_columns]

In [6]:
df.head()

Unnamed: 0,age_years,gender,height_cm,weight_kg,systolic_bp,diastolic_bp,cholesterol,glucose,is_smoker,is_alcoholic,is_active,target
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,48,1,156,56.0,100,60,1,1,0,0,0,0


## 2. Exploratory Data Analysis (EDA)

In [7]:
print("Target Variable Distribution (0 = No Disease, 1 = Disease):")
target_distribution = df['target'].value_counts(normalize=True)
print(target_distribution)

Target Variable Distribution (0 = No Disease, 1 = Disease):
target
0    0.5003
1    0.4997
Name: proportion, dtype: float64


In [8]:
from sklearn.metrics import mutual_info_score

In [9]:
categorical_features = [
    'gender', 
    'cholesterol', 
    'glucose', 
    'is_smoker', 
    'is_alcoholic', 
    'is_active'
]

numerical_features = [
    'age_years', 
    'height_cm', 
    'weight_kg', 
    'systolic_bp', 
    'diastolic_bp'
]

In [10]:
# --- 1. Mutual Information (for Categorical) ---
# This tells us how much "information" a feature gives us about the target

def mutual_info_target_score(series):
    return mutual_info_score(series, df['target'])

print("--- Mutual Information (Categorical Features) ---")
# We apply this function to all categorical columns
mi_scores = df[categorical_features].apply(mutual_info_target_score)
print(mi_scores.sort_values(ascending=False))

print("\n")

# --- 2. Correlation (for Numerical) ---
# This tells us how linearly related a feature is to the target
# We use .abs() because a strong negative correlation (-0.8) is just
# as important as a strong positive one (0.8).

print("--- Correlation (Numerical Features) ---")
correlation_scores = df[numerical_features].corrwith(df['target']).abs()
print(correlation_scores.sort_values(ascending=False))

--- Mutual Information (Categorical Features) ---
cholesterol     0.025351
glucose         0.004223
is_active       0.000636
is_smoker       0.000120
gender          0.000033
is_alcoholic    0.000027
dtype: float64


--- Correlation (Numerical Features) ---
age_years       0.237802
weight_kg       0.181660
diastolic_bp    0.065719
systolic_bp     0.054475
height_cm       0.010821
dtype: float64


## 3. Validation Framework

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the Data ---
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [13]:
# Then, split the 80% (full_train) into 75% (train) and 25% (validation)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [14]:
# get the 'target' column (our y) from each dataframe
y_train = df_train['target'].values
y_val = df_val['target'].values
y_test = df_test['target'].values

# delete the 'target' column from the feature dataframes
df_train = df_train.reset_index(drop=True)

del df_val['target']
df_val = df_val.reset_index(drop=True)

del df_test['target']
df_test = df_test.reset_index(drop=True)


# Check the results ---
print(f"Training set size:   {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Test set size:       {len(df_test)}")

Training set size:   42000
Validation set size: 14000
Test set size:       14000


## 4. Feature Engineering (One-Hot Encoding)

In [15]:
from sklearn.feature_extraction import DictVectorizer

In [16]:
# Initialize the vectorizer
dv = DictVectorizer(sparse=False)

In [17]:
# Convert our training data (df_train) into a dictionary format
train_dicts = df_train[categorical_features + numerical_features].to_dict(orient='records')

In [18]:
# Fit and transform the training data to create the feature matrix X_train
X_train = dv.fit_transform(train_dicts)

In [19]:
# Convert our validation data (df_val) into a dictionary format
val_dicts = df_val[categorical_features + numerical_features].to_dict(orient='records')

In [20]:
# Transform the validation data using the *same* fitted vectorizer
X_val = dv.transform(val_dicts)

In [21]:
# Check the results
print(f"Original DataFrame shape (train): {df_train.shape}")
print(f"New Feature Matrix shape (X_train): {X_train.shape}")
print("\nFeature names created by DictVectorizer:")
print(dv.get_feature_names_out()[:10]) # Print first 10 feature names

Original DataFrame shape (train): (42000, 12)
New Feature Matrix shape (X_train): (42000, 11)

Feature names created by DictVectorizer:
['age_years' 'cholesterol' 'diastolic_bp' 'gender' 'glucose' 'height_cm'
 'is_active' 'is_alcoholic' 'is_smoker' 'systolic_bp']


In [22]:
X_train

array([[ 50.,   1.,  80., ...,   0., 150.,  84.],
       [ 61.,   1.,  80., ...,   1., 150.,  78.],
       [ 52.,   1.,  80., ...,   0., 120.,  65.],
       ...,
       [ 61.,   1.,  80., ...,   0., 120.,  70.],
       [ 48.,   1.,  80., ...,   0., 120.,  53.],
       [ 42.,   1.,  90., ...,   0., 130.,  80.]], shape=(42000, 11))

## 5. Model Training: Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [27]:
# initialize the StandardScaler
scaler = StandardScaler()

In [29]:
#FIT the scaler ONLY on the training data (X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [30]:
# solver='lbfgs' as it's the modern default
model = LogisticRegression(solver='lbfgs', random_state=1)

In [31]:
# Train (fit) the model on our 'X_train' feature matrix
# and our 'y_train' target vector
model.fit(X_train_scaled, y_train)
print("Model training complete!")

Model training complete!


## 6. Model evaluation (Validation set)

In [32]:
from sklearn.metrics import accuracy_score

In [34]:
# scaled validation features (X_val_scaled)
y_pred_val = model.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred_val)

print(f"--- Baseline Model (Logistic Regression) ---")
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

--- Baseline Model (Logistic Regression) ---
Validation Accuracy: 71.83%


## 7. Model training: Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(X_train_scaled, y_train)

print("Random Forest model training complete!")

Random Forest model training complete!


In [43]:
# Evaluate the Random Forest Model
y_pred_rf = rf_model.predict(X_val_scaled)
rf_accuracy = accuracy_score(y_val, y_pred_rf)

print(f"Random Forest Accuracy:     {rf_accuracy * 100:.2f}%")

Random Forest Accuracy:     70.31%


## 8. Final model selection and saving

In [None]:
import pickle
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

print("--- Starting Final Model Training ---")

# --- 1. Define Categorical and Numerical Features ---
categorical_features = [
    'gender', 'cholesterol', 'glucose', 'is_smoker', 'is_alcoholic', 'is_active'
]
numerical_features = [
    'age_years', 'height_cm', 'weight_kg', 'systolic_bp', 'diastolic_bp'
]

# --- 2. Combine Train + Validation data for final training ---
df_full_train_final = pd.concat([df_train, df_val])
y_full_train_final = pd.concat([pd.Series(y_train), pd.Series(y_val)])


# --- 3. Define Preprocessing Pipeline ---

# Helper function for ColumnTransformer
def to_dicts(df):
    return df.to_dict(orient='records')

preprocessor = ColumnTransformer(
    [
        ('num_scaler', StandardScaler(), numerical_features),
        ('cat_encoder', make_pipeline(
            FunctionTransformer(to_dicts),
            DictVectorizer(sparse=False)
        ), categorical_features)
    ],
    remainder='drop'
)

# --- 4. Create the Final End-to-End Pipeline ---
#
# Step 1: Run the preprocessor
# Step 2: Run the Logistic Regression model
final_pipeline = make_pipeline(
    preprocessor,
    LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
)

# --- 5. Train the Final Pipeline ---
print("Training the full pipeline (this may take a moment)...")
final_pipeline.fit(df_full_train_final, y_full_train_final)
print("Pipeline training complete!")

# --- 6. Save the Pipeline to a File ---
#
output_file = 'model.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump(final_pipeline, f_out)

print(f"Final Pipeline saved to: {output_file}")

--- Starting Final Model Training ---
Training the full pipeline (this may take a moment)...


Pipeline training complete!
Final Pipeline saved to: model.bin
