# Decision Tree

## Imports
In this section we import the libraries necessary to analyse our data.

In [None]:
import matplotlib
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

## Import & Pre-process Data
Data source: Cardiovascular Disease dataset on Kaggle — https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset/data

In [58]:
# Load the semicolon-delimited dataset
df = pd.read_csv("cardio_train.csv", sep=";")

# Remove rows that contain missing values, then the columns that are not
# used further on (id, weight, height)
df = df.dropna().drop(columns=['id', 'weight', 'height'])

# Row-wise average of the high (ap_hi) and low (ap_lo) blood pressure values
bp_columns = ['ap_hi', 'ap_lo']
df['bp_mean'] = df[bp_columns].mean(axis='columns')

# Categorize average blood pressure for classification
def categorize_bp(mean_bp):
    """Map a mean blood pressure value to a category label.

    Returns 'High' for values strictly above 120, 'Normal' for values from
    80 up to and including 120, and 'Low' for everything else.
    """
    if mean_bp > 120:
        return 'High'
    return 'Normal' if mean_bp >= 80 else 'Low'

# Apply the categorization function
df['bp_category'] = df['bp_mean'].apply(categorize_bp)

# Encode the blood pressure labels with a dedicated encoder so that its
# fitted label-to-integer mapping stays available for decoding later.
# Re-fitting one shared encoder on another column would overwrite it.
bp_encoder = LabelEncoder()
df['bp_category_encoded'] = bp_encoder.fit_transform(df['bp_category'])

# Fresh, unfitted encoder under the original name for the age categories
# that are encoded further down in this cell.
encoder = LabelEncoder()

# Convert ages to whole years via integer division by 365
# (assumes `age` is recorded in days — TODO confirm against the dataset docs)
df['ages_year'] = df['age'] // 365

# Categorize ages into age groups
def categorize_age(ages_year):
    """Map an age in years to its age-group label.

    Ages below 18 give 'Under 18'. Otherwise the first bracket whose
    inclusive upper bound is not exceeded is returned, and anything
    that matches no bracket gives '80+'.
    """
    # (inclusive upper bound, label) pairs in ascending order
    brackets = (
        (21, '18-21'),
        (40, '21-40'),
        (60, '41-60'),
        (80, '61-80'),
    )

    if ages_year < 18:
        return 'Under 18'

    for upper_bound, label in brackets:
        if ages_year <= upper_bound:
            return label

    return '80+'

# Label every row with its age group
df['age_category'] = df['ages_year'].map(categorize_age)

# Turn the age-group labels into integer codes
df['age_category_encoded'] = encoder.fit_transform(df['age_category'])

# Preview the first rows of the processed frame
df.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bp_mean,bp_category,bp_category_encoded,ages_year,age_category,age_category_encoded
0,18393,2,110,80,1,1,0,0,1,0,95.0,Normal,2,50,41-60,1
1,20228,1,140,90,3,1,0,0,1,1,115.0,Normal,2,55,41-60,1
2,18857,1,130,70,3,1,0,0,0,1,100.0,Normal,2,51,41-60,1
3,17623,2,150,100,1,1,0,0,1,1,125.0,High,0,48,41-60,1
4,17474,1,100,60,1,1,0,0,0,0,80.0,Normal,2,47,41-60,1


## Fit Data

In [None]:
# Fixed seed so that the train/test split and the fitted tree, and therefore
# the reported accuracy, are identical on every Restart & Run All.
RANDOM_STATE = 42

# Feature matrix and target column
X = df[['gender', 'age_category_encoded', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bp_category_encoded']]
Y = df['cardio']

# Split data into training and testing (library default split proportions)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)

# NOTE(review): the name `tree` shadows the `sklearn.tree` module imported at
# the top of the notebook. It is kept unchanged in case later cells refer to
# the fitted classifier by this name.
tree = DecisionTreeClassifier(random_state=RANDOM_STATE)

tree.fit(X_train, Y_train)

# Score the fitted tree on the held-out test rows
Y_pred = tree.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.65


## Display Data