In [None]:
## add necessary imports
## import library, upload .csv file

import pandas as pd
import io

# Location of the heart-disease CSV (saved in the files section of the worksheet).
heart_data_main = '/content/AISummative_FinalHeartDiseaseDataset_27042022.csv'
heart_data = pd.read_csv(filepath_or_buffer=heart_data_main)

## Step 1: Specify the prediction target (the maximum heart rate)

# List every column name so the target and the candidate features can be chosen.
columns = heart_data.columns
print(columns)

# y is the column the model will learn to predict: MaxHR.
y = heart_data["MaxHR"]
print(y)

## Step 2: Create X (the prediction features, i.e. the columns used to predict the maximum heart rate, y)

# Every feature below is a value that can be pre-logged into the wearable.
feature_names = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "SexNum",
    "FastingBS",
    "NormalRECG",  # the normal resting ECG, which can be logged
]

# X is the DataFrame restricted to exactly those columns, in the order listed above.
X = heart_data[feature_names]
print(X)

## Step 3: Specify and fit model
from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so that repeated runs build the same tree.
# fit() returns the estimator itself, so construction and fitting can be chained.
heart_model = DecisionTreeRegressor(random_state=1).fit(X, y)

## Step 4: Make predictions

# Predict MaxHR for the first 10 rows of X. These rows were part of the data
# the model was just fitted on.
first_rows = X.head(10)
predictions = heart_model.predict(first_rows)
print(predictions)

## Step 5: Evaluate results

# Compare the actual MaxHR values with the model's predictions for the SAME
# rows. `predictions` holds 10 values (made from X.head(10)), so 10 actual
# values are shown as well to keep the two outputs aligned row by row.
print("Actual MaxHR (first 10 rows)")
print(y.head(10))
print("\n")
print("Predicted MaxHR (first 10 rows)")
print(predictions)


## Observation: the predicted MaxHR values are exactly the same as the actual ones!
## This is expected rather than impressive: the model is being asked to predict
## rows it was fitted on, so this comparison says nothing about how well it
## performs on data it has not seen.

Index(['Age', 'Sex', 'SexNum', 'ChestPainType', 'RestingBP', 'Cholesterol',
       'FastingBS', 'RestingECG', 'NormalRECG', 'STRECG', 'MaxHR',
       'ExerciseAngina', 'Oldpeak', 'STSlope', 'HeartDisease'],
      dtype='object')
0      172
1      156
2       98
3      108
4      122
      ... 
913    132
914    141
915    115
916    174
917    173
Name: MaxHR, Length: 918, dtype: int64
     Age  RestingBP  Cholesterol  SexNum  FastingBS  NormalRECG
0     40        140          289       1          0           1
1     49        160          180       0          0           1
2     37        130          283       1          0           0
3     48        138          214       0          0           1
4     54        150          195       1          0           1
..   ...        ...          ...     ...        ...         ...
913   45        110          264       1          0           1
914   68        144          193       1          1           1
915   57        130          131   

In [None]:


## Step 1: Split the data (into testing & training data)

from sklearn.model_selection import train_test_split

# Divide both the features (X) and the target (y) into a training part and a
# validation part. No test_size/train_size is passed, so the library's default
# split proportions apply; random_state pins the shuffle so the split is the
# same on every run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
## Step 2: Specify and fit the model

# Create a new decision tree regressor. Note that this rebinds `heart_model`,
# replacing the model that was fitted on the full data set earlier.
heart_model = DecisionTreeRegressor(random_state=1)

# Fit using the training split only; the validation rows stay unseen.
# Kept as the cell's last expression so the fitted estimator is displayed.
heart_model.fit(X=train_X, y=train_y)

DecisionTreeRegressor(random_state=1)

In [None]:
## Step 3: Make predictions with the validation data

# Predictions for every validation row (used later for the error metrics) and
# for just the first 10 validation rows (for display).
val_predictions = heart_model.predict(val_X)
val_predictions_head = heart_model.predict(val_X.head(10))

# Predictions for the first 10 TRAINING rows, made with the same split-trained
# model. The older `predictions` variable is deliberately not used here: it was
# produced from X.head(10) by the model fitted on the full data set, so it does
# not correspond to the train_X rows printed below.
train_predictions_head = heart_model.predict(train_X.head(10))

print("Validation predictions")
print(val_predictions_head)
print("\n")

print("Training data predictions")
print(train_predictions_head)
print("\n")

print("Validation Data (val_X)")
print(val_X.head(10))
print("\n")

print("Training Data (train_X)")
print(train_X.head(10))
print("\n")


# How to read this output:
#
# The training rows were used to fit the model, so its predictions for them
# say little about real-world performance (a decision tree can reproduce the
# values it was fitted on).
#
# The validation rows were NOT used to fit the model. Predictions for them are
# therefore not "more accurate" -- they will usually match the true values less
# closely -- but they give an unbiased picture of how the model behaves on data
# it has never seen, which is what matters for a deployed model.

Validation predictions
[160. 162. 125. 120. 154. 165. 154. 105. 142.  84.]


Training data predictions
[172. 156.  98. 108. 122. 170. 170. 142. 130. 120.]


Validation Data (val_X)
     Age  RestingBP  Cholesterol  SexNum  FastingBS  NormalRECG
900   58        114          318       1          0           0
570   56        128          223       1          0           0
791   51        140          298       1          0           1
189   53        180          285       1          0           0
372   63        185            0       1          0           1
191   50        170          209       1          0           0
643   58        112          230       1          0           0
474   62        131            0       1          0           1
65    37        120          260       0          0           1
890   64        170          227       1          0           0


Training Data (train_X)
     Age  RestingBP  Cholesterol  SexNum  FastingBS  NormalRECG
41    54        130      

In [None]:
## Step 4: Calculate the Mean Absolute Error in Validation Data

from sklearn.metrics import mean_absolute_error

# Validation MAE: the average absolute gap between the true MaxHR values and
# the predicted ones, in the same units as MaxHR.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print("Validation MAE: ", val_mae, "\n", sep="\n")

# Mean Absolute Percentage Error on the same validation predictions.
# The library returns this as a fraction (e.g. 0.21 means about 21%), not as a
# number on a 0-100 scale.
from sklearn.metrics import mean_absolute_percentage_error

print("MAE percentage error: ")
print(mean_absolute_percentage_error(y_true=val_y, y_pred=val_predictions), "\n", sep="\n")

Validation MAE: 
26.621739130434783


MAE percentage error: 
0.21299503271677955


