In [2]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


## Loading and Preprocessing Heart Attack Data

Load the `heart_attack_prediction_dataset.csv` in a pandas DataFrame called `df_heart`

In [4]:
# Loading data
file_path = Path("../Resources/heart_attack_prediction_dataset.csv")
df_heart = pd.read_csv(file_path)
df_heart.head()


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


Create the target vector by assigning the values of the `Default` column from the `df_heart` DataFrame.

In [8]:
# Define target vector
y = df_heart.values.reshape(-1, 1)
y[:5]


array([['BMW7812'],
       [67],
       ['Male'],
       [208],
       ['158/88']], dtype=object)

Split the data into training and testing sets.

In [9]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

ValueError: Found input variables with inconsistent numbers of samples: [8763, 227838]

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_testing` DataFrames should be scaled.

In [7]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [8]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [9]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

Once the data is scaled, create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [10]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [11]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

## Making Predictions Using the Random Forest Model

Validate the trained model by predicting loan defaults using the testing data (`X_test_scaled`).

In [12]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [13]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,330,14
Actual 1,36,145


Accuracy Score : 0.9047619047619048
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       344
           1       0.91      0.80      0.85       181

    accuracy                           0.90       525
   macro avg       0.91      0.88      0.89       525
weighted avg       0.91      0.90      0.90       525



## Feature Importance

In this section, you are asked to fetch the features' importance from the random forest model and display the top 10 most important features.

In [15]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.32525509355569393, 'Term'),
 (0.08731610602488035, 'Year'),
 (0.07749126212056351, 'Amount'),
 (0.04154318603507449, 'Zip'),
 (0.0359707771436633, 'NoEmp'),
 (0.030281829481746146, 'RealEstate'),
 (0.028696674498168147, 'Month'),
 (0.024032270489503894, 'CreateJob'),
 (0.020296840586558236, 'RevLineCr'),
 (0.017578173845109207, 'Bank_BANK OF AMERICA NATL ASSOC')]

## Analysis Questions

Finally, analyse the model's evaluation results and answer the following questions.

* **Question 1:** Would you trust this model to detect if a loan will default? 

 * **Sample Answer:** Yes. The model's accuracy is good a predicting if a loan will default because of the high accuracy and F-1 and recall scores. 


* **Question 2:** What are your insights about the top 10 most important features?

 * **Sample Answer:** It seems that the "Bank" is not relevant for the model, so we can create a new random forest model by only taking the top 5 to 10 features. Also, for piloting this model in a business environment, we will only need to fetch new data about these features.