## Model Building & Evaluation Notebook for Random Forest Classifier Approach to Dimensionality Reduction, with Stratified KFold

In [1]:
# This notebook focuses on the random forest model with 5-fold cross-validation, 
# omitting the other three models. 
# The objective of this analysis was to evaluate whether incorporating stratified
# k-fold cross-validation enhances the model's performance

In [2]:
# Import the libraries needed

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import time
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

In [3]:
# Import data from previous notebook

X_train = pd.read_csv('Mod_3_X_train_Tree.csv')
X_test = pd.read_csv('Mod_3_X_test_Tree.csv')
Y_train = pd.read_csv('Mod_3_Y_train.csv')
Y_test = pd.read_csv('Mod_3_Y_test.csv')

In [4]:
# Check data
print('Y_train Information\n')
Y_train.info()
print('\n\nY_test Information\n')
Y_test.info()

Y_train Information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125919 entries, 0 to 125918
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   readmitted  125919 non-null  object
dtypes: object(1)
memory usage: 983.9+ KB


Y_test Information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19869 entries, 0 to 19868
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   readmitted  19869 non-null  object
dtypes: object(1)
memory usage: 155.4+ KB


In [5]:
# Change Y data to categorical
Y_train = Y_train.astype('category')
Y_test = Y_test.astype('category')

In [6]:
# Verify changes were made correctly
print('Y_train Information\n')
Y_train.info()
print('\n\nY_test Information\n')
Y_test.info()

Y_train Information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125919 entries, 0 to 125918
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   readmitted  125919 non-null  category
dtypes: category(1)
memory usage: 123.2 KB


Y_test Information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19869 entries, 0 to 19868
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   readmitted  19869 non-null  category
dtypes: category(1)
memory usage: 19.7 KB


## Random Forest Classifier

In [7]:

# Run KNN model with K-fold cross validation

# Start timer
start_time = time.time()

# Instantiate the Random Forest Classifier with 100 trees
classifier = RandomForestClassifier(n_estimators = 100, random_state = 53)

# Set up inputs for Stratified KFold cross validator
splits = 5            # number of splits
repeats = 3           # number of repeats

# Set up the tool
rskf = RepeatedStratifiedKFold(n_splits = splits, n_repeats = repeats, random_state = 53)

# Initialize a list to store accuracy scores for each fold
accuracy_scores = []

# Use the classifier to make predication with repeated stratified k-fold cross validation
for train_index, test_index in rskf.split(X_train, Y_train):
    # Subset the training and testing sets for each iteration
    X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
    Y_fold_train, Y_fold_test = Y_train.iloc[train_index], Y_train.iloc[test_index]

    # Train the classifier
    # ravel() converts Y_fold_train to a 1D array
    classifier.fit(X_fold_train, Y_fold_train.values.ravel())
    
    # Use classifier to make predictions on the test subset
    Y_pred = classifier.predict(X_fold_test)
    
    # Check the accuracy score
    accuracy = accuracy_score(Y_fold_test, Y_pred)
    
    # Add the accuracy score for this fold to the list
    accuracy_scores.append(accuracy)
    
    print(f'Accuracy: {accuracy:0.4f}')

# Turn off timer
end_time = time.time()
elapsed_time = end_time - start_time

print(f'\nTime taken: {elapsed_time} seconds\n')


Accuracy: 0.6859
Accuracy: 0.6864
Accuracy: 0.6861
Accuracy: 0.6838
Accuracy: 0.6855
Accuracy: 0.6836
Accuracy: 0.6888
Accuracy: 0.6824
Accuracy: 0.6847
Accuracy: 0.6862
Accuracy: 0.6811
Accuracy: 0.6888
Accuracy: 0.6874
Accuracy: 0.6880
Accuracy: 0.6777

Time taken: 312.84035062789917 seconds



In [8]:
# Evaluate the model on the testing set
Y_pred_test_RF = classifier.predict(X_test)
accuracy_test = accuracy_score(Y_test, Y_pred_test_RF)
print(f'\nAccuracy on Test Set: {accuracy_test:.4f}')


Accuracy on Test Set: 0.5351


In [9]:
# CHANGING THE NUMBER OF TREES IN THE FOREST
'''
# Run KNN model with K-fold cross validation

# Start timer
start_time = time.time()

# Instantiate the Random Forest Classifier with 10 trees
classifier = RandomForestClassifier(n_estimators = 10, random_state = 53)

# Set up inputs for Stratified KFold cross validator
splits = 5            # number of splits
repeats = 3           # number of repeats

# Set up the tool
rskf = RepeatedStratifiedKFold(n_splits = splits, n_repeats = repeats, random_state = 53)

# Initialize a list to store accuracy scores for each fold
accuracy_scores = []

# Use the classifier to make predication with repeated stratified k-fold cross validation
for train_index, test_index in rskf.split(X_train, Y_train):
    # Subset the training and testing sets for each iteration
    X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
    Y_fold_train, Y_fold_test = Y_train.iloc[train_index], Y_train.iloc[test_index]

    # Train the classifier
    # ravel() converts Y_fold_train to a 1D array
    classifier.fit(X_fold_train, Y_fold_train.values.ravel())
    
    # Use classifier to make predictions on the test subset
    Y_pred = classifier.predict(X_fold_test)
    
    # Check the accuracy score
    accuracy = accuracy_score(Y_fold_test, Y_pred)
    
    # Add the accuracy score for this fold to the list
    accuracy_scores.append(accuracy)
    
    print(f'Accuracy: {accuracy:0.4f}')

# Turn off timer
end_time = time.time()
elapsed_time = end_time - start_time

print(f'\nTime taken: {elapsed_time} seconds\n')

# Evaluate the model on the testing set
Y_pred_test_RF = classifier.predict(X_test)
accuracy_test = accuracy_score(Y_test, Y_pred_test_RF)
print(f'\nAccuracy on Test Set: {accuracy_test:.4f}')
'''

"\n# Run KNN model with K-fold cross validation\n\n# Start timer\nstart_time = time.time()\n\n# Instantiate the Random Forest Classifier with 10 trees\nclassifier = RandomForestClassifier(n_estimators = 10, random_state = 53)\n\n# Set up inputs for Stratified KFold cross validator\nsplits = 5            # number of splits\nrepeats = 3           # number of repeats\n\n# Set up the tool\nrskf = RepeatedStratifiedKFold(n_splits = splits, n_repeats = repeats, random_state = 53)\n\n# Initialize a list to store accuracy scores for each fold\naccuracy_scores = []\n\n# Use the classifier to make predication with repeated stratified k-fold cross validation\nfor train_index, test_index in rskf.split(X_train, Y_train):\n    # Subset the training and testing sets for each iteration\n    X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]\n    Y_fold_train, Y_fold_test = Y_train.iloc[train_index], Y_train.iloc[test_index]\n\n    # Train the classifier\n    # ravel() conv

## Random Forest Confusion Matrix and Model Evaluation

In [10]:
# Calcualte the variance of the accuracy scores
variance_of_accuracies = np.var(accuracy_scores)
print(f'Variance of Accuracy Scores: {variance_of_accuracies: .4f}')

Variance of Accuracy Scores:  0.0000


In [11]:
# *********************** THERE IS A PROBLEM WITH THIS ******************************

# Evaluate the model
# Generate the confusion matrix
confMatrix = confusion_matrix(Y_test, Y_pred_test_RF)

print('Confusion Matrix:')
print(confMatrix)

Confusion Matrix:
[[ 236  959 1044]
 [ 427 3051 3598]
 [ 411 2799 7344]]


In [12]:
# Generate the classification report
print('Classification Report:')
print(classification_report(Y_test, Y_pred_test_RF, target_names=['<30', '>30', 'NO']))

Classification Report:
              precision    recall  f1-score   support

         <30       0.22      0.11      0.14      2239
         >30       0.45      0.43      0.44      7076
          NO       0.61      0.70      0.65     10554

    accuracy                           0.54     19869
   macro avg       0.43      0.41      0.41     19869
weighted avg       0.51      0.54      0.52     19869

