In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
datfra = pd.read_csv("D:/ASU/IFT 511/IFT 511/Project/diabetes_binary_5050split_health_indicators_BRFSS2015.csv", encoding='latin-1')

# Check the dimensions of the dataset
print("Dataset shape: ", datfra.shape)

# Check for missing values and drop rows or columns with missing values
# Use the dropna() method to remove any missing values
# Explain any removed row/column and the number of missing values in it
datfra = datfra.dropna()

# Get the features and target columns from the dataset
San = datfra.iloc[:, :-1]
Raf = datfra.iloc[:, -1]

# Use StratifiedKFold to randomly split the data into K equal folds
strakf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create lists to store the accuracy scores for Gini and Entropy
gin_scr = []
entro_scr = []

# Create a for loop that iterates over the 5 folds
for fold, (train_index, test_index) in enumerate(strakf.split(San, Raf)):
    print("Fold:", fold+1)

    # Get the training and testing data for this fold
    San_train, San_test = San.iloc[train_index], San.iloc[test_index]
    Raf_train, Raf_test = Raf.iloc[train_index], Raf.iloc[test_index]

    # Create a decision tree classifier
    dcttree = DecisionTreeClassifier(max_depth=San.shape[1]*10)


    # Create a parameter grid for GridSearchCV
    paramet_grd = {'criterion': ['gini', 'entropy'],
                  'max_depth': [10, 20, 30]}

    # Use GridSearchCV to find the best parameter values
    grd_srch = GridSearchCV(dcttree, paramet_grd, cv=4, scoring='accuracy')
    grd_srch.fit(San_train, Raf_train)

    # Get the best parameter values
    bst_crit = grd_srch.best_params_['criterion']
    bst_dpth = grd_srch.best_params_['max_depth']
    print("Best parameters:", bst_crit, bst_dpth)

    # Train the decision tree classifier with the best parameter values
    dcttree = DecisionTreeClassifier(criterion=bst_crit, max_depth=bst_dpth)
    dcttree.fit(San_train, Raf_train)

    # Test the decision tree classifier and compute the accuracy score
    Raf_pred = dcttree.predict(San_test)
    accrcy = accuracy_score(Raf_test, Raf_pred)
    print("Accuracy:", accrcy)

    # Store the accuracy score for this fold and impurity measure
    if bst_crit == 'gini':
        gin_scr.append(accrcy)
    else:
        entro_scr.append(accrcy)


# Compute the overall accuracy for Gini and Entropy
gini_accrcy = np.mean(gin_scr)
entropy_accrcy = np.mean(entro_scr)

# Print the overall accuracy for Gini and Entropy
print("Gini accuracy:", gini_accrcy)
print("Entropy accuracy:", entropy_accrcy)

# Determine which impurity measure gave the best results
if gini_accrcy > entropy_accrcy:
    print("Gini gave the best results")
else:
    print("Entropy gave the best results")

Dataset shape:  (70692, 22)
Fold: 1
Best parameters: entropy 10
Accuracy: 0.3341820496499045
Fold: 2
Best parameters: entropy 10
Accuracy: 0.3302920998656199
Fold: 3
Best parameters: gini 10
Accuracy: 0.3261423115009195
Fold: 4
Best parameters: entropy 10
Accuracy: 0.33597397085867875
Fold: 5
Best parameters: gini 10
Accuracy: 0.3313764323100863
Gini accuracy: 0.32875937190550286
Entropy accuracy: 0.333482706791401
Entropy gave the best results
