# Dataset Exploration

This notebook explores the Pima Indians Diabetes dataset and gathers the summary information needed for the dissertation.

In [8]:
# imports
from matplotlib import pyplot as plt
import numpy as np

import os

# import data class
from utilities.dataset_utils import DiabetesData

In [9]:
# Seed NumPy's global random generator so that random draws repeat across runs.
np.random.seed(seed=42)

In [10]:
# diabetes.csv is looked up in the 'utilities' directory that sits one level
# above the current working directory.
working_dir = os.getcwd()
path = os.path.join(working_dir, '..', 'utilities', 'diabetes.csv')

# Hand the CSV location to the project's DiabetesData class.
dataset = DiabetesData(path)

In [11]:
# Names of the eight feature columns, in the order they appear in the table.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

In [12]:
# Fetch the complete dataset from the DiabetesData wrapper.
# NOTE(review): the rendered output suggests a pandas DataFrame holding an
# "Unnamed: 0" column, the eight feature columns and "Outcome" -- confirm
# against utilities.dataset_utils.
data = dataset.get_all_data()
# Bare last expression so the notebook renders the table below the cell.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [13]:
# Report the smallest and largest values found in the Pregnancies column.
pregnancies = data['Pregnancies']
print(f"Min Pregnancies: {pregnancies.min()}")
print(f"Max Pregnancies: {pregnancies.max()}")

Min Pregnancies: 0
Max Pregnancies: 17


In [14]:
# Report the smallest and largest values found in the Glucose column.
# NOTE(review): the recorded output shows a minimum of 0, which looks like a
# missing-value placeholder rather than a real reading -- confirm.
glucose = data['Glucose']
print(f"Min Glucose: {glucose.min()}")
print(f"Max Glucose: {glucose.max()}")

Min Glucose: 0
Max Glucose: 199


In [15]:
# Report the smallest and largest values found in the BloodPressure column.
# NOTE(review): the recorded output shows a minimum of 0, which looks like a
# missing-value placeholder rather than a real reading -- confirm.
blood_pressure = data['BloodPressure']
print(f"Min Blood Pressure: {blood_pressure.min()}")
print(f"Max Blood Pressure: {blood_pressure.max()}")

Min Blood Pressure: 0
Max Blood Pressure: 122


In [16]:
# Report the smallest and largest values found in the SkinThickness column.
skin_thickness = data['SkinThickness']
print(f"Min Skin Thickness: {skin_thickness.min()}")
print(f"Max Skin Thickness: {skin_thickness.max()}")

Min Skin Thickness: 0
Max Skin Thickness: 99


In [17]:
# Report the smallest and largest values found in the Insulin column.
insulin = data['Insulin']
print(f"Min Insulin: {insulin.min()}")
print(f"Max Insulin: {insulin.max()}")

Min Insulin: 0
Max Insulin: 846


In [18]:
# Report the smallest and largest values found in the BMI column.
# NOTE(review): the recorded output shows a minimum of 0.0, which looks like a
# missing-value placeholder rather than a real measurement -- confirm.
bmi = data['BMI']
print(f"Min BMI: {bmi.min()}")
print(f"Max BMI: {bmi.max()}")

Min BMI: 0.0
Max BMI: 67.1


In [19]:
# Report the smallest and largest values found in the
# DiabetesPedigreeFunction column.
pedigree = data['DiabetesPedigreeFunction']
print(f"Min Diabetes Pedigree Function: {pedigree.min()}")
print(f"Max Diabetes Pedigree Function: {pedigree.max()}")

Min Diabetes Pedigree Function: 0.078
Max Diabetes Pedigree Function: 2.42


In [20]:
# Report the smallest and largest values found in the Age column.
age = data['Age']
print(f"Min Age: {age.min()}")
print(f"Max Age: {age.max()}")

Min Age: 21
Max Age: 81


In [21]:
# Class balance: the positive count is the sum of the Outcome column, and the
# negative count is every remaining row.
# NOTE(review): this assumes Outcome only holds 0/1 labels -- confirm.
positive_count = data['Outcome'].sum()
negative_count = len(data) - positive_count
print(f"Count of Positive Samples: {positive_count}")
print(f"Count of Negative Samples: {negative_count}")

Count of Positive Samples: 268
Count of Negative Samples: 500
