In [52]:
# Importing Modules
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import confusion_matrix, classification_report

## Data Collection

The following dataset was downloaded from Kaggle; it originally comes from the National Institute of Diabetes and Digestive and Kidney Diseases: https://www.kaggle.com/datasets/mathchi/diabetes-data-set

In [30]:
# Load the diabetes dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Preprocessing

In [31]:
# Remove every observation that has a NaN in at least one column, and report
# the row count before and after so the effect of the step is visible.
# NOTE(review): the preview of the raw data shows 0 for Insulin and
# SkinThickness in several rows. If those zeros stand for missing
# measurements they are not NaN and will not be removed here -- confirm
# against the dataset documentation.
rowsBeforeDrop = df.shape[0]
df = df.dropna(axis=0, how="any")
rowsAfterDrop = df.shape[0]

print("Number of Observations before dropping missing values: ", rowsBeforeDrop)
print("Number of Observations after dropping missing values: ", rowsAfterDrop)

Number of Observations before dropping missing values:  768
Number of Observations after dropping missing values:  768


This shows us that the dataset contains no NaN values: every feature is filled in for every observation. Let us look for outliers using another method. We can use z-scores to find and eliminate observations in which any feature has a z-score outside the range -3 to 3 (i.e., an absolute z-score greater than 3).

In [32]:
# Flag observations that contain at least one extreme feature value.
# A value is considered extreme when its absolute z-score (computed per
# column) is greater than `threshold`.
threshold = 3

# Screen only the feature columns. "Outcome" is the class label rather than a
# measurement, so it must not take part in outlier detection even though it is
# stored as a number. `errors="ignore"` keeps the cell working if the label
# column is absent.
featureCols = df.select_dtypes(include=[np.number]).columns.drop("Outcome", errors="ignore")
zScores = np.abs(stats.zscore(df[featureCols]))

# Row-wise mask: True when any screened column exceeds the threshold.
outliers = (zScores > threshold).any(axis=1)

# Keep the flagged rows around for inspection.
outliersDf = df[outliers]

print("Number of Outliers:", outliersDf.shape[0])

Number of Outliers: 80


By using z-scores with a threshold of 3, we have identified 80 observations that contain at least one outlying value. Now let us remove these outliers from our dataframe and standardize the data.

In [47]:
# Keep only the observations that were not flagged as outliers.
noOutliersDf = df.loc[~outliers]

# The class label is excluded so that it does not take part in standardization.
updatedDf = noOutliersDf.drop(columns=["Outcome"])

# Standardize every numeric feature column.
# NOTE(review): the scaler is fitted on all remaining rows, i.e. before the
# train/test split performed later in the notebook, so the held-out rows
# contribute to the scaling statistics. Consider fitting on the training
# rows only.
scaler = StandardScaler()
colsNumeric = updatedDf.select_dtypes(include=[np.number]).columns
scaledValues = scaler.fit_transform(updatedDf[colsNumeric])
updatedDf[colsNumeric] = scaledValues

updatedDf.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.657355,0.92404,-0.028115,0.923219,-0.805266,0.210285,0.606516,1.47922
1,-0.86849,-1.177082,-0.515765,0.533462,-0.805266,-0.848063,-0.36422,-0.183265
2,1.267694,2.09133,-0.678315,-1.350366,-0.805266,-1.346999,0.764788,-0.095766
3,-0.86849,-1.043678,-0.515765,0.143704,0.238698,-0.621274,-1.011378,-1.058257
5,0.352186,-0.143197,0.134435,-1.350366,-0.805266,-0.999256,-0.891795,-0.270764


## Modeling

Here, we will implement a KNN algorithm for our diabetes dataset. The other algorithms will be implemented in RapidMiner.

In [50]:
# Build the target vector: the 'Outcome' labels of the rows that survived
# outlier removal, taken from the dataframe that is already in memory.
# The name `noOutliersDf` is kept (although it now holds a Series) because
# other cells may still reference it.
outcomeCol = df['Outcome']
noOutliersDf = outcomeCol[~outliers]

# Defining features and class variables.
# X is a fresh frame rather than an alias of `updatedDf`, so later changes to
# `updatedDf` cannot alter the feature matrix. Any 'Outcome' column is dropped
# defensively so the label can never leak into the features when cells are
# executed out of order. Both X and y get a 0..n-1 index so they stay aligned
# by label as well as by position.
X = updatedDf.drop(columns=["Outcome"], errors="ignore").reset_index(drop=True)
y = noOutliersDf.reset_index(drop=True)

# Now we have our feature set X and target y ready for the kNN implementation
assert len(X) == len(y), "features and labels must have the same number of rows"

# Creating test/train split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Give every split a clean 0..n-1 index. New objects are assigned instead of
# resetting in place, so re-running the cell is side-effect free.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Euclidean Distance for KNN
def euclidDist(r1, r2):
    """Return the Euclidean distance between two equally shaped vectors.

    The element-wise difference of `r1` and `r2` is squared, summed, and the
    square root of that sum is returned.
    """
    delta = r1 - r2
    squaredTotal = np.sum(delta ** 2)
    return np.sqrt(squaredTotal)

# Finding KNN
def findNeighbors(X_train, test_row, k):
    """Return the k rows of `X_train` that are closest to `test_row`.

    Distances are Euclidean and are computed for all training rows at once.
    Rows are selected by position, so the result is correct for any index on
    `X_train` (it does not have to be a fresh 0..n-1 range).

    Parameters:
        X_train: DataFrame of training features, one observation per row.
        test_row: Series holding the query observation; it is matched to
            `X_train` by column label.
        k: number of neighbours to return; must be at least 1.

    Returns:
        A list of at most k rows (Series) of `X_train`, nearest first. Each
        row's `.name` is its index label in `X_train`.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Column-aligned difference between every training row and the query row.
    diffs = X_train.sub(test_row, axis=1)
    dists = np.sqrt((diffs ** 2).sum(axis=1)).to_numpy()

    # A stable sort keeps the original row order among equally distant rows.
    nearestPositions = np.argsort(dists, kind="stable")[:k]
    return [X_train.iloc[pos] for pos in nearestPositions]


# predicting class based on NN
def classificationPred(X_train, y_train, test_row, k):
    """Predict the class of `test_row` by majority vote of its k nearest neighbours.

    Parameters:
        X_train: DataFrame of training features with a unique index.
        y_train: Series of training labels, in the same row order as `X_train`.
        test_row: Series holding the observation to classify.
        k: number of neighbours that take part in the vote.

    Returns:
        The most common label among the neighbours. On a tie, the label that
        appears first among the neighbours (nearest first) wins.
    """
    neighbors = findNeighbors(X_train, test_row, k)

    # Each neighbour's `.name` is an index label of X_train. Translate the
    # labels to row positions before reading y_train, so the lookup does not
    # rely on the index being a fresh 0..n-1 range.
    neighborLabels = [neighbor.name for neighbor in neighbors]
    positions = X_train.index.get_indexer(neighborLabels)
    labelsNeighbor = [y_train.iloc[pos] for pos in positions]

    prediction = Counter(labelsNeighbor).most_common(1)[0][0]
    return prediction

# Number of neighbours used for the vote (3 is used as a test case).
k = 3


def _predictRow(testRow):
    """Classify one test observation with the kNN helpers defined above."""
    return classificationPred(X_train, y_train, testRow, k)


# Classify every row of the test set, one row at a time.
y_pred = X_test.apply(_predictRow, axis=1)

y_pred.head()

0    0
1    0
2    0
3    1
4    1
dtype: int64

In [60]:

# Confusion matrix of the kNN predictions on the held-out test set.
confMat = confusion_matrix(y_test, y_pred)

# Unpack the four cells, treating class 1 as the positive class.
TN, FP = confMat[0, 0], confMat[0, 1]
FN, TP = confMat[1, 0], confMat[1, 1]

# NOTE: precision, recall and F1 below are the support-weighted averages over
# both classes, not the scores of the positive class alone.
report = classification_report(y_test, y_pred, output_dict=True)
weightedAvg = report['weighted avg']

acc = report['accuracy']
prec = weightedAvg['precision']
recall = weightedAvg['recall']
f1 = weightedAvg['f1-score']

print("KNN Algorithm Performance Results")
print("Confusion Matrix:\n", confMat)
print("\nTrue Positives:", TP)
print("False Positives:", FP)
print("False Negatives:", FN)
print("True Negatives:", TN, "\n\n")

# Print the summary metrics with three decimals each.
for metricLabel, metricValue in (
    ("Accuracy for KNN Model:", acc),
    ("Precision for KNN Model:", prec),
    ("Recall for KNN Model:", recall),
    ("F1 Score for KNN Model:", f1),
):
    print(metricLabel, f"{metricValue:.3f}")

KNN Algorithm Performance Results
Confusion Matrix:
 [[71 14]
 [25 28]]

True Positives: 28
False Positives: 14
False Negatives: 25
True Negatives: 71 


Accuracy for KNN Model: 0.717
Precision for KNN Model: 0.712
Recall for KNN Model: 0.717
F1 Score for KNN Model: 0.710


Outputting the dataframe as a CSV file for the other algorithms (SVM, Decision Tree, Random Forest, & Gradient Boosting), which will be implemented in RapidMiner.

In [51]:
# Build the export table as a separate frame instead of adding the label to
# `updatedDf` in place: the standardized feature frame is also used for
# modelling, and mutating it would put the label among the features whenever
# the modelling cell is re-run.
# The labels are read from `df` by index label, so this cell does not depend
# on what `noOutliersDf` currently holds (it is a DataFrame after the
# preprocessing cell and a Series after the modelling cell).
exportDf = updatedDf.assign(Outcome=df.loc[updatedDf.index, "Outcome"])

# Write the features plus the label column, without the index column.
exportDf.to_csv("PreprocessedDiabetes.csv", index=False)