In [None]:
# CS634 Final Term Project: Diabetes Prediction

## Overview
This project compares three machine learning models—Random Forest, LSTM, and KNN—for predicting diabetes from the clinical measurements (glucose, blood pressure, BMI, etc.) in `data/diabetes.csv`. The notebook demonstrates:
- Data preprocessing steps.
- Training and evaluation of the models.
- Performance metrics and visualizations for comparison.


import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(input_path, output_folder):
    """
    Preprocess the dataset and write train/test splits to CSV files.

    Steps:
    - Treat zeros in the columns listed in ``columns_to_impute`` as missing.
    - Split the data into stratified training (80%) and testing (20%) sets.
    - Fill missing values with medians computed on the training set.
    - Normalize features with a StandardScaler fitted on the training set.
    - Save X_train, X_test, y_train and y_test to CSV files.

    The medians and the scaler are fitted on the training split only, so no
    statistics derived from the test rows leak into the training data.

    Args:
        input_path: Path of the raw CSV file. It must contain an "Outcome"
            column and the columns listed in ``columns_to_impute``.
        output_folder: Directory that receives the four CSV files; it is
            created when it does not exist yet.
    """
    print("Starting preprocessing.......")

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")

    df = pd.read_csv(input_path)
    print("Data loaded successfully")

    columns_to_impute = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
    # mask() inserts NaN where the condition holds, which keeps the columns
    # numeric (float); replacing with pd.NA would turn integer columns into
    # object dtype before the median is taken.
    df[columns_to_impute] = df[columns_to_impute].mask(df[columns_to_impute] == 0)

    # Force a strict 0/1 target: anything that is not exactly 1 becomes 0.
    df["Outcome"] = (df["Outcome"] == 1).astype(int)

    X = df.drop("Outcome", axis=1)
    y = df["Outcome"]

    # Split BEFORE computing any statistics so the test set stays unseen.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print("Data split into training and testing sets")

    # Work on copies so the assignments below do not touch slices of `X`.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Medians come from the training rows only and are reused for the test rows.
    train_medians = X_train[columns_to_impute].median()
    X_train[columns_to_impute] = X_train[columns_to_impute].fillna(train_medians)
    X_test[columns_to_impute] = X_test[columns_to_impute].fillna(train_medians)
    print("Missing values handled")

    # Fit the scaler on the training rows, then apply the same transform to
    # the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    pd.DataFrame(X_train_scaled).to_csv(f"{output_folder}/X_train.csv", index=False)
    pd.DataFrame(X_test_scaled).to_csv(f"{output_folder}/X_test.csv", index=False)
    pd.DataFrame(y_train).to_csv(f"{output_folder}/y_train.csv", index=False)
    pd.DataFrame(y_test).to_csv(f"{output_folder}/y_test.csv", index=False)

    print("Preprocessed data saved successfully in", output_folder)


if __name__ == "__main__":
    preprocess_data(input_path="data/diabetes.csv", output_folder="data")


knn_model.py



import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import os

def train_knn():
    """
    Fit a 5-neighbour KNN classifier on the preprocessed split and report test metrics.

    Reads the train/test CSV files from ``data/``, prints the confusion
    matrix, the classification report and the ROC-AUC score, then saves the
    ROC curve to ``outputs/roc_curve_knn.png`` and displays it.
    """
    features_train = pd.read_csv("data/X_train.csv")
    features_test = pd.read_csv("data/X_test.csv")
    # Labels are stored as a single CSV column; flatten them to 1-D arrays.
    labels_train = pd.read_csv("data/y_train.csv").values.ravel()
    labels_test = pd.read_csv("data/y_test.csv").values.ravel()

    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(features_train, labels_train)
    print("KNN model trained successfully.")

    predictions = classifier.predict(features_test)
    # Second column of predict_proba is the probability of class 1.
    positive_scores = classifier.predict_proba(features_test)[:, 1]

    print("\nConfusion Matrix:")
    print(confusion_matrix(labels_test, predictions))

    print("\nClassification Report:")
    print(classification_report(labels_test, predictions))

    auc_value = roc_auc_score(labels_test, positive_scores)
    print(f"ROC-AUC Score: {auc_value:.2f}")

    false_pos_rate, true_pos_rate, _ = roc_curve(labels_test, positive_scores)
    curve_label = f"ROC curve (area = {auc_value:.2f})"

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, color="darkorange", label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle="--", color="navy")
    plt.title("ROC Curve - KNN")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid()

    output_folder = "outputs"
    plot_path = os.path.join(output_folder, "roc_curve_knn.png")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    plt.savefig(plot_path)
    print(f"ROC curve saved as {plot_path}")
    plt.show()


if __name__ == "__main__":
    train_knn()


lstm_model.py

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import os

def train_lstm():
    """
    Train an LSTM model using preprocessed data and evaluate its performance.

    Reads the train/test CSV files from ``data/``, reshapes every row into a
    one-step sequence, trains a two-layer LSTM binary classifier for 20
    epochs, prints the confusion matrix, the classification report and the
    ROC-AUC score, then saves the ROC curve to
    ``outputs/roc_curve_lstm.png`` and displays it.
    """
    X_train = pd.read_csv("data/X_train.csv").values
    X_test = pd.read_csv("data/X_test.csv").values
    # Flatten the single-column label files to 1-D arrays, matching how the
    # KNN and Random Forest scripts load the same files.
    y_train = pd.read_csv("data/y_train.csv").values.ravel()
    y_test = pd.read_csv("data/y_test.csv").values.ravel()

    # Insert a length-1 axis so the data is 3-D: (rows, 1, columns).
    X_train = np.expand_dims(X_train, axis=1)
    X_test = np.expand_dims(X_test, axis=1)

    model = Sequential([
        # Declare the input with an explicit Input object instead of passing
        # `input_shape` to the first layer.
        tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(64, activation='tanh', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='tanh'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    print("LSTM model compiled successfully.")

    # NOTE(review): the test split is passed as validation data, so the
    # per-epoch validation numbers are computed on the same rows as the
    # final evaluation below.
    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)
    print("LSTM model trained successfully.")

    # Sigmoid outputs are thresholded at 0.5 to obtain class labels.
    y_prob = model.predict(X_test).ravel()
    y_pred = (y_prob > 0.5).astype(int)

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    roc_auc = roc_auc_score(y_test, y_prob)
    print(f"ROC-AUC Score: {roc_auc:.2f}")

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})", color="darkorange")
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color="navy", linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve - LSTM")
    plt.legend(loc="lower right")
    plt.grid()

    output_folder = "outputs"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    plot_path = os.path.join(output_folder, "roc_curve_lstm.png")
    plt.savefig(plot_path)
    print(f"ROC curve saved as {plot_path}")
    plt.show()


if __name__ == "__main__":
    train_lstm()



random_forest.py


import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import os

def train_random_forest():
    """
    Train a Random Forest model using preprocessed data and evaluate its performance.

    Reads the train/test CSV files from ``data/``, fits a 100-tree forest
    with a fixed random state, prints the confusion matrix, the
    classification report and the ROC-AUC score, then saves the ROC curve to
    ``outputs/roc_curve_rf.png`` and displays it.
    """
    train_features = pd.read_csv("data/X_train.csv")
    test_features = pd.read_csv("data/X_test.csv")
    # Label files hold a single column; flatten them to 1-D arrays.
    train_labels = pd.read_csv("data/y_train.csv").values.ravel()
    test_labels = pd.read_csv("data/y_test.csv").values.ravel()

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_features, train_labels)
    print("Random Forest model trained successfully.")

    predicted_labels = forest.predict(test_features)
    # Second column of predict_proba is the probability of class 1.
    predicted_scores = forest.predict_proba(test_features)[:, 1]

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, predicted_labels))

    print("\nClassification Report:")
    print(classification_report(test_labels, predicted_labels))

    area_under_curve = roc_auc_score(test_labels, predicted_scores)
    print(f"ROC-AUC Score: {area_under_curve:.2f}")

    x_rates, y_rates, _ = roc_curve(test_labels, predicted_scores)
    legend_text = f"ROC curve (area = {area_under_curve:.2f})"

    plt.figure(figsize=(8, 6))
    plt.plot(x_rates, y_rates, color="darkorange", label=legend_text)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle="--", color="navy")
    plt.title("ROC Curve - Random Forest")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid()

    # Make sure the destination folder is present before saving the figure.
    output_folder = "outputs"
    plot_path = os.path.join(output_folder, "roc_curve_rf.png")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    plt.savefig(plot_path)
    print(f"ROC curve saved as {plot_path}")
    plt.show()


if __name__ == "__main__":
    train_random_forest()
