In [1]:
import kagglehub

path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/pima-indians-diabetes-database?dataset_version_number=1...


100%|██████████| 8.91k/8.91k [00:00<00:00, 11.9MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/pima-indians-diabetes-database/versions/1





In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

path = "/root/.cache/kagglehub/datasets/uciml/pima-indians-diabetes-database/versions/1"

df = pd.read_csv(f"{path}/diabetes.csv")

df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
import numpy as np

In [4]:
df = df.to_numpy()

In [6]:
df.shape

(768, 9)

In [5]:
x_data = df[:, :-1]
y_data = df[:, -1]

In [7]:
# size = int(x_data.shape[0] * 0.8)
# x_train = x_data[:size]
# y_train = y_data[:size]
# x_test = x_data[size:]
# y_test = y_data[size:]

In [8]:
# x_train_norm = (x_train - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)
# x_test_norm = (x_test - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)

In [9]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

In [10]:
def loss(y_pred,y):
  m = len(y)
  return (-1 / m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))


In [117]:
def grad_descent_logic(x_train, y_train, epochs=10000, learning_rate=0.00001):
  theta = np.zeros(x_train.shape[1])

  m, n= x_train.shape
  dt = np.zeros(x_train.shape[1])

  for i in range(epochs):
    y_pred = sigmoid(np.dot(x_train, theta))
    cost = loss(y_pred,y_train)
    error = y_pred - y_train
    dt[0] = np.sum(error) / m
    for j in range(1, n):
            dt[j] = np.sum(error * x_train[:, j]) / m
    theta -= learning_rate * dt
  return theta


**Confusion Matrix**

In [102]:
def confusion_matrix(y_true, y_pred):
    TP = np.sum((y_pred == 1) & (y_true == 1))
    TN = np.sum((y_pred == 0) & (y_true == 0))
    FP = np.sum((y_pred == 1) & (y_true == 0))
    FN = np.sum((y_pred == 0) & (y_true == 1))

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")



**K-fold split for cross validation**

In [89]:
def k_fold_split(x_data, y_data, k=5):
    indices = np.arange(len(x_data))   #767 indeces
    np.random.shuffle(indices)

    fold_size = len(x_data) // k  #767/5 = 153
    folds = []

    for i in range(k):
        start = i * fold_size
        end = start + fold_size if i != k - 1 else len(x_data)
        test_idx = indices[start:end]
        train_idx = np.concatenate([indices[:start], indices[end:]])

        x_train, x_test = x_data[train_idx], x_data[test_idx]
        y_train, y_test = y_data[train_idx], y_data[test_idx]

        folds.append((x_train, y_train, x_test, y_test))
    print("k-fold split done")
    return folds

**Cross Validation**

In [81]:
def predict(X, theta):
    return (sigmoid(np.dot(X, theta)) >= 0.5).astype(int)

In [120]:

def cross_validate(x_data, y_data, k=5, epochs=70000, learning_rate=0.0001):
    folds = k_fold_split(x_data, y_data, k)
    accuracies, precisions, recalls, f1_scores = [], [], [], []

    for i, (x_train, y_train, x_test, y_test) in enumerate(folds):
        print(f"\nFold {i+1}/{k}")
        theta = grad_descent_logic(x_train, y_train, epochs, learning_rate)

        y_pred = predict(x_test, theta)

        TP = np.sum((y_pred == 1) & (y_test == 1))
        TN = np.sum((y_pred == 0) & (y_test == 0))
        FP = np.sum((y_pred == 1) & (y_test == 0))
        FN = np.sum((y_pred == 0) & (y_test == 1))

        accuracy = (TP + TN) / (TP + TN + FP + FN)
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    print("\nAverage")
    print(f"Accuracy: {np.mean(accuracies):.4f}")
    print(f"Precision: {np.mean(precisions):.4f}")
    print(f"Recall: {np.mean(recalls):.4f}")
    print(f"F1-score: {np.mean(f1_scores):.4f}")



In [121]:
cross_validate(x_data, y_data)

k-fold split done

Fold 1/5
Accuracy: 0.6013, Precision: 0.6250, Recall: 0.2899, F1-score: 0.3960

Fold 2/5
Accuracy: 0.5752, Precision: 0.3125, Recall: 0.3191, F1-score: 0.3158

Fold 3/5
Accuracy: 0.6863, Precision: 0.4565, Recall: 0.4773, F1-score: 0.4667

Fold 4/5
Accuracy: 0.6013, Precision: 0.5000, Recall: 0.3443, F1-score: 0.4078

Fold 5/5
Accuracy: 0.6667, Precision: 0.4324, Recall: 0.3404, F1-score: 0.3810

Average
Accuracy: 0.6261
Precision: 0.4653
Recall: 0.3542
F1-score: 0.3934
