In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse
import math


In [6]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

INFERENCE = False

np.random.seed(5)


# Data Preprocessing

In [7]:
data_path = "diabetes.csv"
dataset = pd.read_csv(data_path)
# split data into test and training set
display(dataset)
# delete duplicate
dataset = dataset.drop_duplicates()

print(len(dataset))
dataset = dataset[dataset["BMI"] != 0]
dataset = dataset[dataset["BloodPressure"] != 0]

print(len(dataset))

# split data with equal class distrobution
def split_dataset(dataset,train_fraction=0.8):
    train = dataset.sample(frac=train_fraction)
    test = dataset.drop(train.index)
    return train,test


# to maintain class distrobution
train,test = split_dataset(dataset)
train,validation = split_dataset(train)


# posative_samples = dataset[dataset.Outcome == 1]
# negative_samples = dataset[dataset.Outcome == 0]

# validation_frac,test_frac,train_frac = (0.1,0.2,0.7)


# validation_p = posative_samples.sample(frac=validation_frac)
# validation_n = negative_samples.sample(frac=validation_frac)

# posative_samples = posative_samples.drop(validation_p.index)
# negative_samples = negative_samples.drop(validation_n.index)

# validation = pd.concat([validation_p,validation_n])


# train_p = posative_samples.sample(frac=train_frac)
# train_n = negative_samples.sample(frac=train_frac)

# posative_samples = posative_samples.drop(train_p.index)
# negative_samples = negative_samples.drop(train_n.index)

# train = pd.concat([train_p,train_n])


# test_p = posative_samples.sample(frac=test_frac)
# test_n = negative_samples.sample(frac=test_frac)

# posative_samples = posative_samples.drop(test_p.index)
# negative_samples = negative_samples.drop(test_n.index)

# test = pd.concat([test_p,test_n])


# split test and train into features(x) and labels(y)



train_labels = train["Outcome"]
train_features = train.drop(columns=["Outcome"])

test_labels = test["Outcome"]
test_features = test.drop(columns=["Outcome"])

validation_labels = validation["Outcome"]
validation_features = validation.drop(columns=["Outcome"])


# plots data
# fig, ax = plt.subplots(figsize=(12, 12))
# sns.pairplot(data=train,hue="Outcome")

# print distrobution across sets
print(len(validation[validation["Outcome"] == 1])/len(validation))
print(len(test[test["Outcome"] == 1])/len(test))
print(len(train[train["Outcome"] == 1])/len(train))
# display sample of the data
print("Sample of test data:")
display(test.sample(10))



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


768
729
0.36752136752136755
0.3767123287671233
0.3283261802575107
Sample of test data:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
68,1,95,66,13,38,19.6,0.334,25,0
279,2,108,62,10,278,25.3,0.881,22,0
746,1,147,94,41,0,49.3,0.358,27,1
672,10,68,106,23,49,35.5,0.285,47,0
227,3,162,52,38,0,37.2,0.652,24,1
710,3,158,64,13,387,31.2,0.295,24,0
350,4,92,80,0,0,42.2,0.237,29,0
212,7,179,95,31,0,34.2,0.164,60,0
290,0,78,88,29,40,36.9,0.434,21,0
411,1,112,72,30,176,34.4,0.528,25,0


# Custom Model

In [8]:
class LR:

    # y = β0X0 + β1X1 + β2X2+… βnXn+ ε, where β1 to βn and ε are regression coefficients.
    # get sample count for each set
    def __init__(self, learning_rate=10e-4, n_epochs=50, cutoff=0.5):

        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.cutoff = cutoff

        self.coefficents = None
        self.bias = 0.0

        self.best_coefficents = None
        self.max_score = 0

    # for training :
    def sigmoid(self, z):

        return 1.0 / (1.0 + np.exp(-z))

    def predict_proba(self, feature_sample,inference=False):

        if inference:
            z = np.dot(feature_sample, self.best_coefficents) + self.bias

        else:
            z = np.dot(feature_sample, self.coefficents) + self.bias
        return self.sigmoid(z)

    def fit(self,x,y):
        if not isinstance(x, np.ndarray):
            x = x.to_numpy()

        if not isinstance(y, np.ndarray):
            y = y.to_numpy()
        # init weights with zeros
        # unlike NNs, because of the nature of the sigmoid function its best to init weights at zero
        self.coefficents = np.zeros(x.shape[1])
        self.cost = []

        self.feature_count = x.shape[0]
        self.log_loss = {}
        self.cost = []

        losses = []
        validation_accuracy = []
        validation_f1 = []
        for n_epoch in range(1, self.n_epochs + 1):


            # compare current predictions with the known lables and get loss
            # accuracy is not a good metric here because the dataset is unbalanced
            # so it leads the model to blindly label everything as the majority class
            # loss = self.cross_entropy_loss(labels=self.labels,predictions=hypothesis)
            for i in range(self.feature_count):
                # Calculate the probability value for a feature_sample
                predictions = self.predict_proba(x[i])
                # Calculate the gradient bias
                grad_bias = predictions - y[i]
                # Calculate the gradient coefficents
                grad_weights=  x[i] * (predictions - y[i])

                self.coefficents -= self.learning_rate * grad_weights / self.feature_count
                # Update the value of b
                self.bias -= self.learning_rate * grad_bias / self.feature_count


                # Calculate the log-loss
                #Cross-entropy loss is a metric used in machine learning to measure how well a classification model performs.
                #  It measures the difference between two probability distributions.
                #  The goal is generally to get your model as close to 0 as possible.
                # The cross-entropy loss attains its minimum when the predicted probability p
                #  is close to the true value and is substantially higher when the predicted probability
                #  is far away from the true label1
            loss = -1/self.feature_count * (y[i] * np.log(predictions) + (1 - y[i]) * np.log(1 - predictions))
            losses.append(loss)
            score_value = self.score(validation_features,validation_labels)
            validation_accuracy.append(score_value["accuracy"])
            # validation_f1.append(score_value["f1"])


            # Calculate the cost fuction
            # The F1 score is a measure of a model’s accuracy on a dataset.
            #  It is used to evaluate binary classification systems,
            # which classify examples into ‘positive’ or ‘negative’.
            # The F1 score is a way of combining the precision and recall of the model,
            #  and it is defined as the harmonic mean of the model’s precision and recall.
            #  The F1 score can be interpreted as a measure of overall model performance from 0 to 1,
            #  where 1 is the best. To be more specific,
            #  F1 score can be interpreted as the model’s balanced ability to both capture positive cases (recall)
            #  and be accurate with the cases it does capture (precision)
            print(f"{bcolors.OKBLUE}iteration: {n_epoch} loss: {bcolors.ENDC}{loss}")
            print(f"{bcolors.OKBLUE}validation_accuracy - validation_f1: {bcolors.ENDC}{self.score(test_features,test_labels)}")

            self.cost.append(sum(losses))

            if self.max_score < score_value["accuracy"]:
                self.best_coefficents = self.coefficents



    # for validation :
    def predict(self, x,inference=False):
        if not isinstance(x, np.ndarray):
            x = x.to_numpy()

        self.predict_probas = []
        for i in range(x.shape[0]):
            ypred = self.predict_proba(x[i],inference=inference)
            self.predict_probas.append(ypred)

        return (np.array(self.predict_probas) >= self.cutoff) * 1.0

    def score(self, feature, label):

        ypred = self.predict(feature)
        label = label.to_numpy()
        # accuracy: using xnor predict and label
        # f1_score : using TP , FP , FN
        return {"accuracy":accuracy_score(label, ypred),"f1":f1_score(label,ypred)}




model = LR()
model.fit(train_features,train_labels)

[94miteration: 1 loss: [0m0.0009998773848823393
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.6232876712328768, 'f1': 0.0}
[94miteration: 2 loss: [0m0.0009593668358890691
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.6232876712328768, 'f1': 0.0}
[94miteration: 3 loss: [0m0.0009428689477159509
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.6232876712328768, 'f1': 0.0}
[94miteration: 4 loss: [0m0.0009314659802818775
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.6301369863013698, 'f1': 0.03571428571428572}
[94miteration: 5 loss: [0m0.0009217472169696036
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.6301369863013698, 'f1': 0.03571428571428572}
[94miteration: 6 loss: [0m0.0009130580766492739
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.636986301369863, 'f1': 0.07017543859649122}
[94miteration: 7 loss: [0m0.0009052368854218207
[94mvalidation_accuracy - validation_f1: [0m{'accuracy': 0.63

# Sklearn Model vs Custom Model

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# X_train, y_train are the training data and labels
# X_eval, y_eval are the evaluation data and labels


y_pred = model.predict(test_features,inference=True)
acc = accuracy_score(test_labels,y_pred)
f1 = f1_score(test_labels, y_pred)
print(f'OUR CODE : Accuracy: {acc} -F1 score {f1}')

clf = LogisticRegression(max_iter=100000).fit(train_features, train_labels)
y_pred = clf.predict(test_features)
acc = accuracy_score(test_labels,y_pred)
f1 = f1_score(test_labels, y_pred)
print(f'SKLEARN : Accuracy: {acc} -F1 score {f1}')

OUR CODE : Accuracy: 0.6986301369863014 -F1 score 0.4358974358974359
SCALER : Accuracy: 0.7534246575342466 -F1 score 0.5909090909090909


# Inference

In [None]:
print(f"{bcolors.HEADER}{bcolors.BOLD}Diabetes Detection Using Logistic Regression{bcolors.ENDC}\n")
print(f"{bcolors.OKBLUE}Enter information as requested. {bcolors.ENDC}")

INFERENCE = False
input_features = []


for info in train_features.columns:
    print(f"{bcolors.OKBLUE}{info}: {bcolors.ENDC}")
    feature = float(input())
    input_features.append(feature)
input_features = np.array(input_features)

0

inference = model.predict_proba(input_features,inference=True)


# np.array(input_features[:-1]).reshape(1, -1)

print(f"{bcolors.BOLD}We estimate there is a {round(inference*100,2)}% chance you have diabetes.")

print(f"Output of sklearn.LogisticRegression: {clf.predict(np.array(input_features).reshape(1,-1))}")

