In [20]:
# Setup
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# Defining the Logistic Regression model (binary classifier) from scratch
class Logistic_Regression():
  """Binary logistic-regression classifier trained by batch gradient descent.

  Args:
    learning_rate: step size multiplied into every gradient update.
    no_of_iterations: number of full-batch updates performed by ``fit``.

  After ``fit`` the instance exposes ``w`` (weights, shape ``(n,)``), ``b``
  (bias), ``m`` / ``n`` (training rows / columns) and ``X`` / ``Y`` (the
  training data converted to float arrays).
  """

  def __init__(self, learning_rate, no_of_iterations):
    self.learning_rate = learning_rate
    self.no_of_iterations = no_of_iterations

  @staticmethod
  def _sigmoid(z):
    """Return 1 / (1 + exp(-z)) element-wise without overflowing."""
    # exp(-log(1 + exp(-z))) is algebraically the sigmoid; logaddexp evaluates
    # log(exp(0) + exp(-z)) in a way that stays finite for large |z|, so no
    # overflow RuntimeWarning is emitted for extreme logits.
    return np.exp(-np.logaddexp(0, -z))

  def fit(self, X, Y):
    """Learn ``w`` and ``b`` from the training data.

    Args:
      X: 2-D array-like (ndarray, DataFrame, nested list) of shape (m, n).
      Y: array-like holding one label per row of ``X``; a flat sequence, a
         pandas Series or an (m, 1) column are all accepted.
         NOTE(review): labels are presumably 0/1, matching what ``predict``
         returns -- confirm against callers.

    Raises:
      ValueError: if ``X`` is not 2-D, is empty, or its row count differs
        from the number of labels.
    """
    X = np.asarray(X, dtype=float)
    # Flatten the labels: an (m, 1) column would otherwise broadcast against
    # the (m,) activations into an (m, m) matrix and silently corrupt w.
    Y = np.asarray(Y, dtype=float).ravel()

    if X.ndim != 2:
      raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    if X.shape[0] == 0:
      raise ValueError("X must contain at least one row")
    if X.shape[0] != Y.shape[0]:
      raise ValueError(
        f"X has {X.shape[0]} rows but Y has {Y.shape[0]} labels")

    # number of data points in the dataset (number of rows)  -->  m
    # number of input features in the dataset (number of columns)  --> n
    self.m, self.n = X.shape

    # initiating weight & bias value
    self.w = np.zeros(self.n)
    self.b = 0.0
    self.X = X
    self.Y = Y

    # implementing Gradient Descent
    for _ in range(self.no_of_iterations):
      self.update_weights()

  def update_weights(self):
    """Apply one full-batch gradient-descent step to ``w`` and ``b``."""
    # predicted probabilities for every training row
    a = self._sigmoid(self.X.dot(self.w) + self.b)
    error = a - self.Y

    # gradients of the mean log-loss with respect to weights and bias
    dw = (1/self.m) * np.dot(self.X.T, error)
    db = (1/self.m) * np.sum(error)

    # updating using gradient descent
    self.w = self.w - self.learning_rate * dw
    self.b = self.b - self.learning_rate * db

  def predict(self, X):
    """Return an ndarray of 0/1 labels for the rows of ``X``.

    A row is labelled 1 when its predicted probability is strictly greater
    than 0.5, otherwise 0. ``fit`` must have been called first.
    """
    X = np.asarray(X, dtype=float)
    Y_pred = self._sigmoid(X.dot(self.w) + self.b)
    return np.where(Y_pred > 0.5, 1, 0)

In [22]:
# Read the diabetes CSV (resolved relative to the working directory) into a DataFrame
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [23]:
# Peek at the first five rows to sanity-check the columns and value ranges
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
# Dimensions of the loaded frame as (rows, columns)
diabetes_dataset.shape

(768, 9)

In [25]:
# Average of every feature within each Outcome class, to compare the two groups
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [26]:
# Separate the label column from the predictor columns
target = diabetes_dataset['Outcome']
features = diabetes_dataset.drop(columns='Outcome')

In [27]:
# Implementing Scaler
scaler = StandardScaler()
scaler.fit(features)
scaled_data = scaler.transform(features)

In [28]:
features = scaled_data

In [29]:
# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.2, random_state=2)

In [30]:
# Instantiate the from-scratch logistic-regression classifier and train it
# on the training split
model = Logistic_Regression(no_of_iterations=1000, learning_rate=0.01)
model.fit(X_train, Y_train)

In [31]:
# Score the fitted model on the same rows it was trained on
X_train_pred = model.predict(X_train)
training_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_pred)

In [32]:
# Report the fraction of training rows whose predicted label matches Y_train
print('Accuracy score of the training data : ', training_accuracy)

Accuracy score of the training data :  0.7768729641693811


In [33]:
# Score the fitted model on the held-out rows it has not seen during training
X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_pred)

In [34]:
# Report the fraction of held-out rows whose predicted label matches Y_test
print('Accuracy score of the test data : ', test_accuracy)

Accuracy score of the test data :  0.7662337662337663
