# Practical session on logistic regression

In [1]:
import pandas as pd
!pip install plotly_express
import plotly_express as px
import numpy as np
import matplotlib.pyplot as plt
import os
from mpl_toolkits import mplot3d

from plotly.offline import download_plotlyjs, init_notebook_mode
from plotly.offline import plot, iplot



# General Logistic regression class

In [2]:
#set notebook mode
init_notebook_mode(connected=True)

In [3]:
class LogisticRegression:
    # Z = X*w+p => h(x)
    def __init__(self, x_train, y_train, w, b, numOfTrainingItems, learning_rate):
        self.x_train = x_train
        self.y_train = y_train
        self.w = w
        self.b = b
        self.numOfTrainingItems = numOfTrainingItems
        self.learning_rate = learning_rate

    # Sigmoid Function (Z = w*X+p) .. w=theta1 , p=theta0
    def Sigmoid_Hypothesis(self, w, x, b):
        z = np.dot(x,w)
        return 1 / (1 + np.e ** -z)

    # Cost Function
    def Cost_Function(self):
        hypothesis = self.Sigmoid_Hypothesis(self.w, self.x_train, self.b)
        return np.dot((self.x_train.T, self.y_train - hypothesis))

    # get the values of w,b by gradient descent
    def gradient_descent(self):
        hypothesis = self.Sigmoid_Hypothesis(self.w, self.x_train, self.b)
        self.w += (self.learning_rate * np.dot(self.x_train.T,(self.y_train - hypothesis)) / self.numOfTrainingItems)
        self.b += (self.learning_rate * np.sum(self.y_train - hypothesis) / self.numOfTrainingItems)
        return [self.w, self.b]

    # training the data
    def train(self):
        for i in range(10000):
            self.w, self.b = self.gradient_descent()
        return [self.w, self.b]

    # predict new values
    def predict(self, x_test):
        hypothesis = self.Sigmoid_Hypothesis(self.w, x_test, self.b)
        return [1 if val >= 0.5 else 0 for val in hypothesis]

    def calc_accuracy(self, y_test, y_predicted):
        cnt = 0
        for i in range(len(y_test)):
            if y_test[i] == y_predicted[i]:
                cnt += 1
        return cnt / len(y_test)


# First test on consumer data

In [4]:
# loading the data (req1)
customerData = pd.read_csv('./customer_data.csv')

In [5]:

# shuffle the data
customerData.sample(frac=1)

Unnamed: 0,age,salary,purchased
164,51,134000,0
279,34,72000,0
256,21,16000,0
42,31,74000,0
325,49,141000,1
...,...,...,...
105,47,113000,1
218,30,89000,0
135,32,100000,1
113,28,84000,0


In [6]:
# X => values of features 1,2 (age, salary) & Y => values of output (purchased)
NumberOfFeatures=2
X = customerData.iloc[:, 0:2].values
Y = customerData.iloc[:, 2:3].values
# random initial values for w and b
w = np.random.random((NumberOfFeatures, 1))
b = np.random.random()

In [7]:
# Feature Scaling using minmax normalization
X0_temp = []
X1_temp = []
for i in X:
    X0_temp.append(i[0])
    X1_temp.append(i[1])
minVal = min(X0_temp)
maxVal = max(X0_temp)
for i in range(len(X0_temp)):
    X0_temp[i] = (X0_temp[i] - minVal) / (maxVal - minVal)

minVal = min(X1_temp)
maxVal = max(X1_temp)
for i in range(len(X1_temp)):
    X1_temp[i] = (X1_temp[i] - minVal) / (maxVal - minVal)

X = X.astype(float)
for i in range(len(X)):
    X[i][0] = (X0_temp[i])
    X[i][1] = (X1_temp[i])

In [8]:
# Split the dataset into training and testing sets (req2)
X_Train = X[0:320]
Y_Train = Y[0:320]
X_Test = X[320:]
Y_Test = Y[320:]
numOfTrainingItems = int(X_Train.size / 2)
print("Number of training points: ",numOfTrainingItems)
print("X train shape:",X_Train.shape)
print("Y train shape:",Y_Train.shape)

Number of training points:  320
X train shape: (320, 2)
Y train shape: (320, 1)


In [9]:
# logistic regression (req3)
LogisticReg = LogisticRegression(X_Train, Y_Train, w, b, numOfTrainingItems, 0.2)
w, b = LogisticReg.train()

### TODO: try with other splitting options for data

In [10]:
# predictions on new data (req4)
Y_Predict = LogisticReg.predict(X_Test)
print(f"{Y_Predict}")

# Calculate the accuracy (req5)
print(f"Accuracy : {LogisticReg.calc_accuracy(Y_Test, Y_Predict) * 100}%")

[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
Accuracy : 58.75%


# Second test on scikitlearn classification data

In [11]:
!pip install scikit-learn



In [12]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0,max_iter=500).fit(X, y)
clf.score(X, y)

0.9733333333333334

### TODO: try with other settings, inspect the scikit-learn documentation