In [1]:
"""
CS6140 Assignment 2
Question 3 - Logistic Regression and Perceptron
Wing Man, Kwok
Jun 3 2022
"""

'\nCS6140 Assignment 2\nQuestion 3 - Logistic Regression and Perceptron\nWing Man, Kwok\nJun 3 2022\n'

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split    #Library to split training and testing data
from sklearn.metrics import classification_report       #Library to compute accuracy, precison and recall

In [3]:
# Read the dataset
def load_data(data_dir):
  ''' features_names: input features names
      features_data: input features
      features_names: output features names
      features_labels: output features
  '''
  dataset = pd.read_csv(data_dir)  

  features_names = dataset.columns.values[:-1]        #Equipvalent to feature_cols = ['feature1', 'feature2', 'feature3', 'feature4']
  features_data = dataset[features_names] 
  labels_names = dataset.columns.values[-1]   
  labels_data = dataset[labels_names]

  return features_names, features_data, labels_names, labels_data

In [4]:
# Part 1 - Implementation of sigmoid function
def sigmoid(z):
  """The sigmoid function."""
  sigmoid = 1.0/(1.0+np.exp(-z))
  sigmoid = np.minimum(sigmoid, 0.9999)  # Set upper bound
  sigmoid = np.maximum(sigmoid, 0.0001)  # Set lower bound
  return sigmoid

In [5]:
# Implementation of cost function
def compute_cost(ip, op, params):
  """
  Cost function in linear regression where the cost is calculated
  ip: input variables
  op: output variables
  params: corresponding parameters
  nll : negative log likelihood
  y_hat : y hat
  gradient : gradient
  Returns cost, gradient
  """
  
  num_of_samples = len(op)

  y_hat = sigmoid(np.dot(ip, params))
  nll = sum((-op * np.log(y_hat)) - ((1- op)*np.log(1-y_hat)))  
  cost = nll
  gradient = np.dot(ip.transpose(), (y_hat - op))  

  return cost, gradient

In [6]:
#Part 2 - Implement logistic regression using batch gradient descent and evaluation 
def logistic_regression_using_batch_gradient_descent(ip, op, params, alpha, num_iter, batch_size = 1):
  """
  Compute the params for logistic regression using batch gradient descent
  ip: input variables
  op: output variables
  params: corresponding parameters
  alpha: learning rate
  num_iter: number of iterations
  cost, cost_list: error function, error function in list format
  gradient: gradient
  Returns parameters, cost
  """ 
  # initialize iteration, number of samples, cost and parameter array
  cost_list = []
  
  #batchify the data into mini-batches
  pass
  
  # Compute the cost and store the params for the corresponding cost
  for i in range(num_iter):
      cost, gradient = compute_cost(ip, op, params)
      params = params - (alpha * gradient)
      cost_list.append(cost)

  return params, cost_list

In [7]:
#Data preparation Version 1 : normalize data by z-score
def Normalize_zscore(X):
  mean = np.mean(X,axis=0)
  std = np.std(X,axis=0)
  X_norm = (X - mean)/std
  
  return X_norm 

In [8]:
#Data preparation Version 2 : normalize data by min max normalization
def Normalize_minmax(X):
  min = np.min(X, axis=0)
  max = np.max(X, axis=0)
  X_norm = (X - min)/(max - min)
  
  return X_norm

In [9]:
#Logistic Regression predictor
def Predict(theta,X):
    predictions = X.dot(theta)

    return np.where(predictions >= 0.5, 1, 0)

In [10]:
#Part 3 - Implementation of perceptron
class Perceptron:
  # constructor 
  def __init__ (self, num_of_x_features, alpha):
    #self.weight = np.random.randn(num_of_x_features + 1) / np.sqrt(num_of_x_features)
    self.weight = np.zeros((num_of_features+1))
    self.b = None
    self.alpha = alpha

  # model
  def classify(self, x):
    return 1 if x > 0 else 0

  # fitting the model by batch gradient decent
  def fit(self, X, Y, epochs): 
    gradient = 0
    for epoch in range(epochs):
      for (x, y) in zip(X, Y):
        y_hat = self.classify(np.dot(x, self.weight))
        if y_hat != y:
          gradient += (y - y_hat) * x
      self.weight = self.weight + self.alpha * gradient

  """
  # fitting the model by SGD
  def fit(self, X, y, epochs): 

    for epoch in range(epochs):
      #gradient = 0
      for (x, y) in zip(X, Y):
        y_hat = self.classify(np.dot(x, self.weight))
        if y_hat != y:
          gradient = y - y_hat
          self.weight = self.weight + self.alpha * gradient * x
  """
 
  # predictor to predict on the data based on weight
  def predict(self, X):
    return self.classify(np.dot(X, self.weight))

In [11]:
#Part 4 - Evaluate regression performance
def evaluate(y_test, y_predict):
  '''return the accuracy scores'''
  
  #compute accuracy
  y_test_array = y_test.to_numpy()    #convert dataframe to numpy array
  y_predict = y_predict.transpose()
  total_num_samples = y_test.shape[0]

  matches_count = np.count_nonzero(y_test_array == y_predict)   #return 1 to np.count, if y_true == y_predict, then count matches
  accuracy = matches_count/total_num_samples

  #compute precision
  true_positive = np.sum((y_test_array == 1) & (y_predict == 1))
  false_positive = np.sum((y_test_array == 0) & (y_predict == 1))
  false_negative = np.sum((y_test_array == 1) & (y_predict == 0))
  true_negative = np.sum((y_test_array == 0) & (y_predict == 0))
  precision = true_positive/(true_positive + false_positive)

  confusion_matrix = pd.DataFrame({"Negative":[true_negative, false_negative], "Positive":[false_positive, true_positive]}, index=['Negative', 'Positive'])
  print("\nConfusion Matrix")
  print("----------------------------------------------------\n" + "\t\t\tPredicted")
  print("Actual", confusion_matrix, "\n")
  print("matches:", matches_count, "total number of samples:", total_num_samples)
  
  #compute recall
  recall = true_positive/(true_positive + false_negative)

  #put computation result into dataframe
  performance_matrix = pd.DataFrame({"accuracy":[accuracy], "precision":[precision], "recall":[recall], "total number of samples":[total_num_samples]})
  print("\nLogistic Regression Performance")
  print("---------------------------------------------------")
  print(performance_matrix.to_string(index=False), "\n")

  return

In [12]:
#Train the model
#reserve the test data, do not use them for cross-validation!

#load data
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/CS6140 Assignment2/default of credit card clients2.csv"
features_names, features_data, labels_name, labels_data = load_data(DATA_DIR)

#split data
x_train, x_test, y_train, y_test = train_test_split(features_data, labels_data, test_size=0.2, random_state = 42) #x_train datatype <class 'pandas.core.frame.DataFrame'>

#setup numpy print environment
np.set_printoptions(suppress=True)      #with np set to printoptions, the printout would not be in scientific e to the power of x format
#np.set_printoptions(threshold=np.inf)

In [13]:
'''
#compare results with sklearn logistic regression, raw, logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.pipeline import make_pipeline

x_train_numpy = x_train.to_numpy()
y_train_numpy = y_train.to_numpy()
x_test_numpy = x_test.to_numpy()
y_test_numpy = y_test.to_numpy()

clf = LogisticRegression(class_weight = 'balanced')   #since there are 1100 "1" and rest are 0 from dataset, the prediction tends to give all 0, until i put the balanced parameters
clf.fit(x_train_numpy, y_train_numpy)
y_pred = clf.predict(x_test_numpy)
print("y_pred", y_pred)
print(accuracy_score(y_pred, y_test_numpy))
'''

'\n#compare results with sklearn logistic regression, raw, logistic regression\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import confusion_matrix, accuracy_score \nfrom sklearn.pipeline import make_pipeline\n\nx_train_numpy = x_train.to_numpy()\ny_train_numpy = y_train.to_numpy()\nx_test_numpy = x_test.to_numpy()\ny_test_numpy = y_test.to_numpy()\n\nclf = LogisticRegression(class_weight = \'balanced\')   #since there are 1100 "1" and rest are 0 from dataset, the prediction tends to give all 0, until i put the balanced parameters\nclf.fit(x_train_numpy, y_train_numpy)\ny_pred = clf.predict(x_test_numpy)\nprint("y_pred", y_pred)\nprint(accuracy_score(y_pred, y_test_numpy))\n'

In [14]:
#predict by logistic regression - normalized data
num_of_features = x_train.shape[1]      #x_train.shape (3984, 25)

#normalize data
x_train_minmax_normalized = Normalize_minmax(x_train)
x_test_minmax_normalized = Normalize_minmax(x_test)

#add bias column
x_train_minmax_normalized = np.append(np.ones((x_train_minmax_normalized.shape[0],1)), x_train_minmax_normalized, axis=1)       #x_train datatype <class 'numpy.ndarray'>, append bias with all ones at the end of dataset
x_test_minmax_normalized = np.append(np.ones((x_test_minmax_normalized.shape[0],1)), x_test_minmax_normalized, axis=1)          #x_train datatype <class 'numpy.ndarray'>, append bias with all ones at the end of dataset

#reshape true label for calculation
y_train_reshaped = y_train.values.reshape(x_train.shape[0],1)
y_test_reshaped = y_test.values.reshape(x_test.shape[0],1)

#Initialize parameters
alpha = 1
num_iters = 200
batch_size = 1
params = np.zeros((num_of_features+1,1))      #initialize params as 0, by rows as num of x_train columns, and 1 single column

#train the model by batch gradient decent
params, cost = logistic_regression_using_batch_gradient_descent(x_train_minmax_normalized, y_train_reshaped, params, alpha, num_iters, batch_size)

#test the model, get prediction
y_predict = Predict(params, x_test_minmax_normalized)   #has to transpose y_predict for y_test for element wise comparison
#print("prediction", y_predict.transpose() )
#print("y_test", y_test.to_numpy())

#compute accuracy of the model
print("Logistic Regression - MinMax Normalized")
evaluate(y_test, y_predict)
print("\nTo compare with sklearn classification_report")
print("--------------------------------------------")
print(classification_report(y_test, y_predict))


  after removing the cwd from sys.path.


Logistic Regression - MinMax Normalized

Confusion Matrix
----------------------------------------------------
			Predicted
Actual           Negative  Positive
Negative       681        90
Positive       135        90 

matches: 771 total number of samples: 996

Logistic Regression Performance
---------------------------------------------------
 accuracy  precision  recall  total number of samples
 0.774096        0.5     0.4                      996 


To compare with sklearn classification_report
--------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       771
           1       0.50      0.40      0.44       225

    accuracy                           0.77       996
   macro avg       0.67      0.64      0.65       996
weighted avg       0.76      0.77      0.76       996



In [15]:
#predict by logistic regression - raw data

#add bias column
x_test = np.append(np.ones((x_test.shape[0],1)), x_test, axis=1)          #append bias with all ones at the end of dataset
x_train = np.append(np.ones((x_train.shape[0],1)), x_train, axis=1)

#Initialize parameters
alpha=1
num_iters = 100
batch_size = 1
params = np.zeros((num_of_features+1,1))

#train the model by batch gradient descent
params, cost = logistic_regression_using_batch_gradient_descent(x_train, y_train_reshaped, params, alpha, num_iters, batch_size)

#test the model, get prediction
y_predict = Predict(params, x_test)
#print("prediction", y_predict.transpose())
#print("y_test", y_test.to_numpy())

#compute accuracy of the model
print("\nLogistic Regression - Raw")
evaluate(y_test, y_predict)
print("\nTo compare with sklearn classification_report")
print("--------------------------------------------")
print(classification_report(y_test, y_predict))


  after removing the cwd from sys.path.



Logistic Regression - Raw

Confusion Matrix
----------------------------------------------------
			Predicted
Actual           Negative  Positive
Negative       289       482
Positive        78       147 

matches: 436 total number of samples: 996

Logistic Regression Performance
---------------------------------------------------
 accuracy  precision   recall  total number of samples
 0.437751   0.233704 0.653333                      996 


To compare with sklearn classification_report
--------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.37      0.51       771
           1       0.23      0.65      0.34       225

    accuracy                           0.44       996
   macro avg       0.51      0.51      0.43       996
weighted avg       0.66      0.44      0.47       996



In [16]:
#predict by perceptron regression - raw data

#Initialize parameters
alpha=1
num_iters = 200
batch_size = 1
params = np.zeros((num_of_features+1,1))

y_prediction = []

#train the model 
perceptron1 = Perceptron(num_of_features, alpha)
perceptron1.fit(x_train, y_train, epochs = num_iters)

#test the model
for (x, target) in zip(x_test, y_test):
  pred = perceptron1.predict(x)
  y_prediction.append(pred)
#print("y_prediction",  y_prediction)

#compute accuracy of the model
print("\nPerceptron Regression - Raw")
evaluate(y_test, np.array(y_prediction))
print("\nTo compare with sklearn classification_report")
print("--------------------------------------------")
print(classification_report(y_test, y_prediction))


Perceptron Regression - Raw

Confusion Matrix
----------------------------------------------------
			Predicted
Actual           Negative  Positive
Negative       498       273
Positive       142        83 

matches: 581 total number of samples: 996

Logistic Regression Performance
---------------------------------------------------
 accuracy  precision   recall  total number of samples
 0.583333   0.233146 0.368889                      996 


To compare with sklearn classification_report
--------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.65      0.71       771
           1       0.23      0.37      0.29       225

    accuracy                           0.58       996
   macro avg       0.51      0.51      0.50       996
weighted avg       0.66      0.58      0.61       996



In [17]:
#predict by perceptron regression - normalized data

#Initialize parameters
alpha = 1
num_iters = 100
batch_size = 1
params = np.zeros((num_of_features+1,1))

y_normalized_prediction = []

#train the model 
perceptron2 = Perceptron(num_of_features, alpha)
perceptron2.fit(x_train_minmax_normalized, y_train, epochs = num_iters)

#test the model
for (x, target) in zip(x_test_minmax_normalized, y_test):
  pred = perceptron2.predict(x)
  y_normalized_prediction.append(pred)
#print("y_normalized_prediction",  y_normalized_prediction)

#compute accuracy of the model
print("\nPerceptron Regression - MinMax Normalized")
evaluate(y_test, np.array(y_normalized_prediction))
print("\nTo compare with sklearn classification_report")
print("------------------------------------------")
print(classification_report(y_test, y_normalized_prediction))


Perceptron Regression - MinMax Normalized

Confusion Matrix
----------------------------------------------------
			Predicted
Actual           Negative  Positive
Negative       725        46
Positive       155        70 

matches: 795 total number of samples: 996

Logistic Regression Performance
---------------------------------------------------
 accuracy  precision   recall  total number of samples
 0.798193   0.603448 0.311111                      996 


To compare with sklearn classification_report
------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.94      0.88       771
           1       0.60      0.31      0.41       225

    accuracy                           0.80       996
   macro avg       0.71      0.63      0.64       996
weighted avg       0.77      0.80      0.77       996



To compare the result of training and testing results of the 4 models, I have tried different values of hyper parameters, and here comes the most stable result -

logistic, normalized
learning rate = 1 and epochs = 200

logistic, raw
learning rate = 1 and epochs = 100 

preceptron, normalized 
learning rate = 1 and epochs = 100

preceptron, raw 
learning rate = 1 and epochs = 200

The number of epochs used here doesn't mean time to converge; I observed that, for example, if logistic regression of raw dataset takes more epochs, not only it doesn't help to get better accuacy, but also the model is overtrained and all prediction becomes 0.

Normalized dataset gives much higher accuracy.  From the above results, i observe 77% accuracy for logistic regression, and 80% for perceptron.  On the other hand, raw dataset can only produce 44% accuracy by logistics and 60% by perceptron.

This difference happens when features within a dataset have different ranges.  Result derivates more by larger values.  That's why we need normalization, to align the data range of individual features.

I have also tried to take one of the sets of the 4 models, logistic regression with normalized data, to verify my result with sklearn logisticregression function if my model is working well.  However, by using its default settings, the sklearn function returns even worse result (accuracy ~55%, v/s my model ~77%).  I observed that the difference is account for how sklearn handles imbalanced dataset; i have to put a "balanced" parameters.  Also the default values of hyper parameters could differ.

The results also show model performances - accuracy, precision, recall of perceptron regression is slightly better than logistics.  However, since both regressions are binary classifiers, the thresholds to put a prediction into 1 or 0 differ, logistics regression additionally brings the benefit of producing result with probabilities, but in all, in terms of accuracy, both regressions should not show a significant difference.