In [895]:
'''
CS6140 Assignment 2
Question 3 - Logistic Regression and Perceptron
Jun 3 2022
'''

'\nCS6140 Assignment 2\nQuestion 3 - Logistic Regression and Perceptron\nJun 3 2022\n'

In [896]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split    #Library to split training and testing data
from sklearn.metrics import classification_report       #Library to compute accuracy, precison and recall

In [897]:
## Read the dataset

In [898]:
def load_data(data_dir):
    ''' data: input features
        labels: output features
    '''
    dataset = pd.read_csv(data_dir)  
    
    features_names = dataset.columns.values[:-1]        #Equipvalent to feature_cols = ['feature1', 'feature2', 'feature3', 'feature4']
    features_data = dataset[features_names] 
    labels_name = dataset.columns.values[-1]   
    labels_data = dataset[labels_name]

    return features_names, features_data, labels_name, labels_data
    
    #return features_names, labels_name

In [899]:
#Part 1

In [900]:
def sigmoid(z):
    """The sigmoid function."""
    sig = 1.0/(1.0+np.exp(-z))
    sig = np.minimum(sig, 0.9999)  # Set upper bound
    sig = np.maximum(sig, 0.0001)  # Set lower bound
    return sig

In [901]:
## Implement the loss function for logistic regression

def compute_cost(ip, op, params):
    """
    Cost function in linear regression where the cost is calculated
    ip: input variables
    op: output variables
    params: corresponding parameters
    Returns cost
    """
   
    num_of_samples = len(op)

    y_hat = sigmoid(np.dot(ip, params))
    #nll = sum((-op * np.log(y_hat)) - ((1- op)*np.log(1-y_hat))) + alpha/2 * np.dot(params.transpose(), params)
    nll = sum((-op * np.log(y_hat)) - ((1- op)*np.log(1-y_hat)))  
    cost = nll
    gradient = np.dot(ip.transpose(), (y_hat - op))  

    '''
    print("y_hat", y_hat)
    print("params", params)
    print("gradient", gradient)
    print("cost", cost)
    print("\n")
    '''
    
#     print (cost_sum)
    return cost, gradient

In [902]:
#Part 2

In [903]:
def logistic_regression_using_batch_gradient_descent(ip, op, params, alpha, num_iter, batch_size = 1):
    """
    Compute the params for logistic regression using batch gradient descent
    ip: input variables
    op: output variables
    params: corresponding parameters
    alpha: learning rate
    max_iter: maximum number of iterations
    Returns parameters, cost, params_store
    """ 
    # initialize iteration, number of samples, cost and parameter array
    pass
    
    #batchify the data into mini-batches
    pass
    
    # Compute the cost and store the params for the corresponding cost
    cost_list = []
    for i in range(num_iter):
        cost, gradient = compute_cost(ip, op, params)
        params = params - (alpha * gradient)
        cost_list.append(cost)
    
    return params, cost_list

In [904]:
#Version 1 : normalized data by z-score
def featureNormalization_zscore(X):
    """
    Take in numpy array of X values and return normalize X values,
    the mean and standard deviation of each feature
    """
    mean=np.mean(X,axis=0)
    std=np.std(X,axis=0)
    
    X_norm = (X - mean)/std
    
    return X_norm , mean , std

In [905]:
#Version 2 : normalized data by min max normalization
def featureNormalization_minmax(X):
    """
    Take in numpy array of X values and return normalize X values,
    the mean and standard deviation of each feature
    """
    mean=np.mean(X,axis=0)
    min=np.min(X, axis=0)
    max=np.max(X, axis=0)
    
    X_norm = (X - min)/(max - min)
    std = 0
    
    return X_norm , mean , std

In [906]:
#Logistic Regression predictor
def classifierPredict(theta,X):
    """
    take in numpy array of theta and X and predict the class 
    """
    predictions = X.dot(theta)
    
    '''
    print("X", X)
    print("predictions", predictions)
    '''
    return np.where(predictions >= 0.5, 1, 0)

In [907]:
#Part 3

In [908]:
class Perceptron:
  # constructor 
  def __init__ (self, num_of_x_features, alpha):
    #self.weight = np.random.randn(num_of_x_features) / np.sqrt(num_of_x_features)
    #self.W = np.random.randn(num_of_x_features) / np.sqrt(num_of_x_features)
    self.W = np.zeros((num_of_features+1))
    self.b = None
    self.alpha = alpha

  # model
  def step(self, x):
    return 1 if x > 0 else 0

  # fitting the model
  def fit(self, X, y, epochs): 
    error = 0
    for epoch in range(epochs):
      for (x, target) in zip(X, y):
        y_hat = self.step(np.dot(x, self.W))
        if y_hat != target:
          error += (target - y_hat) * x
      self.W += self.alpha * error
          
  '''
  #SGD
  def fit(self, X, y, epochs): 

    for epoch in range(epochs):
      #error = 0
      for (x, target) in zip(X, y):
        y_hat = self.step(np.dot(x, self.W))

        if y_hat != target:
          error = target - y_hat
          self.W += self.alpha * error * x
      #print("self.W", self.W)
  '''

      
  # predictor to predict on the data based on weight
  def predict(self, X):
    return self.step(np.dot(X, self.W))


In [909]:
def evaluate(y_test, y_predict):
  '''return the accuracy scores'''
  
  #compute accuracy
  y_test_array = y_test.to_numpy()    #convert dataframe to numpy array
  total_num_samples = y_test.shape[0]

  matching_count = np.count_nonzero(y_test_array == y_predict)
  accuracy = matching_count/total_num_samples
  print("matching:", matching_count, "total:", total_num_samples)

  #compute precision
  true_positive = np.sum((y_test_array == 1) & (y_predict == 1))
  false_positive = np.sum((y_test_array == 0) & (y_predict == 1))
  false_negative = np.sum((y_test_array == 1) & (y_predict == 0))
  precision = true_positive/(true_positive + false_positive)

  print("true_positive", true_positive)
  print("false_positive", false_positive)
  print("false_negative", false_negative)
  
  #compute recall
  recall = true_positive/(true_positive + false_negative)

  #put computation result into dataframe
  performance_matrix = pd.DataFrame({"accuracy:":[accuracy], "precision":[precision], "recall":[recall], "total number of samples":[total_num_samples]})
  print("\nperformance"); print(performance_matrix.to_string(index=False), "\n")

  return

In [910]:
#Part 4

In [911]:
#Train the model
#reserve the test data, do not use them for cross-validation!

#load data
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/CS6140 Assignment2/default of credit card clients2.csv"
features_names, features_data, labels_name, labels_data = load_data(DATA_DIR)

#split data
x_train, x_test, y_train, y_test = train_test_split(features_data, labels_data, test_size=0.2, random_state = 42) #x_train datatype <class 'pandas.core.frame.DataFrame'>

#setup numpy print environment
np.set_printoptions(suppress=True)      #with np set to printoptions, the printout would not be in scientific e to the power of x format

In [912]:
#predict by logistic regression - normalized data
num_of_features = x_train.shape[1]      #x_train.shape (3984, 25)

#normalize data
x_train_minmax_normalized, x_train_mean, x_train_std = featureNormalization_minmax(x_train)
x_test_minmax_normalized, x_test_mean, x_test_std = featureNormalization_minmax(x_test)

#add bias column
x_train_minmax_normalized = np.append(np.ones((x_train_minmax_normalized.shape[0],1)), x_train_minmax_normalized, axis=1)       #x_train datatype <class 'numpy.ndarray'>, append bias with all ones at the end of dataset
x_test_minmax_normalized = np.append(np.ones((x_test_minmax_normalized.shape[0],1)), x_test_minmax_normalized, axis=1)       #x_train datatype <class 'numpy.ndarray'>, append bias with all ones at the end of dataset

#reshape true label for calculation
y_train_reshaped = y_train.values.reshape(x_train.shape[0],1)
y_test_reshaped = y_test.values.reshape(x_test.shape[0],1)

#Initialize parameters
alpha=1
num_iters = 200
batch_size = 1
params = np.zeros((num_of_features+1,1))  #initialize params as 0 by rows as num of x_train columns, and 1 single column

#train the model - run batch gradient descent
params, cost = logistic_regression_using_batch_gradient_descent(x_train_minmax_normalized, y_train_reshaped, params, alpha, num_iters, batch_size)

#test the model by prediction
#y_normalized_logistic_prediction = []
y_predict = classifierPredict(params, x_test_minmax_normalized).transpose()   #has to transpose y_predict for y_test element wise comparison
print("prediction", y_predict )
print("y_test", y_test.to_numpy())

#compute accuracy of the model
print("manual calculation")
evaluate(y_test, y_predict)
print("sklearn classification report")
print(classification_report(y_test, classifierPredict(params, x_test_minmax_normalized)))


  This is separate from the ipykernel package so we can avoid doing imports until


prediction [[0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
  0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0
  0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
  0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
  0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0

In [913]:
#predict by logistic regression - raw data

#add bias column
x_test = np.append(np.ones((x_test.shape[0],1)), x_test, axis=1)          #append bias with all ones at the end of dataset
x_train = np.append(np.ones((x_train.shape[0],1)), x_train, axis=1)

#Initialize parameters
alpha=1
num_iters = 100
batch_size = 1
params = np.zeros((num_of_features+1,1))

#train the model - run batch gradient descent
params, cost = logistic_regression_using_batch_gradient_descent(x_train, y_train_reshaped, params, alpha, num_iters, batch_size)

#test the model by prediction
y_predict = classifierPredict(params, x_test).transpose() 
p = classifierPredict(params, x_test)
print("prediction", y_predict)
print("y_test", y_test.to_numpy())

#compute accuracy of the model
evaluate(y_test, y_predict)
print(classification_report(y_test,p))


  This is separate from the ipykernel package so we can avoid doing imports until


prediction [[1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1
  1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 1
  0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1
  1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0
  0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0
  1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1 1 0 1 1 1 1 1
  0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0
  0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0
  0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1
  0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0
  1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 1
  0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 0 1 1 0 0 0 1 1 1 1 0 1 1 1 1 1
  0 1 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 0
  0 0 0 0 0 1 1 1 1 0 0 0 1

In [914]:
#predict by perceptron regression - raw data

#Initialize parameters
alpha=1
num_iters = 100
batch_size = 1
params = np.zeros((num_of_features+1,1))

y_prediction = []

#train the model 
perceptron1 = Perceptron(num_of_features, alpha)
perceptron1.fit(x_train, y_train, epochs = num_iters)

#test the model
for (x, target) in zip(x_test, y_test):
  pred = perceptron1.predict(x)
  y_prediction.append(pred)
	
print("y_prediction",  y_prediction)

#compute accuracy of the model
evaluate(y_test, y_predict)
print(classification_report(y_test, y_prediction))

y_prediction [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [915]:
#predict by perceptron regression - normalized data

#Initialize parameters
alpha=1
num_iters = 100
batch_size = 1
params = np.zeros((num_of_features+1,1))

y_normalized_prediction = []

#train the model 
perceptron2 = Perceptron(num_of_features, alpha)
perceptron2.fit(x_train_minmax_normalized, y_train, epochs = num_iters)

#test the model
for (x, target) in zip(x_test_minmax_normalized, y_test):
  pred = perceptron2.predict(x)
  y_normalized_prediction.append(pred)

print("y_normalized_prediction",  y_normalized_prediction)

#compute accuracy of the model
evaluate(y_test, y_predict)
print(classification_report(y_test, y_normalized_prediction))

y_normalized_prediction [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 