# Comparative Study

## Importing Libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from feature_engineering import *

warnings.filterwarnings('ignore')
%matplotlib inline

## Defining algorithms

### Perceptron

In [2]:
class Perceptron:
    def __init__(self, lr=0.001, epochs=10):
        self.lr = lr
        self.epochs = epochs
        self.weights = None

    def train(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)

        for _ in range(self.epochs):
            y_h=np.dot(X,self.weights)
            for idx, x_i in X.iterrows():
                y_hat = np.dot(x_i, self.weights)
                if y_hat*y[idx]<=0:
                    self.weights += x_i*y[idx]

    def predict(self, X):
        y_hat=np.dot(X, self.weights)
        return np.where(y_hat >= 0, 1, -1)

### Logistic Regression

In [3]:
class LogisticRegressionGD:
    def __init__(self, learning_rate=0.01, n_iters=1000, random_state=None,threshold=0.5):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.random_state = random_state
        self.threshold=threshold
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.costs = []
        
        # set random seed for reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        # gradient descent
        for i in range(self.n_iters):
            # calculate predicted probabilities and gradients
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)
            dw = np.dot(X.T, (y_pred - y)) / n_samples
            db = np.sum(y_pred - y) / n_samples
            
            # update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            # calculate cost and add to list for graphing
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
            y_pred[y_pred == 0] = 1e-15  # add small constant value to avoid NaN in cost
            y_pred[y_pred == 1] = 1 - 1e-15  # add small constant value to avoid NaN in cost
            cost = -1/n_samples * np.sum(y * np.log(y_pred) + (1-y) * np.log(1-y_pred))
            self.costs.append(cost)
    
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_model)
        y_pred_class = [1 if i > self.threshold else 0 for i in y_pred]
        return y_pred_class
    
    def plot_cost(self):
        fig, ax = plt.subplots(figsize=(10, 8))
        plt.plot(np.arange(1, len(self.costs)+1), self.costs)
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Gradient Descent Cost Graph')
        plt.show()
        fig.savefig('unormalizedLR_graphs/'+"GD"+str(len(str(self.learning_rate)))+str(self.threshold)[-1]+".png")

In [4]:
class LogisticRegressionSGD:
    def __init__(self, learning_rate=0.01, n_iters=1000, batch_size=1, random_state=None,threshold=0.5):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.batch_size = batch_size
        self.random_state = random_state
        self.threshold=threshold
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.costs = []
        
        # set random seed for reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        # stochastic gradient descent
        for i in range(self.n_iters):
            # shuffle data
            idx = np.arange(n_samples)
            np.random.shuffle(idx)
            X_shuffled = X[idx]
            y_shuffled = y[idx]
            
            # loop over batches
            for j in range(0, n_samples, self.batch_size):
                # get mini-batch
                X_batch = X_shuffled[j:j+self.batch_size]
                y_batch = y_shuffled[j:j+self.batch_size]
                
                # calculate predicted probabilities and gradients
                linear_model = np.dot(X_batch, self.weights) + self.bias
                y_pred = self.sigmoid(linear_model)
                dw = np.dot(X_batch.T, (y_pred - y_batch)) / self.batch_size
                db = np.sum(y_pred - y_batch) / self.batch_size
                
                # update weights and bias
                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db
            
            # calculate cost and add to list for graphing
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
            y_pred[y_pred == 0] = 1e-15  # add small constant value to avoid NaN in cost
            y_pred[y_pred == 1] = 1 - 1e-15  # add small constant value to avoid NaN in cost
            cost = -1/n_samples * np.sum(y * np.log(y_pred) + (1-y) * np.log(1-y_pred))
            self.costs.append(cost)
    
    def predict_proba(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_model)
        return y_pred
    
    def predict(self, X):
        y_pred_proba = self.predict_proba(X)
        y_pred_class = [1 if i > self.threshold else 0 for i in y_pred_proba]
        return y_pred_class
    
    def plot_cost(self):
        fig, ax = plt.subplots(figsize=(10, 8))
        plt.plot(np.arange(1, len(self.costs)+1), self.costs)
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Stochastic Gradient Descent Cost Graph')
        plt.show()
        fig.savefig('unormalizedLR_graphs/'+"SGD"+str(len(str(self.learning_rate)))+str(self.threshold)[-1]+".png")

In [5]:
class LogisticRegressionMiniBatchGD:
    def __init__(self, learning_rate=0.01, n_iters=1000, batch_size=32, random_state=None,threshold=0.5):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.batch_size = batch_size
        self.random_state = random_state
        self.threshold=threshold
        
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        # initialize weights and bias to zero
        self.weights = np.zeros(X.shape[1])
        self.bias = 0
        
        # initialize costs list for storing costs at each iteration
        self.costs = []
        
        # set random seed for reproducibility
        n_samples, n_features = X.shape
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        # minibatch gradient descent
        for i in range(self.n_iters):
            # shuffle data
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            X = X[idx]
            y = y[idx]
            
            # loop over batches
            for j in range(0, X.shape[0], self.batch_size):
                # get minibatch
                X_batch = X[j:j+self.batch_size]
                y_batch = y[j:j+self.batch_size]
                
                # calculate predicted probabilities and gradients
                y_pred = self.sigmoid(np.dot(X_batch, self.weights) + self.bias)
                dw = np.dot(X_batch.T, (y_pred - y_batch)) / self.batch_size
                db = np.sum(y_pred - y_batch) / self.batch_size
                
                # update weights and bias
                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db
            
            # calculate cost and add to list for graphing
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
            y_pred[y_pred == 0] = 1e-15  # add small constant value to avoid NaN in cost
            y_pred[y_pred == 1] = 1 - 1e-15  # add small constant value to avoid NaN in cost
            cost = -1/n_samples * np.sum(y * np.log(y_pred) + (1-y) * np.log(1-y_pred))
            self.costs.append(cost)
            
        return self
        
    def predict_proba(self, X):
        return self.sigmoid(np.dot(X, self.weights) + self.bias)
    
    def predict(self, X):
        return np.where(self.predict_proba(X) > self.threshold, 1, 0)
    
    def plot_cost(self):
        fig, ax = plt.subplots(figsize=(10, 8))
        plt.plot(range(1, len(self.costs) + 1), self.costs)
        plt.xlabel('Iteration')
        plt.ylabel('Cost')
        plt.title('Logistic Regression Cost Graph')
        plt.show()
        fig.savefig('unormalizedLR_graphs/'+"minibatchGD"+str(len(str(self.learning_rate)))+str(self.threshold)[-1]+".png")

## Comparison

### Data Cleaning
    

In [24]:
for i in range(1, 11):
    df = pd.read_csv("../data/dataset1.csv")
    df=df.drop('id',axis=1)
    df_imputed = df.copy()
    mean = np.mean(df_imputed, axis=0)
    for j in range(1, df_imputed.shape[1]):
        df_imputed.iloc[:, j].fillna(mean[j-1], inplace=True)
    print(f"Result of random test: {i}")
    X_train, X_test, y_train, y_test = train_test_split(df_imputed, shuffle = True, random_state = 11*i)
    
    # Models that do not require normalisation
    # PM1
#     PM1 = Perceptron(lr = 0.01, epochs = 500)
#     PM1.train(X_train, y_train)
#     y_pred1 = PM1.predict(X_test)
#     accuracy_pm1 = evaluate(y_test, y_pred1, verbose = False)
#     print(f"PM1: {accuracy_pm1}%")
    # FLDM1
    fldm = LinearDiscriminantAnalysis(n_components=1)
    fldm.fit(X_train, y_train)
    # project the training data onto the 1-dimensional FLDM space
    X_train_lda = fldm.transform(X_train)
    mean_pos = np.mean(X_train_lda[y_train == 1])
    mean_neg = np.mean(X_train_lda[y_train == -1])
    std_pos = np.std(X_train_lda[y_train == 1])
    std_neg = np.std(X_train_lda[y_train == -1])
    threshold = (mean_pos + mean_neg) / 2
    X_test_lda = fldm.transform(X_test)
    # evaluate the performance of the model on the testing data
    y_pred = np.where(X_test_lda > threshold, 1, -1)
    accuracy_fldm1 = evaluate(y_test, y_pred, verbose = False)
    print(f"FLDM1: {accuracy_fldm1}%")
    
    # LR1
    df = pd.read_csv("../data/dataset1.csv")
    df = df.drop('id',axis=1)
    df_imputed = df.copy()
    mean = np.mean(df_imputed, axis=0)
    for j in range(1, df_imputed.shape[1]):
        df_imputed.iloc[:, j].fillna(mean[j-1], inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(df_imputed, shuffle = True, random_state = 11*i)
    print(y_test)
    y_train = y_train.map({"M": 1, "B": 0})
    y_test = y_test.map({"M": 1, "B": 0})
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()
    ## With gradient descent and threshold as 0.4 and learning reate=0.001
    model = LogisticRegressionGD(learning_rate=0.001, n_iters=1000,threshold=0.4)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy_GD (0.4, 0.001):", accuracy)
    ## With SGD and threshold as 0.5 and learning reate=0.001
    model = LogisticRegressionSGD(learning_rate=0.001, n_iters=1000,threshold=0.5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = evaluate(y_test, y_pred, verbose = False)
    print("Accuracy_SGD (0.5, 0.001):", accuracy)
    ## With gradient descent and threshold as 0.5 and learning reate=0.0001
    model = LogisticRegressionMiniBatchGD(learning_rate=0.0001, n_iters=1000,threshold=0.5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = evaluate(y_test, y_pred, verbose = False)
    print("Accuracy_MGD (0.5, 0.0001):", accuracy)
    
    # Normalised
    mean = np.mean(X, axis=0)
    stddev = np.std(X, axis=0)
    y = df_imputed['diagnosis']
    X = df_imputed.drop(['diagnosis'], axis = 1)
    X_normalised = (X.copy() - mean) / stddev
    X_train, X_test, y_train, y_test = train_test_split(df_imputed, shuffle = True, random_state = 11*i)
    
    print("------")
    

Result of random test: 1
FLDM1: 4.787234042553192%
47     1
472   -1
302    1
405   -1
120   -1
      ..
332   -1
269   -1
337    1
91     1
80    -1
Name: diagnosis, Length: 188, dtype: int64


ValueError: Input y_true contains NaN.