# Logistic regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Toy dataset

In [None]:
# Read the toy dataset (feature matrix X, target vector y) from the CSV files
# and draw a scatter plot of the first two columns of X, coloured by y.
X = np.loadtxt("data/logistic_regression/toy_logistic_X.csv", delimiter=",")
y = np.loadtxt("data/logistic_regression/toy_logistic_y.csv", delimiter=",")

# X[:, :2].T has one row per feature, so unpacking yields the x- and y-coordinates.
plt.scatter(*X[:, :2].T, c=y)
plt.show()

In [None]:
# Number of rows in X (displayed as the cell output).
X.shape[0]

In [None]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*, element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp``, which never evaluates ``exp`` on a large positive
    argument.  The naive formula overflows (and emits a ``RuntimeWarning``)
    for strongly negative inputs; this form simply returns 0.0 there.

    Parameters
    ----------
    x : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Sigmoid of every element of *x*, with the same shape as *x*.
    """
    # log(1 + exp(-x)) == logaddexp(0, -x); np.negative also accepts plain sequences.
    return np.exp(-np.logaddexp(0.0, np.negative(x)))

class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient descent.

    ``self.weights`` is a column vector of shape ``(num_features + 1, 1)``.
    Every matrix passed to ``fit``, ``activation`` or ``predict`` must
    therefore have ``num_features + 1`` columns; ``fit`` expects the first of
    them to be the constant column of ones (bias term).

    Attributes
    ----------
    weights : ndarray of shape (num_features + 1, 1)
        Model parameters, updated in place by ``fit``.
    losses : list of float
        Mean cross-entropy recorded at every epoch of the most recent ``fit``.
    """

    def __init__(self, num_features):
        """Draw the initial weights uniformly from ``[0, 1)``.

        Parameters
        ----------
        num_features : int
            Number of input features, not counting the bias column.
        """
        self.weights = np.random.random_sample((num_features + 1, 1))
        # Defined up front so the attribute exists even before fit() is called.
        self.losses = []

    def fit(self, X, y, epochs, learning_rate):
        """Run ``epochs`` steps of full-batch gradient descent.

        Parameters
        ----------
        X : array_like of shape (N, num_features + 1)
            Design matrix; NOTE: it must have 1s in the first column.
        y : array_like of shape (N,) or (N, 1)
            Binary targets (0 or 1).
        epochs : int
            Number of gradient-descent updates to perform.
        learning_rate : float
            Step size applied to the mean gradient.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``y`` does not hold one target per row.
        """
        X = np.asarray(X, dtype=float)
        # Force a column vector: a 1-D y would otherwise broadcast against the
        # (N, 1) outputs into an (N, N) matrix.
        y = np.asarray(y, dtype=float).reshape(-1, 1)

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        # Keeps log() finite when a predicted probability saturates at 0 or 1.
        tiny = np.finfo(float).eps

        self.losses = []
        for _ in range(epochs):
            # sigma(Xw) for the current weights
            outputs = self.activation(X)

            # The recorded loss belongs to the weights *before* this epoch's update.
            clipped = np.clip(outputs, tiny, 1.0 - tiny)
            loss = -(y.T @ np.log(clipped)
                     + (1.0 - y).T @ np.log(1.0 - clipped)) / n_samples
            self.losses.append(loss.item())

            # Gradient of the mean cross-entropy: X^T (sigma(Xw) - y) / N
            gradient = X.T @ (outputs - y) / n_samples
            self.weights -= learning_rate * gradient

    def activation(self, X):
        """Return ``sigmoid(X @ self.weights)``, the model output for each row of X."""
        return sigmoid(X @ self.weights)

    def predict(self, X):
        """Return 1 where the activation is at least 0.5 and 0 elsewhere."""
        return np.where(self.activation(X) >= 0.5, 1, 0)

In [None]:
# Shuffle features and targets together so every row keeps its own target:
# append y as an extra column, permute the rows in place, then split again.
num_features = X.shape[1]
data = np.column_stack((X, y))
np.random.shuffle(data)
X_shuffled, y_shuffled = data[:, :num_features], data[:, num_features]

In [None]:
# Standardize every column: subtract the column mean, divide by the column
# standard deviation.
# NOTE(review): the statistics are taken over all rows, i.e. before the
# train/test split, so rows later used for testing contribute to them.
mean_X = X_shuffled.mean(axis=0)
std_X = X_shuffled.std(axis=0)
X_norm = (X_shuffled - mean_X) / std_X

In [None]:
# train-test split (70%-30%)
# The cut-off is derived from the number of rows so the ratio holds for any
# dataset size (integer arithmetic: exactly 70 for 100 rows).
n_train = (70 * X_norm.shape[0]) // 100
X_train = X_norm[:n_train]
X_test = X_norm[n_train:]
# Targets become column vectors so they line up with the (N, 1) model outputs.
y_train = y_shuffled[:n_train].reshape(-1, 1)
y_test = y_shuffled[n_train:].reshape(-1, 1)

# adding column of ones (bias term) as the first column
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

In [None]:
# Train the classifier. The feature count is read off the design matrix
# (its columns minus the leading column of ones) instead of being hard-coded.
lr = LogisticRegression(num_features=X_train.shape[1] - 1)
lr.fit(X=X_train, y=y_train, epochs=1000, learning_rate=0.1)

# Predicted labels for the held-out rows
y_pred = lr.predict(X_test)

# evaluate accuracy: count the test rows whose prediction differs from the target
print('Misclassifications:', (y_test != y_pred).sum())

In [None]:
# Plotting training loss vs epoch number (epochs are numbered from 1)
epoch_numbers = np.arange(1, len(lr.losses) + 1)
plt.plot(epoch_numbers, lr.losses)
plt.show()