In [45]:
import numpy as np
import pandas as pd
from prettytable import PrettyTable 

def load_data(file_path):
    data = pd.read_csv(file_path, sep=",")
    x = data.iloc[:, 1:-1].values
    y = data.iloc[:, -1].values
    return x, y

class LR():
    def __init__(self, x, y):
        self.bias = np.ones((x.shape[0],1))
        self.x = np.concatenate((self.bias, x), axis=1)
        self.y = y
        self.theta = np.zeros(self.x.shape[1])

    def sigmoid(seslf, x, theta):
        z = np.dot(x, theta)
        probs = 1 / (1 + np.exp(-z))
        return probs
    
    #print loss for training 
    def loss(self, y_hat, y):
        loss = ((-y * np.log(y_hat)) - (1 - y) * np.log(1- y_hat)).mean()
        print("---------")
        print(loss)
 

    def gradient_descent(self, learning_rate, iterations):
        for i in range(iterations):
            y_hat = self.sigmoid(self.x, self.theta)
            # self.loss(y_hat, self.y)
            gradiant = np.dot(self.x.T, (y_hat - self.y)) / self.y.shape[0]
            self.theta -= learning_rate * gradiant
        return self.theta
    
x, y = load_data("emails.csv")


# folds = [[1,20], [1000,1020], [2000, 2020], [3000, 3020], [4000, 4020]]
folds = [[0,1000], [999,2000], [1999, 3000], [2999, 4000], [3999, 5000]]

table = PrettyTable()
table.field_names = ["Fold Name" , "Accuracy" , "Precision" , "Recall"]

for val in folds:
    
    train_x = x[val[0]:val[1]]
    train_y = y[val[0]:val[1]]
    test_x = np.concatenate((x[:val[0], :], x[val[1]:, :]), axis=0)
    bias = np.ones((test_x.shape[0],1))
    test_x = np.concatenate((bias, test_x), axis=1)
    test_y = np.concatenate((y[:val[0]], y[val[1]:]), axis=0)

    lra = LR(train_x, train_y)

    theta = lra.gradient_descent(.0001, 9000)

    # Got info to better use numpy here:
    # https://saturncloud.io/blog/compute-precision-and-accuracy-using-numpy-a-comprehensive-guide-for-data-scientists/
    predictions = (lra.sigmoid(test_x, theta) >= 0.5)
    predictions = predictions * 1
    true_positives = np.sum((predictions == 1) & (test_y == 1))
    false_positives = np.sum((predictions == 1) & (test_y == 0))
    false_negatives = np.sum((predictions == 0) & (test_y == 1))
    accuracy = np.mean(predictions == test_y)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    fold_name = "Emails: " + str(val[0]) + " - " + str(val[1])
    table.add_row([fold_name, accuracy, precision, recall])
    print(table)



+------------------+----------+--------------------+--------------------+
|    Fold Name     | Accuracy |     Precision      |       Recall       |
+------------------+----------+--------------------+--------------------+
| Emails: 0 - 1000 | 0.85825  | 0.8382687927107062 | 0.6339362618432386 |
+------------------+----------+--------------------+--------------------+
+--------------------+--------------------+--------------------+--------------------+
|     Fold Name      |      Accuracy      |     Precision      |       Recall       |
+--------------------+--------------------+--------------------+--------------------+
|  Emails: 0 - 1000  |      0.85825       | 0.8382687927107062 | 0.6339362618432386 |
| Emails: 999 - 2000 | 0.8747186796699175 | 0.8450413223140496 | 0.699743370402053  |
+--------------------+--------------------+--------------------+--------------------+
+---------------------+--------------------+--------------------+--------------------+
|      Fold Name      |    