# In [None]:
#ROC and AUC
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from numpy.linalg import inv

def load_data(Train=False):
    """Load the dataset stored in ``spambase.data`` in the working directory.

    The file is parsed as CSV. The first line is skipped, blank lines are
    ignored, and every remaining field is converted to ``float``. The last
    column of each row is the target; all preceding columns are features.

    Args:
        Train: When true, split the data 80/20 with ``random_state=8`` and
            return the four pieces; otherwise return the full arrays.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``Train`` is true,
        otherwise ``(X, y)``.

    Raises:
        FileNotFoundError: If ``spambase.data`` does not exist.
        ValueError: If a field cannot be converted to ``float``.
    """
    import csv

    # The context manager closes the file even when parsing fails part-way.
    with open('spambase.data', newline='') as f:
        reader = csv.reader(f)
        # NOTE(review): this unconditionally discards the first line as a
        # header. If the file has no header row, the first sample is lost --
        # confirm against the actual data file.
        next(reader, None)
        # csv.reader yields an empty list for a blank line; indexing such a
        # row with [-1] would raise, so drop those rows here.
        rows = [row for row in reader if row]

    # np.float was only an alias for the builtin float and was removed in
    # NumPy 1.24; the builtin gives the same float64 result.
    ## x[:-1]: every column except the last one (features)
    X = np.array([x[:-1] for x in rows]).astype(float)
    ## x[-1]: the last column (target)
    y = np.array([x[-1] for x in rows]).astype(float)
    del rows  # free up the memory

    if Train:
        # returns X_train, X_test, y_train, y_test
        return train_test_split(X, y, test_size=0.2, random_state=8)
    return X, y

# Fit a linear least-squares model on the training split and score it on the
# held-out split. No intercept column is added to X here.
X_train, X_test, y_train, y_test = load_data(Train=True)

# Solve min ||X_train @ W - y_train||^2 with lstsq instead of forming
# inv(X^T X) explicitly: the solution is the same when X_train has full
# column rank, but lstsq is better conditioned and still returns a
# (minimum-norm) answer when X^T X is singular.
W = np.linalg.lstsq(X_train, y_train, rcond=None)[0]

y_pred = X_test @ W

## calculate classification error rate
# Scores at or above the 0.5 threshold are classified as 1, the rest as 0.
yp_cls = (y_pred >= 0.5).astype(int)
# An absolute difference of exactly 1 marks a misclassified sample
# (presumably labels are 0/1 -- the ROC section below also tests for those).
diff = np.abs(y_test - yp_cls)
test_error_count = (diff == 1).sum()
test_error_rate = test_error_count / len(y_test)
print("Test error rate=", test_error_rate)

## part2: compute FNR, FPR and TPR at different thresholds to plot ROC
# Split the held-out scores by their true class.
pos_idx = np.where(y_test == 1)
neg_idx = np.where(y_test == 0)
y_pred_for_pos = y_pred[pos_idx]
y_pred_for_neg = y_pred[neg_idx]
n_pos = len(y_pred_for_pos)
n_neg = len(y_pred_for_neg)

# Use the sorted scores of the smaller class as the thresholds.
# (Named `thresholds` rather than `sorted` so the builtin is not shadowed.)
if n_pos <= n_neg:
    thresholds = np.sort(y_pred_for_pos)
else:
    thresholds = np.sort(y_pred_for_neg)

# Broadcasting builds one row per threshold: entry [k, j] is True when
# sample j is classified positive at thresholds[k] (score >= threshold).
pos_hits = y_pred_for_pos[np.newaxis, :] >= thresholds[:, np.newaxis]
neg_hits = y_pred_for_neg[np.newaxis, :] >= thresholds[:, np.newaxis]

# Positives classified as 0 are misses; negatives classified as 1 are
# false alarms. Kept as lists, one entry per threshold.
pos_miss_counts = (~pos_hits).sum(axis=1)
FNR = list(pos_miss_counts / n_pos)
FPR = list(neg_hits.sum(axis=1) / n_neg)
TPR = list(1 - pos_miss_counts / n_pos)

# AUC as the fraction of (negative, positive) pairs in which the positive
# sample scores at least as high as the negative one. Ties count as a full
# win. ypos_array[k, j] compares positive j against negative k.
ypos_array = (
    y_pred_for_pos[np.newaxis, :] >= y_pred_for_neg[:, np.newaxis]
).astype(int)
AUC = np.sum(ypos_array) / (n_pos * n_neg)
print('AUC', AUC)