# Goal:

- provide a plaintext interface to analyze step-by-step what is happening in the encrypted code

- Used as a Python sanity check because I'm not that familiar with R.

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
import tqdm
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from numba import njit

np.seterr(all='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Load and Process the Data

In [17]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def load_data(num_samples, compare_to_r_ref):
    x_file = "../cscore_data/X_train_79999.csv"
    y_file = "../cscore_data/y_train_79999.csv"
    train_x = pd.read_csv(x_file)
    train_x = train_x.to_numpy()[:num_samples]
    train_y = pd.read_csv(y_file)

    class_counts = train_y['0'].value_counts()
    print(class_counts)
    
    # If it's a binary classification, calculate the ratio
    positive_class = class_counts[1.0]  
    negative_class = class_counts[0.0]  
    ratio = positive_class / negative_class
    
    print(f"Positive class: {positive_class}, Negative class: {negative_class}, Ratio: {ratio:.2f}")
      
    train_y = train_y.to_numpy()[:num_samples]
    print(f"{bcolors.OKGREEN}Using subsampled data to compare Python-C++{bcolors.ENDC}")
    print(f"{bcolors.OKGREEN}Reading in {x_file}, {y_file} {bcolors.ENDC}")

    print(f"Train X shape is: {train_x.shape}")
    print(f"Train y shape is: {train_y.shape}")
    return train_x, train_y

In [18]:
NUM_SAMPLES = -1
COMPARE_TO_R_REF = False
lr = 0.1
mu = 0.1
train_x, train_y = load_data(
    num_samples=NUM_SAMPLES,
    compare_to_r_ref=COMPARE_TO_R_REF
)

# Same shape as Marcelo's reference code
betas = np.zeros((train_x.shape[1], ))

0
0.0    65619
1.0    14380
Name: count, dtype: int64
Positive class: 14380, Negative class: 65619, Ratio: 0.22
[92mUsing subsampled data to compare Python-C++[0m
[92mReading in ../cscore_data/X_train_79999.csv, ../cscore_data/y_train_79999.csv [0m
Train X shape is: (79998, 47)
Train y shape is: (79998, 1)


In [4]:
train_x[0, :5]

array([-0.34660179,  0.39375239,  0.50834598,  0.51504625,  0.49324492])

In [5]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def fwd(train_x, betas, dbg=False):
    preds = train_x @ betas   # A vector of linear_predictions/logits z = train_x @ weights
    if dbg:
        print(f"Logits: {preds}")
    return np.expand_dims(sigmoid(preds), -1)   # Shape: (m, 1)

def calculate_gradient(train_x, train_y, betas, fwd, dbg):
    preds = fwd(train_x, betas, dbg)   # A vector of logistic predictions y_hat = sigmoid(z)
    gradient = -train_x.T @ (train_y - preds) / len(train_y)
    return gradient   # Shape: (10, 1) == Rows correspond to num_features
    ## This function is used to update the values of betas: w_new = w_old + lr * gradient

def cost(x, y, theta):
    m = x.shape[0]
    h = sigmoid(np.matmul(x, theta))   # h: hypothesis, basically preds/y_hat
    t1 = np.matmul(-y.T, np.log(h))
    t2_a = (1 - y.T)
    t2_b = np.log(np.clip(1 - h, 0.000000000000001, np.max(1 - h)))  # Used to get numerical issues
    ## np.clip() function prevents computing the log of 0, by taking the minimum of 1e-15.
    t2 = np.matmul(t2_a, t2_b)

    return ((t1 - t2) / m)[0]   # Shape: (1,) == scalar value

def nesterov(betas, epochs, lr, mu, train_x, train_y):
    import copy

    phi = copy.deepcopy(betas)
    theta = copy.deepcopy(betas)

    nesterov_loss = [0 for _ in range(epochs)]
    for i in tqdm.trange(epochs):
    # for i in range(epochs):
        gradient = calculate_gradient(train_x, train_y, theta, fwd, dbg=False)

        ## Assign updated weights into phi_prime
        phi_prime = theta - lr * np.squeeze(gradient)   # np.squeeze() removes single dimensions --> shape (10,)
        
        ## Nesterov acceleration process
        if i == 0:
            theta = phi_prime
        else:
            ## If current updated weight (phi_prime) < previous weight (phi), 
            ## The updated weight theta will be even smaller.
            theta = phi_prime + mu * (phi_prime - phi)
        phi = phi_prime   # phi is then the weight of the previous epoch/update
        loss = cost(train_x, train_y, theta)
        nesterov_loss[i] = loss

        # print(f"New loss: {cost(train_x, train_y, v)[0]}")
    return nesterov_loss, theta, phi


In [6]:
losses, theta, phi = nesterov(betas, 800, lr, mu, train_x, train_y)

100%|██████████| 800/800 [00:15<00:00, 50.47it/s]


In [7]:
losses

## zero initialized betas    : cost = 0.4087381183990924
## uniform initialized betas : cost = 0.40867096414830645
## randn initialized betas   : cost = 0.40850863604124726

[0.6662323813166761,
 0.6411364358498033,
 0.6195921462287043,
 0.6009751580190279,
 0.5846597713843376,
 0.5701755131700676,
 0.5571761201892295,
 0.545404258076202,
 0.5346656071953942,
 0.5248107624933068,
 0.5157226737712236,
 0.5073079027453625,
 0.4994904944125395,
 0.4922076418968398,
 0.48540658786456725,
 0.4790423844488243,
 0.4730762540148046,
 0.46747437411489834,
 0.4622069646886174,
 0.4572475926995109,
 0.45257263477359483,
 0.4481608558581578,
 0.4439930740121531,
 0.44005188987623084,
 0.43632146529798443,
 0.4327873397812352,
 0.42943627641479976,
 0.42625613107790655,
 0.4232357402658341,
 0.4203648240038131,
 0.4176339011408337,
 0.41503421492289216,
 0.412557667197485,
 0.41019675994056903,
 0.4079445430542241,
 0.40579456757976484,
 0.40374084362283164,
 0.4017778024054631,
 0.3999002619536441,
 0.3981033960034246,
 0.39638270576889734,
 0.3947339942644169,
 0.39315334291393855,
 0.39163709021406007,
 0.39018181224571585,
 0.388784304853523,
 0.38744156733235213,


In [8]:
theta

array([ 0.00511047, -0.02048277,  0.03658464, -0.01540358,  0.06815236,
       -0.10549571, -0.11352108, -0.30990752, -0.04039454, -0.01195469,
       -0.06367125, -0.06494156,  0.02578636,  0.29112837,  0.11156849,
       -0.00176102, -0.10224106, -0.12549213, -0.14863883, -0.28420054,
       -0.32384622, -0.13514462, -0.18454341, -0.32314794, -0.12773604,
       -0.55239605,  0.31482715, -1.14186052, -0.27332031, -0.32744592,
       -0.15897204, -1.16633176, -0.18247895, -0.09632728, -0.12689165,
       -0.1766913 , -0.23959264, -0.309848  , -0.5209199 , -0.12504368,
        0.02636882,  0.05140282, -0.30081261, -0.04350681, -0.60804676,
       -0.5763761 , -0.46832686])

In [9]:
pred = fwd(train_x, theta, dbg=False)

## Decision (Threshold = 0.5)
train_y_hat = (pred >= 0.5).astype(int)

In [10]:
train_y_hat.shape

(79998, 1)

In [11]:
train_y.shape

(79998, 1)

In [12]:
confusion_matrix(train_y, train_y_hat)

array([[61787,  3831],
       [ 9333,  5047]], dtype=int64)

In [13]:
roc_auc_score(train_y, pred)

0.8567060897118912

In [14]:
accuracy_score(train_y, train_y_hat)

0.8354458861471536

In [15]:
f1_score(train_y, train_y_hat)

0.43400120388683466