<a href="https://colab.research.google.com/github/Youssef-ElBakry/Logistic-Regression-ML/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
Logistic Regression

The dataset: The dataset used for this model is from Kaggle. Each row holds 4 values about the physical appearance of a banknote:
- variance
- skewness
- curtosis
- entropy

These variables can be used to predict whether a banknote is real or forged. The data was collected using an industrial camera, so while it is technically not a toy dataset, it is about as simple as one.

More information about the dataset can be found here: https://www.kaggle.com/datasets/shanks0465/banknoteauthentication/data


The model used in the following program is logistic regression. This model predicts a binary value (in this instance, real or forged) based on a set of numerical values.




In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Load the banknote dataset
#The CSV file has no header row, so the column names are supplied explicitly
url = "https://raw.githubusercontent.com/Youssef-ElBakry/Logistic-Regression-ML/main/data_banknote_authentication.csv"
cols = ["Variance", "Skewness", "Curtosis", "Entropy", "Class"]
df = pd.read_csv(url, names=cols, header=None)
df["Class"] = df["Class"].astype(int) #Store the labels as integers

#Quick sanity checks: rows per class, the distinct label values, and the table itself
labels = df["Class"]
print("classes:", labels.value_counts().to_dict())
print("unique:", labels.unique())
print(df)

#Define global values
trainingRatio = 0.8 #Fraction of each class that goes into the training set
LabelCol = "Class" #Name of the target column; must match the "Class" header assigned when the CSV is loaded
seed = 450 #Fixed seed for rng as it makes results reproducible


#Split data into training and testing
#80/20 used as there is no hyperparameter tuning and therefore no need for validation
rng = np.random.default_rng(seed)
test_I = []
train_I = []

#Stratified split: shuffle the rows of each class separately and send the same
#fraction of every class to the training set
#sort must be the boolean False; the string "False" is truthy and silently enabled sorting
#With sort=False the classes are visited in order of first appearance in df
for label, group in df.groupby("Class", sort=False):
  index = group.index.to_numpy(copy=True) #Own copy, so the in-place shuffle never writes into the index's buffer
  rng.shuffle(index)
  trainingRows = int(round(len(index) * trainingRatio)) #Size of the training share for this class, so both sets keep the same class balance (80% of each class goes to training)
  train_I.extend(index[:trainingRows])
  test_I.extend(index[trainingRows:])

#Shuffle training and testing set again as they are currently grouped by class
train = df.loc[train_I].sample(frac=1, random_state=seed).reset_index(drop=True)
test = df.loc[test_I].sample(frac=1, random_state=seed).reset_index(drop=True)

#Columns used as model inputs; "Class" is the target
FEATS = ["Variance", "Skewness", "Curtosis", "Entropy"]

#Convert both splits to float64 NumPy arrays: X holds the features, y holds the labels
X_train, X_test = (part[FEATS].to_numpy(dtype=np.float64) for part in (train, test))
y_train, y_test = (part["Class"].to_numpy(dtype=np.float64) for part in (train, test))

#Standardise data to z-scores using the mean and standard deviation of each column
#Both statistics are computed from the training set only and then reused for the test set
mu = X_train.mean(axis=0) #Calculate mean of each column
sd = X_train.std(axis=0) #Calculate standard deviation for each column
sd[sd == 0] = 1.0 #A constant column has a standard deviation of 0; divide by 1 instead to avoid 0/0 = NaN

X_train_std = (X_train - mu) / sd #Turn each value in each column to a z-score
X_test_std = (X_test - mu) / sd #Test data is scaled with the training statistics

#Prepend a column of ones so the first weight acts as the intercept
def add_bias(X):
  """Return X with a leading column of ones; (N, D) becomes (N, D+1)."""
  ones = np.ones(X.shape[0])
  return np.column_stack((ones, X))

#Design matrices with the intercept column in front: (n_rows, 1+4)
Xtr, Xte = add_bias(X_train_std), add_bias(X_test_std)

#Sigmoid function
def sigmoid(z):
  """Return 1 / (1 + e^-z), applied element-wise.

  z is first limited to the range [-500, 500] so that the exponential
  never has to produce an unnecessarily large number.
  """
  bounded = np.clip(z, -500, 500)
  denominator = 1.0 + np.exp(-bounded)
  return 1.0 / denominator

def logloss_and_grad(W, X, y):
  """Return the mean log loss (cross-entropy) and its gradient w.r.t. W.

  W holds the weights, X holds one row per sample and y holds the labels.
  The result is the tuple (loss, grad).
  """
  #Number of rows (samples) in X, used to average the gradient
  n_samples = X.shape[0]

  #Linear score for every sample, mapped through the sigmoid to a probability
  probs = sigmoid(X @ W)

  #A very small number is added inside each log so that log(0) is never evaluated
  tiny = 1e-12
  per_sample = y*np.log(probs + tiny) + (1 - y)*np.log(1 - probs + tiny)
  loss = -np.mean(per_sample)

  #Predicted probability minus actual label, averaged per weight through X
  residual = probs - y
  grad = (X.T @ residual) / n_samples

  return loss, grad

def fit_logreg(X, y, lr=0.1, epochs=2000, verbose=True, seed=450):
    """Fit logistic-regression weights with batch gradient descent.

    Weights start as small random values drawn from a generator seeded
    with `seed`. Each epoch moves them by `lr` times the gradient of the
    log loss. When `verbose` is true the loss is printed every 200 epochs
    and on the last epoch.

    Returns the weight vector, one entry per column of X.
    Raises RuntimeError if the loss stops being a finite number.
    """
    W = np.random.default_rng(seed).normal(scale=0.01, size=X.shape[1])   # shape (D,)
    final_epoch = epochs - 1
    for t in range(epochs):
        loss, grad = logloss_and_grad(W, X, y)
        step = lr * grad
        W -= step  # gradient descent update
        is_report_epoch = t % 200 == 0 or t == final_epoch
        if verbose and is_report_epoch:
            print(f"epoch {t:4d}  loss={loss:.4f}")
        if not np.isfinite(loss):
            raise RuntimeError("Loss exploded/NaN. Lower lr (e.g., 0.05 or 0.01).")
    return W

#Train on the standardised training set; the global seed is reused so one setting controls every random step
W = fit_logreg(Xtr, y_train, lr=0.1, epochs=2000, verbose=True, seed=seed)

def predict_proba(X, W):
    """Return the sigmoid of the linear scores X @ W, one value per row of X."""
    scores = X @ W
    return sigmoid(scores)

def predict_label(X, W, thresh=0.5):
    """Return integer labels: 1 where the predicted probability is at least `thresh`, else 0."""
    probabilities = predict_proba(X, W)
    meets_threshold = probabilities >= thresh
    return meets_threshold.astype(int)

# predictions on the held-out test set
proba = predict_proba(Xte, W)
pred  = predict_label(Xte, W)

# metrics (binary): confusion-matrix counts, with label 1 as the positive class
tp = int(((pred == 1) & (y_test == 1)).sum())
tn = int(((pred == 0) & (y_test == 0)).sum())
fp = int(((pred == 1) & (y_test == 0)).sum())
fn = int(((pred == 0) & (y_test == 1)).sum())

# every ratio falls back to 0.0 when its denominator is zero, so an empty
# test set or a missing class cannot raise ZeroDivisionError
acc  = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) else 0.0
prec = tp / (tp + fp) if (tp + fp) else 0.0
rec  = tp / (tp + fn) if (tp + fn) else 0.0
f1   = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

print(f"\nTest Accuracy={acc:.3f}  Precision={prec:.3f}  Recall={rec:.3f}  F1={f1:.3f}")

# inspect learned weights; the first weight belongs to the column of ones (intercept)
feat_names = ["(bias)", "Variance", "Skewness", "Curtosis", "Entropy"]
for name, w in zip(feat_names, W):
    print(f"{name:>10s}: {w:+.4f}")

classes: {0: 762, 1: 610}
unique: [0 1]
      Variance  Skewness  Curtosis  Entropy  Class
0      3.62160   8.66610   -2.8073 -0.44699      0
1      4.54590   8.16740   -2.4586 -1.46210      0
2      3.86600  -2.63830    1.9242  0.10645      0
3      3.45660   9.52280   -4.0112 -3.59440      0
4      0.32924  -4.45520    4.5718 -0.98880      0
...        ...       ...       ...      ...    ...
1367   0.40614   1.34920   -1.4501 -0.55949      1
1368  -1.38870  -4.87730    6.4774  0.34179      1
1369  -3.75030 -13.45860   17.5932 -2.77710      1
1370  -3.56370  -8.38270   12.3930 -1.28230      1
1371  -2.54190  -0.65804    2.6842  1.19520      1

[1372 rows x 5 columns]
epoch    0  loss=0.6940
epoch  200  loss=0.1969
epoch  400  loss=0.1322
epoch  600  loss=0.1050
epoch  800  loss=0.0899
epoch 1000  loss=0.0803
epoch 1200  loss=0.0736
epoch 1400  loss=0.0686
epoch 1600  loss=0.0647
epoch 1800  loss=0.0616
epoch 1999  loss=0.0590

Test Accuracy=0.982  Precision=0.961  Recall=1.000  F1=0.9