In [1]:
# Show the kernel's current working directory (a bare `pwd` relies on IPython automagic).
pwd


'/Users/anhnguyen/Documents/GitHub/2025AAE722_AnhNguyen/2025 AAE722 Anh Nguyen Submission'

In [2]:
from ISLP import load_data
import pandas as pd

import statsmodels.api as sm

# Load the Default dataset and keep the raw frame untouched so the preview
# printed below always matches the data it was printed from.
default_raw = load_data('Default')

# Print the first few rows of the dataset
print(default_raw.head())

# Examine the structure of the dataset
print("Dataset Dimensions:", default_raw.shape)
print("\nColumn Names and Data Types:")
print(default_raw.dtypes)
print("\nDistribution of 'default' variable:")
print(default_raw['default'].value_counts())

# Prepare the data for logistic regression.
# 'student' and 'default' load as pandas `category` columns; mapping a
# categorical can yield another categorical, so cast explicitly to int to
# hand plain 0/1 numeric columns to statsmodels and scikit-learn.
# `default_data` (the encoded frame) is the name later cells rely on.
yes_no_to_int = {'Yes': 1, 'No': 0}
default_data = default_raw.assign(
    student=default_raw['student'].map(yes_no_to_int).astype(int),
    default=default_raw['default'].map(yes_no_to_int).astype(int),
)

X = default_data[['income', 'balance', 'student']]
X = sm.add_constant(X)  # Add constant for the intercept
y = default_data['default']

# Fit the logistic regression model on the full sample
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Report the coefficient for balance (a log-odds effect, with income and
# student status also in the model)
balance_coef = result.params['balance']
print("\nCoefficient for balance:", balance_coef)
print("\nInterpretation: For a one-unit increase in balance, the log-odds of defaulting increase by", balance_coef)

  default student      balance        income
0      No      No   729.526495  44361.625074
1      No     Yes   817.180407  12106.134700
2      No      No  1073.549164  31767.138947
3      No      No   529.250605  35704.493935
4      No      No   785.655883  38463.495879
Dataset Dimensions: (10000, 4)

Column Names and Data Types:
default    category
student    category
balance     float64
income      float64
dtype: object

Distribution of 'default' variable:
default
No     9667
Yes     333
Name: count, dtype: int64
Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10

Coefficient for balance: 0.00573650526579908

Interpretation: For a one-unit increase in balance, the log-odds of defaulting increase by 0.00573650526579908


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score

# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
predictors = default_data[['income', 'balance']]
response = default_data['default']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, response, test_size=0.3, random_state=42
)

# ---- Linear Discriminant Analysis ----
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Per-class feature means and the class priors estimated from the training split
print("LDA Class Means:")
print(lda.means_)
print("\nLDA Prior Probabilities:")
print(lda.priors_)

# Score LDA on the held-out rows
y_pred_lda = lda.predict(X_test)
conf_matrix_lda = confusion_matrix(y_test, y_pred_lda)
accuracy_lda = accuracy_score(y_test, y_pred_lda)
print("\nLDA Confusion Matrix:", conf_matrix_lda, sep="\n")
print("\nLDA Test Accuracy:", accuracy_lda)

# ---- Quadratic Discriminant Analysis ----
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Score QDA on the same held-out rows
y_pred_qda = qda.predict(X_test)
conf_matrix_qda = confusion_matrix(y_test, y_pred_qda)
accuracy_qda = accuracy_score(y_test, y_pred_qda)
print("\nQDA Confusion Matrix:", conf_matrix_qda, sep="\n")
print("\nQDA Test Accuracy:", accuracy_qda)


LDA Class Means:
[[33681.79366744   802.15837363]
 [31570.35768985  1768.16582059]]

LDA Prior Probabilities:
[0.96585714 0.03414286]

LDA Confusion Matrix:
[[2900    6]
 [  75   19]]

LDA Test Accuracy: 0.973

QDA Confusion Matrix:
[[2898    8]
 [  70   24]]

QDA Test Accuracy: 0.974


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [4]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes using the same training split (income and balance)
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predictions on the test set
y_pred_nb = nb.predict(X_test)

# Confusion matrix and accuracy for Naive Bayes
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

print("Naive Bayes Confusion Matrix:")
print(conf_matrix_nb)
print("\nNaive Bayes Test Accuracy:", accuracy_nb)


# Compare with LDA and QDA
print("\nLDA Test Accuracy:", accuracy_lda)
print("QDA Test Accuracy:", accuracy_qda)
if accuracy_nb > accuracy_lda and accuracy_nb > accuracy_qda:
    print("Naive Bayes has the highest test accuracy.")
elif accuracy_nb < accuracy_lda and accuracy_nb < accuracy_qda:
    print("Naive Bayes has the lowest test accuracy.")
else:
    print("Naive Bayes accuracy is between LDA and QDA (or ties).")

# Predicted probability of default for a customer with income=40000 and balance=2000.
# The model was fitted on a DataFrame, so the query row is built as a DataFrame
# with named columns and reordered to the training column order. This avoids
# relying on positional order and on scikit-learn's feature-name mismatch warning.
customer = pd.DataFrame({'income': [40000], 'balance': [2000]})[list(X_train.columns)]
proba = nb.predict_proba(customer)
# find probability for class '1' (default); predict_proba columns follow nb.classes_
if 1 in list(nb.classes_):
    idx_default = list(nb.classes_).index(1)
    proba_default = proba[0, idx_default]
else:
    proba_default = None

print("\nPredicted class probabilities (order = {}):".format(list(nb.classes_)))
print(proba)
print("\nPredicted probability of default (class=1) for income=40000, balance=2000:", proba_default)

Naive Bayes Confusion Matrix:
[[2893   13]
 [  75   19]]

Naive Bayes Test Accuracy: 0.9706666666666667

LDA Test Accuracy: 0.973
QDA Test Accuracy: 0.974
Naive Bayes has the lowest test accuracy.

Predicted class probabilities (order = [np.int64(0), np.int64(1)]):
[[0.48919461 0.51080539]]

Predicted probability of default (class=1) for income=40000, balance=2000: 0.5108053902363209




In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Standardise both features using statistics learned from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate neighbourhood sizes
k_values = [1, 3, 5, 10]

# Test-set accuracy for each K, keyed by K
accuracy_knn = {}
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    accuracy_knn[k] = knn.score(X_test_scaled, y_test)

# Summary table: one row per K
accuracy_table = pd.DataFrame({
    'K': list(accuracy_knn.keys()),
    'Test Accuracy': list(accuracy_knn.values()),
})
print(accuracy_table)

# Pick the K with the highest test accuracy (first one wins on ties).
# NOTE(review): K is selected on the test set itself, so the accuracy reported
# for the chosen K is an optimistic estimate.
best_k = max(accuracy_knn.items(), key=lambda pair: pair[1])[0]
print(f"\nBest K: {best_k} with Test Accuracy: {accuracy_knn[best_k]}")

# Explanation for small K values
print("\nExplanation: Very small values of K, such as K=1, may not be optimal because they are highly sensitive to noise in the training data. This can lead to overfitting, where the model performs well on the training set but poorly on unseen data.")

    K  Test Accuracy
0   1       0.953667
1   3       0.966333
2   5       0.969667
3  10       0.972000

Best K: 10 with Test Accuracy: 0.972

Explanation: Very small values of K, such as K=1, may not be optimal because they are highly sensitive to noise in the training data. This can lead to overfitting, where the model performs well on the training set but poorly on unseen data.


In [6]:
# Refit logistic regression on the training split only (income and balance plus
# an intercept) so it is comparable with the other classifiers.
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
logit_model_refit = sm.Logit(y_train, X_train_const)
result_refit = logit_model_refit.fit()

# Predicted default probabilities on the test split, cut at 0.5 to get class labels
y_pred_logit = result_refit.predict(X_test_const)
y_pred_logit_class = (y_pred_logit >= 0.5).astype(int)

# Test accuracy for logistic regression
accuracy_logit = accuracy_score(y_test, y_pred_logit_class)

# Collect every method's test accuracy, in presentation order
accuracy_by_method = {
    'Logistic Regression': accuracy_logit,
    'LDA': accuracy_lda,
    'QDA': accuracy_qda,
    'Naive Bayes': accuracy_nb,
    f'KNN (K={best_k})': accuracy_knn[best_k],
}
methods = list(accuracy_by_method.keys())
accuracies = list(accuracy_by_method.values())

summary_table = pd.DataFrame({'Method': methods, 'Test Accuracy': accuracies})
print(summary_table)

Optimization terminated successfully.
         Current function value: 0.078256
         Iterations 10
                Method  Test Accuracy
0  Logistic Regression       0.973333
1                  LDA       0.973000
2                  QDA       0.974000
3          Naive Bayes       0.970667
4           KNN (K=10)       0.972000


In [7]:
# Confusion matrices computed earlier in the notebook, keyed by method name
cms = {
    'LDA': conf_matrix_lda,
    'QDA': conf_matrix_qda,
    'Naive Bayes': conf_matrix_nb
}


def _false_negative_rate(cm):
    """Return FN / (FN + TP) for a 2x2 confusion matrix, or NaN when there are no actual positives."""
    _, _, fn, tp = cm.ravel()  # ravel order for a 2x2 matrix: tn, fp, fn, tp
    actual_positives = fn + tp
    if actual_positives > 0:
        return fn / actual_positives
    return float('nan')


# False negative rate per method
fnr = {name: _false_negative_rate(cm) for name, cm in cms.items()}

for name, rate in fnr.items():
    print(f"{name}: FNR = {rate:.4f}")

# Method with the smallest false negative rate (first one wins on ties)
best = min(fnr.items(), key=lambda pair: pair[1])[0]
print(f"\nLowest false negative rate: {best} (FNR = {fnr[best]:.4f})")

LDA: FNR = 0.7979
QDA: FNR = 0.7447
Naive Bayes: FNR = 0.7979

Lowest false negative rate: QDA (FNR = 0.7447)


In [8]:
# Pick the (method, rate) pair with the smallest false negative rate;
# the first entry wins on ties.
best_method, lowest_fnr = min(fnr.items(), key=lambda pair: pair[1])

print(f"The recommended method is {best_method} because it has the lowest false negative rate (FNR = {lowest_fnr:.4f}).")
print("This minimizes the cost of missing a default, which is 10 times higher than a false alarm.")

The recommended method is QDA because it has the lowest false negative rate (FNR = 0.7447).
This minimizes the cost of missing a default, which is 10 times higher than a false alarm.


In [9]:
# Compare QDA at a lowered decision threshold (0.3) against the default (0.5).

def _fp_fn_with_rates(cm):
    """Return (FP, FN, FPR, FNR) for a 2x2 confusion matrix; a rate is NaN when its denominator is 0."""
    tn, fp, fn, tp = cm.ravel()
    actual_negatives = fp + tn
    actual_positives = fn + tp
    false_pos_rate = fp / actual_negatives if actual_negatives > 0 else float('nan')
    false_neg_rate = fn / actual_positives if actual_positives > 0 else float('nan')
    return fp, fn, false_pos_rate, false_neg_rate


# Probability of the positive class (label 1) for every test row
proba_qda = qda.predict_proba(X_test)
idx_pos = list(qda.classes_).index(1)
probs_pos = proba_qda[:, idx_pos]

# Classify as default whenever the predicted probability reaches 0.3
y_pred_qda_03 = (probs_pos >= 0.3).astype(int)
cm_03 = confusion_matrix(y_test, y_pred_qda_03)
fp_03, fn_03, fpr_03, fnr_03 = _fp_fn_with_rates(cm_03)

# Baseline at threshold 0.5 reuses the QDA confusion matrix computed earlier
fp_05, fn_05, fpr_05, fnr_05 = _fp_fn_with_rates(conf_matrix_qda)

print("QDA (threshold=0.5): FP =", int(fp_05), "FN =", int(fn_05), f"FPR = {fpr_05:.4f}", f"FNR = {fnr_05:.4f}")
print("QDA (threshold=0.3): FP =", int(fp_03), "FN =", int(fn_03), f"FPR = {fpr_03:.4f}", f"FNR = {fnr_03:.4f}")

# Differences are reported as (threshold 0.3) minus (threshold 0.5)
print("\nChange when lowering threshold 0.5 -> 0.3:")
print("ΔFP =", int(fp_03 - fp_05), "ΔFN =", int(fn_03 - fn_05),
    f"ΔFPR = {fpr_03 - fpr_05:+.4f}", f"ΔFNR = {fnr_03 - fnr_05:+.4f}")

QDA (threshold=0.5): FP = 8 FN = 70 FPR = 0.0028 FNR = 0.7447
QDA (threshold=0.3): FP = 48 FN = 53 FPR = 0.0165 FNR = 0.5638

Change when lowering threshold 0.5 -> 0.3:
ΔFP = 40 ΔFN = -17 ΔFPR = +0.0138 ΔFNR = -0.1809
