In [3]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from scipy import linalg

In [9]:
df = pd.read_csv("~/Stats243Project/Data/cs-training.csv", index_col = 0)
print(len(df))
df = df.dropna()
print(len(df))
X_cols = list(df.columns)
X_cols.remove("SeriousDlqin2yrs")
X = df[X_cols].to_numpy()
y = df["SeriousDlqin2yrs"].to_numpy()

150000
120269


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fisher LDA

In [11]:
mu = np.mean(X_train)
X_train_demeaned = (X_train - mu).T
X_test_demeaned = (X_test - mu).T
S_t = np.cov(X_train_demeaned)
S_w = np.zeros(S_t.shape)
for c in np.unique(y_train):
    S_w += np.cov(X_train_demeaned[:, y_train == c])

S_b = S_t - S_w


vals, vecs = linalg.eig(np.linalg.inv(S_w)@S_b)
vecs = vecs[:, np.argsort(vals)]
W_lda = vecs[:, -1:].real

X_train_lda = (W_lda.T@X_train_demeaned).T
X_test_lda = (W_lda.T@X_test_demeaned).T

print(np.mean(X_train_lda[y_train == 0]), np.mean(X_train_lda[y_train == 1]))
print(np.mean(X_test_lda[y_test == 0]), np.mean(X_test_lda[y_test == 1]))


85.44373421739633 83.43138114569865
85.49241455143215 83.98570707600669


# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter = 1000).fit(X_train, y_train)
probs = clf.predict_proba(X_train)
score = probs[:, 1]/(1 - probs[:, 1])
print(np.mean(score[y_train == 0]), np.mean(score[y_train == 1]))

probs = clf.predict_proba(X_test)
score = probs[:, 1]/(1 - probs[:, 1])
print(np.mean(score[y_test == 0]), np.mean(score[y_test == 1]))

0.08939710399711638 0.31580477039617605
0.08139228896473245 0.2668209060192212


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_train)
score = probs[:, 1]/(1 - probs[:, 1])
print(np.mean(score[y_train == 0]), np.mean(score[y_train == 1]))

probs = clf.predict_proba(X_test)
score = probs[:, 1]/(1 - probs[:, 1])
print(np.mean(score[y_test == 0]), np.mean(score[y_test == 1]))

0.07023938518560487 0.39612069415283807
0.07135186453471327 0.3791356430665598


# Dense NN

In [14]:
model_in = keras.Input(shape = (10,))
x = layers.Dense(5, activation= "relu")(model_in)
x = layers.Dense(2, activation= "relu")(x)
out = layers.Dense(1, activation= "sigmoid")(x)

model = keras.Model(model_in, out)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train, y_train, epochs=50,
                batch_size=8,
                shuffle=True)

Epoch 1/50


2022-03-16 14:52:06.902127: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fc7b43e3fd0>

In [15]:
probs = model.predict(X_train)
score = probs/(1 - probs)
print(np.mean(score[y_train == 0]), np.mean(score[y_train == 1]))

0.075107925 0.075107925
