In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

### Logistic Regression on Default dataset

In [2]:
# Load the Default dataset and show the class balance of its two Yes/No columns.
df = pd.read_csv('Default.csv')
print(df['default'].value_counts())
print(df['student'].value_counts())

# Predictors are the two numeric columns; the target is 1 where default == 'Yes', else 0.
X = df[['balance', 'income']]
y = (df['default'] == 'Yes').astype(int).to_numpy()

No     9667
Yes     333
Name: default, dtype: int64
No     7056
Yes    2944
Name: student, dtype: int64


In [3]:
# Preview the first three rows of the raw frame.
df.head(n=3)

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Scale the features to make our result stable. The scaler is fitted on the
# training split only and then applied to it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [5]:
### Black-box logistic regression with no penalty

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# `penalty=None` switches regularization off. The string 'none' used before was
# deprecated in scikit-learn 1.2 and is rejected by current releases.
model = LogisticRegression(penalty=None, random_state=0)
model.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so intermediate results
# are printed explicitly instead of being silently discarded.
print('coefficients:', model.coef_)
print('intercept:', model.intercept_)

# Apply the scaler fitted on the training split to the held-out features.
# Using a separate name keeps this cell idempotent: re-running it no longer
# standardizes data that has already been standardized.
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)

print('accuracy:', accuracy_score(y_test, y_pred))

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[2880,    7],
       [  75,   38]])

## Newton's method

$$ \beta_{t+1} = \beta_{t} - H^{-1}(f(\beta_t))\nabla(f(\beta_t)) $$ 

In [6]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied elementwise.

    The value is evaluated as exp(-log(1 + exp(-x))) through np.logaddexp,
    so large negative inputs underflow to 0 instead of overflowing inside
    np.exp (which triggered a RuntimeWarning in the naive formula).

    Parameters
    ----------
    x : scalar or array-like supporting unary minus
        Input value(s).

    Returns
    -------
    Same shape as `x`, with values in [0, 1].
    """
    return np.exp(-np.logaddexp(0, -x))

In [7]:
# Newton's method setup.
iters = 10000                         # upper bound on the number of Newton iterations
beta = np.zeros(shape=(3, 1))         # intercept + 2 feature coefficients, all starting at zero
coef_vec = np.ones(X_train.shape[0])  # one entry per training row; becomes the intercept column

In [8]:
# Prepend the column of ones so that beta[0] acts as the intercept.
# The guard makes this cell safe to re-run: a standardized feature column is
# never a constant column of ones, so a leading all-ones column means the
# intercept column has already been added.
if not np.all(X_train[:, 0] == 1):
    X_train = np.c_[coef_vec, X_train]

In [9]:
precision = 0.00001  # stop once the loss changes by less than this between iterations
loss_old = 1
y_col = y_train.reshape((-1, 1))  # targets as a column, matching the shape of the probabilities
for _ in range(iters):
    # Predicted probabilities at the current beta, computed once per iteration.
    prob = sigmoid(X_train @ beta)

    # Negative log-likelihood (kept as a length-1 array) and its gradient.
    loss = -(y_train @ np.log(prob) + (1 - y_train) @ np.log(1 - prob))
    gradient = X_train.T @ (prob - y_col)

    # Hessian X^T D X, where D = diag(p_i * (1 - p_i)). Scaling the rows of X by
    # the weights gives the same product without materializing the n x n matrix D.
    weights = prob * (1 - prob)
    hessian = X_train.T @ (weights * X_train)

    # Newton update: solve H @ step = gradient instead of inverting H explicitly.
    beta = beta - np.linalg.solve(hessian, gradient)

    # The update above is applied before the convergence check, so the final
    # beta includes the step taken in the converging iteration.
    if abs(loss_old - loss) < precision:
        break
    loss_old = loss  # remember this loss for the next convergence check
    print(loss)

[4852.03026392]
[1335.3488728]
[835.74356785]
[640.39047902]
[561.93210997]
[540.48959727]
[538.04848538]
[538.00381926]
[538.00380074]


In [10]:
beta # Coefficients (intercept first) obtained from our Newton's method implementation

array([[-6.12225178],
       [ 2.66840103],
       [ 0.23775349]])

In [11]:
# Coefficients obtained from sklearn, stacked as a column (intercept first) so they line up with `beta`.
sk_coefs = np.hstack([model.intercept_, model.coef_.ravel()])
print(sk_coefs.reshape(-1, 1))

[[-6.12225801]
 [ 2.66840546]
 [ 0.2377525 ]]


 Although Newton's method requires intensive computation (it involves the Hessian matrix), it converges quite fast even when the coefficients are all initialized to zero. After only 10 Newton updates (the nine losses printed above plus the final step that met the stopping criterion), the algorithm reaches almost the same coefficients as the black-box scikit-learn model.