In [1]:
# library imports 

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss
import numpy as np

In [2]:
# regression data
import random
from sklearn.datasets import make_classification

# rng
random.seed(0)

regression_params = {
     'n_samples':1000
    ,'n_features':5
    ,'n_informative':3
    ,'n_redundant':0
    ,'n_repeated':0
    ,'n_classes' : 4
    ,'n_clusters_per_class':1
    ,'random_state':0
    ,'class_sep' : .2
}

X,y = make_classification(**regression_params)

# add constant
X = np.concatenate([X,np.ones(shape=(len(X),1))],axis=1)

# train / test splits
X_train,X_test,y_train,y_test = train_test_split(X,y)

# statsmodels

In [3]:
import statsmodels.api as sm

In [4]:
sm_model = sm.MNLogit(endog=y_train,exog=X_train)

In [5]:
result = sm_model.fit_regularized(maxiter=1000)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 1.2979110352548542
            Iterations: 41
            Function evaluations: 42
            Gradient evaluations: 41


In [6]:
print(result.summary())

                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  750
Model:                        MNLogit   Df Residuals:                      732
Method:                           MLE   Df Model:                           15
Date:                Mon, 12 Jul 2021   Pseudo R-squ.:                 0.06350
Time:                        23:42:28   Log-Likelihood:                -973.43
converged:                       True   LL-Null:                       -1039.4
Covariance Type:            nonrobust   LLR p-value:                 8.549e-21
       y=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0736      0.118     -0.623      0.534      -0.305       0.158
x2            -0.6398      0.119     -5.377      0.000      -0.873      -0.407
x3             0.0038      0.165      0.023      0.9

In [8]:
# accuracy
(np.argmax(result.predict(X_test),axis=1)== y_test).mean()

0.249168

In [8]:
# roc auc
roc_auc_score(y_test,result.predict(X_test),multi_class='ovr')

0.6342416954959569

# sklearn

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
sk_model = LogisticRegression(multi_class='ovr')

In [11]:
sk_model = sk_model.fit(X_train[:,0:-1],y_train)

In [12]:
sk_model.coef_

array([[-0.24273733,  0.49877012,  0.14298191, -0.05762091, -0.01488683],
       [-0.44486711, -0.33088683,  0.29485706,  0.1011516 , -0.11013981],
       [ 0.47264283, -0.42620622, -0.90189888,  0.02458309,  0.04647151],
       [ 0.18386022,  0.29208942,  0.50020298, -0.06325093,  0.0862855 ]])

In [13]:
sk_model.intercept_

array([-1.28722084, -1.18702092, -1.08529674, -1.18713244])

In [14]:
# accuracy
(sk_model.predict(X_test[:,0:-1]) == y_test).mean()

0.332

In [15]:
# roc auc
roc_auc_score(y_test,sk_model.predict_proba(X_test[:,0:-1]),multi_class='ovr')

0.6336440287789685

# pure python

In [16]:
# turn back into numpy arrays

X_train = X_train
y_train = y_train.reshape(-1,1)

In [17]:
# functions

def sigmoid(x):
    return 1/(1+np.exp(-x))

def predict(x,beta):
    return sigmoid(np.dot(x,beta))

def gradient_step(beta,grad,step_size=0.01):
    return beta - step_size*grad

def gradient(x,beta,y_true):
    y_pred = predict(x,beta)
    
    return np.dot(np.transpose(x),(y_pred-y_true))/len(x)

In [18]:
betas = np.zeros(shape=(regression_params['n_features']+1,regression_params['n_classes']))
for _class in np.unique(y):
    betas[:,_class] = [random.random() for _ in range(len(betas))]
betas

array([[0.84442185, 0.78379859, 0.28183784, 0.81021724],
       [0.7579544 , 0.30331273, 0.7558042 , 0.90216595],
       [0.42057158, 0.47659695, 0.618369  , 0.31014757],
       [0.25891675, 0.58338204, 0.25050634, 0.72983175],
       [0.51127472, 0.90811289, 0.90974626, 0.89883829],
       [0.40493414, 0.50468686, 0.98278548, 0.68398393]])

In [19]:
epochs = 100000
for _class in np.unique(y):
    beta = betas[:,_class].reshape(-1,1)
    for epoch in range(epochs):
        grad = gradient(x=X_train,beta=beta,y_true=(y_train==_class).astype('int'))
        beta = gradient_step(beta=beta,grad=grad,step_size=0.1)
    betas[:,_class] = beta.flat[:]

In [20]:
betas

array([[-0.24626354, -0.44874129,  0.47884721,  0.18508572],
       [ 0.50415341, -0.33289014, -0.43301242,  0.29571018],
       [ 0.14740864,  0.29966882, -0.9211596 ,  0.50934533],
       [-0.05815588,  0.10208814,  0.02501376, -0.06385692],
       [-0.01488167, -0.11102331,  0.04645941,  0.0873864 ],
       [-1.28919999, -1.18879233, -1.0875383 , -1.18955563]])

In [21]:
# accuracy
(np.argmax(predict(X_test,betas),axis=1) == y_test).mean()

0.332

In [22]:
# roc auc
roc_auc_score(y_test,(predict(X_test,betas) /predict(X_test,betas).sum(axis=1).reshape(-1,1)),multi_class='ovr')

0.6338012319513205

# pytorch

In [23]:
import torch
import torch.nn

In [91]:
class Classifier(torch.nn.Module):
    def __init__(self,n_inputs,n_outputs):
        super(Classifier,self).__init__()
        self.__clf = torch.nn.Linear(n_inputs,n_outputs)
        self.__sigmoid = torch.nn.Sigmoid()
        
    def forward(self,x):
        return self.sigmoid(self.clf(x.float()))
    
    @property
    def clf(self):
        return self.__clf
    
    @property
    def sigmoid(self):
        return self.__sigmoid

In [25]:
X_train = torch.tensor(X_train[:,0:-1])
y_train = torch.tensor(y_train)

In [92]:

torch_model = Classifier(regression_params['n_features'],regression_params['n_classes'])

criterion = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(torch_model.parameters(),lr=0.01)


# training loop 
for epoch in range(epochs):
    y_pred = torch_model(X_train)
    loss = criterion(y_pred,y_train.flatten().long())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [93]:
torch_model.clf.weight

Parameter containing:
tensor([[-0.5484,  0.8844,  0.6768, -0.0490, -0.0575],
        [-0.5946,  0.1113,  0.7362,  0.1349, -0.0635],
        [ 0.4666,  0.1795, -0.9384,  0.0354, -0.0313],
        [ 1.1347,  0.8827,  0.7538, -0.0802,  0.0055]], requires_grad=True)

In [94]:
torch_model.clf.bias

Parameter containing:
tensor([5.1242, 5.2960, 5.3538, 5.0804], requires_grad=True)

In [95]:
(np.argmax(torch_model(torch.tensor(X_test[:,0:-1])).detach().numpy(),axis=1) == y_test).mean()

0.324

In [96]:
roc_auc_score(y_test,(torch_model(torch.tensor(X_test[:,0:-1])).detach().numpy() / torch_model(torch.tensor(X_test[:,0:-1])).detach().numpy().sum(axis=1).reshape(-1,1)),multi_class='ovr')

0.6500989680735798