In [332]:
import numpy as np
import pandas as pd

# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'talk.religion.misc'])
vectorizer = CountVectorizer(stop_words="english", min_df=5)
vectors = np.asarray(vectorizer.fit_transform(train.data).todense())

In [363]:
def softmax(z):
    # Ensure numerical stability
    exp_scores = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_scores / exp_scores.sum(1)[:, np.newaxis]

def crossentropy(x, y):
    m = (x * y).sum(1)
    ce = np.log(m)
    return -ce.sum()

class SoftmaxRegression(object):
    def __init__(self, lr=0.1, epochs=10):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        self.w = np.random.uniform(low=-1., high=1., size=(X.shape[1], y.shape[1]))
        self.b = np.random.uniform(low=-1., high=1., size=(y.shape[1]))

        for i in range(self.epochs):
            # print(self.w)
            z = X.dot(self.w) + self.b
            z = softmax(z)
            loss = crossentropy(z, y)

            # Calculate gradients
            dW = X.T.dot(z - y) / X.shape[0]
            db = z.sum(0) / X.shape[0]

            self.w = self.w - self.lr * dW
            self.b = self.b - self.lr * db
            if i % 10 == 0:
                print(loss/X.shape[0])
            # print(np.argmax(z, 1))

    def predict_proba(self, X):
        z = X.dot(self.w) + self.b
        z = softmax(z)
        return z

In [364]:
x = np.random.uniform(low=-1., high=1., size=(5, 6))
y = np.array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0]])

In [371]:
lr = SoftmaxRegression(epochs=1000)
# lr.fit(x, y)

In [372]:
# np.argmax(lr.predict_proba(x), 1)

In [373]:
# np.array([[1, 2], [3, 4]]) * np.array([[1, 2], [3, 4]])
lr.fit(vectors, np.asarray(OneHotEncoder().fit_transform(train.target.reshape(len(train.target), 1)).todense()))

5.03546971365427
3.4175330523841883
2.647567189826352
2.2184424578273405
1.884372715524855
1.6169406629514775
1.4187645814541465
1.260170453206355
1.1277254359774234
1.016027526664022
0.9194384093852871
0.8358569952135106
0.7635409939769153
0.6998854075131963
0.6434473648088366
0.5935493104605092
0.5494311106007653
0.5100912240026397
0.4746010843041485
0.4422383817523359
0.4124519613214714
0.38480666651597295
0.3590246488944772
0.33498486027959606
0.3126579526821375
0.2920092201114182
0.2729483948120506
0.25535680987583315
0.23913349587851876
0.22420937430871737
0.21052842103313893
0.19802648632527406
0.18663504678301873
0.1762964181934536
0.16695900999795363
0.15855453608030562
0.15098748774526327
0.14414535184739666
0.13791718841417497
0.1322079661891866
0.12694277608181903
0.12206429907582228
0.1175284981101827
0.11330074368095025
0.10935275682460421
0.105660294735684
0.10220154443262844
0.09895621162955405
0.09590518996155685
0.09303058437831363
0.0903158532394947
0.087745918107432

In [374]:
lr.predict_proba(vectors[:10]).argmax(1), train.target[:10]

(array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0]), array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0]))

In [321]:
lr.w

array([[-0.83719141],
       [ 0.00482592],
       [-0.01815983],
       ...,
       [-0.52647845],
       [ 0.01647992],
       [-0.72650787]])

In [337]:
# train.target.reshape(len(train.target), 1)
OneHotEncoder().fit_transform(train.target.reshape(len(train.target), 1)).todense()[:10]

matrix([[1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.]])