In [302]:
import numpy as np
import pandas as pd

# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'talk.religion.misc'])
vectorizer = CountVectorizer(stop_words="english", min_df=5)
vectors = np.asarray(vectorizer.fit_transform(train.data).todense())

In [329]:
def softmax(z):
    # Ensure numerical stability
    exp_scores = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_scores / exp_scores.sum(1)[:, np.newaxis]

def crossentropy(x, y):
    ce = np.log((x * y).sum(1))
    return -ce.sum()

class SoftmaxRegression(object):
    def __init__(self, lr=0.1, epochs=10):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        self.w = np.random.uniform(low=-1., high=1., size=(X.shape[1], y.shape[1]))
        self.b = np.random.uniform(low=-1., high=1., size=(y.shape[1]))

        for i in range(self.epochs):
            # print(self.w)
            z = X.dot(self.w) + self.b
            z = softmax(z)
            loss = crossentropy(z, y)

            # Calculate gradients
            dW = X.T.dot(z - y) / X.shape[0]
            db = z.sum(0) / X.shape[0]

            self.w = self.w - self.lr * dW
            self.b = self.b - self.lr * db
            if i % 10 == 0:
                print(loss/X.shape[0])
            # print(np.argmax(z, 1))

    def predict_proba(self, X):
        z = X.dot(self.w) + self.b
        z = softmax(z)
        return z

In [326]:
x = np.random.uniform(low=-1., high=1., size=(5, 6))
y = np.array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0]])

In [330]:
lr = SoftmaxRegression(epochs=200)
lr.fit(x, y)

[[0.22220086 0.19241954 0.15131995 0.04892512 0.16372816 0.22140637]
 [0.33497269 0.32903158 0.03410964 0.20551176 0.03646044 0.05991389]
 [0.28076857 0.44469807 0.0713154  0.11676161 0.06790137 0.01855498]
 [0.31102574 0.30325262 0.06207056 0.11841196 0.15803131 0.04720781]
 [0.19231823 0.16311174 0.20869488 0.04149031 0.30757542 0.08680941]]


In [319]:
np.argmax(lr.predict_proba(x), 1)

array([1, 2, 1, 0, 4])

In [331]:
lr.fit(vectors, train.target[:, np.newaxis])

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.

In [324]:
lr.predict_proba(vectors[:5]), train.target[:5]

(array([[1.],
        [1.],
        [1.],
        [1.],
        [1.]]),
 array([0, 0, 0, 1, 1]))

In [321]:
lr.w

array([[-0.83719141],
       [ 0.00482592],
       [-0.01815983],
       ...,
       [-0.52647845],
       [ 0.01647992],
       [-0.72650787]])