In [85]:
import numpy as np
from tqdm.notebook import tqdm

# Linear Models with Logarithmic Loss

Consider the following protocol: for $t=1,2,...,T$, nature chooses $x_t \in \mathbb{R}^d$ in each round, and then reveals a noisy label $y_t \sim \text{Ber}(x_t^\top \theta^\star)$, where $\theta^\star \in \mathbb{R}^d$ is the unknown parameter. Our goal is to learn to make accurate predictions given a linear model class and any $x \in \mathbb{R}^d$, i.e., to predict $y^\star(x) = x^\top \theta^\star$. The classical way of learning $\theta^\star$ given i.i.d samples of $\{x_t,y_t\}_{t=1}^T$ is to perform ridge regression.

Ridge regression considers the following estimator and prediction: given $\{x_t,y_t\}_{t=1}^T$ compute:

$$
\hat\theta_T := \arg\min_{\theta \in \mathbb{R}^d} \sum_{t=1}^T \left( x_t^\top \theta - y_i\right)^2 + 
\lVert \theta \rVert^2
$$

and set $\hat y_T(x) := x^\top \hat\theta_T$. This is simply least-sqaures regression with $\ell_2$ regularization. The estimator and the prediciton can also be solved in closed form. Define $V_t := I + \sum_{t=1}^T x_t x_t^\top$, then we have:

$$
\hat y_T(x) = x^\top V_t^{-1} \sum_{t=1}^T x_t y_t.
$$

Another way of learning $\theta^\star$  given i.i.d samples of $\{x_t,y_t\}_{t=1}^T$ is to perform linear regression with the negative logarithmic (cross-entropy) loss. 

Linear regression with negative logarithmic loss considers the following estimator and prediciton: given $\{x_t,y_t\}_{t=1}^T$ compute:

$$
\hat\theta_T := \arg\max_{\theta \in \mathbb{R}^d} \mathcal{L}(\{x_t,y_t\}_{t=1}^T;\theta) = \arg\max_{\theta \in \mathbb{R}^d}  \sum_{t=1}^T y_t \log(x_t^\top \theta) + (1-y_t)\log(1-x_t^\top \theta)
$$
and set $\hat y_T(x) := x^\top \hat\theta_T$. The estimaotr and the prediciton cannot be solved in closed form, however the loss function defined above is concave in its argument. Therefore approximate solvers like Newtown's method or Gradient Descent, should be sufficient for finding the $\theta$ that minimizes the negative logarithmic loss. Taking the first derivative of the loss $\mathcal{L}$ we get:2*

$$
\frac{\partial}{\partial \theta}  \mathcal{L}(\{x_t,y_t\}_{t=1}^T;\theta) = g_T(\theta) :=  \sum_{t=1}^T \left(\frac{y_t}{x_t^\top \theta} - \frac{1-y_t}{1-x_t^\top \theta}\right) x_t - 2\theta.
$$

Taking the second derivative of $\mathcal{L}$ gives:

$$
\frac{\partial^2}{\partial \theta^2}  \mathcal{L}(\{x_t,y_t\}_{t=1}^T;\theta)= H_T(\theta) :=  \sum_{t=1}^T -\left(\frac{y_t}{(x_t^\top\theta)^2} + \frac{1-y_t}{(1-x_t^\top\theta)^2}\right) x_t x_t^\top - I.
$$

Notice that we are trying to solve for the zeros of $g_T(\theta)$. This function has three zeros at $\theta = \{-\infty, \arg\max_{\theta \in \mathbb{R}^d} \mathcal{L}(\{x_t,y_t\}_{t=1}^T;\theta), \infty\}$. Therefore, we need to constraint our optimization routine otherwise our estimate of $\theta$ may diverge.

One idea is to use Frank-Wolfe to ensure $\theta$ does not diverge off to infinity. 





Thus we can solve:

$$
\hat\theta_T := \arg\min_{\theta \in \mathbb{R}^d} - \mathcal{L}(\{x_t,y_t\}_{t=1}^T;\theta)
$$

by applying Newton's method for $k=1,2,...,n$, i.e.

$$
\theta_{k+1} = \theta_k - \left(H_T^{-1}(\theta_k)\right)^\top g_t(\theta_k).
$$

where $\theta_0$ is initialized apprioately, i.e. $\theta_0 \sim \mathcal{N}(0,I)$. Setting $\hat\theta_T = \theta_n$ gives us our estimator and letting $\hat y_T(x) = x^\top \hat\theta_T$ gives us our predictor. 

In [86]:
d = 2
num_features = 4
theta_star = np.random.uniform(size=d)
theta_star = theta_star / (10*np.linalg.norm(theta_star))
X = np.random.uniform(size=(num_features,d))
for i in range(num_features):
    X[i] = X[i] / np.linalg.norm(X[i])
X[0] = X[0] / 1000
X[1] = X[1] / 900
X[2] = X[2] / 800
X[3] = X[3] / 700


In [87]:
num_samples = 1000**2
features = np.zeros((num_samples,d))
tar = np.zeros(num_samples)
for i in tqdm(range(num_samples)):
    j = i % num_features
    feature = X[j]
    mean = np.inner(X[j],theta_star)
    obs = np.random.binomial(1,p=mean) - 1/2 
    features[i] = feature
    tar[i] = obs

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [65]:
class LinearLogRegression(object):
    def __init__(self, features, obs, d, newton_iters,theta_star):
        self.features = features
        self.obs = obs
        self.n = len(obs)
        self.newton_iters = newton_iters
        self.d = d
        self.theta_star = theta_star
        self.step_size = 1/self.newton_iters
    
    
    def get_hessian_loss(self,theta):
        H = np.zeros((self.d,self.d))
        g = np.zeros(self.d)
        for i in range(self.n):
            y = self.obs[i]
            x = self.features[i]
            inner = np.inner(x,self.theta)
            if inner == 1:
                inner = 0.9999
            elif inner == 0:
                inner = 0.0001
            if y == 0:
                mu =   (1 - y) / ((1 - inner) ** 2)
                H +=  -1.0 *mu * np.outer(x,x)

                g += -1.0*( (1 - y) / ((1 - inner))) * x
            else:
                mu = y / (inner ** 2) 
                H +=  -1.0 * mu * np.outer(x,x)

                g += (y / (inner) ) * x
        return H, g
    
    def Newton_Method(self):
        self.theta = np.zeros(self.d) + 0.3
        for k in tqdm(range(self.newton_iters)):
            H,g = self.get_hessian_loss(self.theta)
            self.update = np.linalg.solve(H,g)
            self.theta = self.theta - self.step_size/(k+1) * self.update
            if k % 1000 == 999:
                print('Log Loss', np.linalg.norm(self.theta - self.theta_star))
            
    
    
    def Least_Squares(self):
        A = np.zeros((self.d,self.d))
        b = np.zeros(self.d)
        for i in range(self.n):
            x = self.features[i]
            y = self.obs[i]
            A = A + np.outer(x,x)
            b = b + y * x
        self.theta_ls = np.linalg.solve(A,b)
        print('Least Squares:' , np.linalg.norm(self.theta_ls - self.theta_star))
    
            
        
        

In [73]:
num_data = 50000
print(np.dot(X,reg.theta_ls))
reg = LinearLogRegression(features[:num_data],tar[:num_data],d,10000,theta_star)
reg.Least_Squares()
reg.Newton_Method()

[0. 0. 0. 0.]
Least Squares: 0.035672400657914376


  0%|          | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [76]:
np.dot(X,reg.theta_ls)

array([9.14543395e-05, 1.18307899e-04, 1.26959165e-04, 1.50817799e-04])

In [77]:
np.dot(X,reg.theta)

array([0.0003884 , 0.0004709 , 0.00052044, 0.00057639])

In [78]:
np.dot(X,theta_star)

array([6.30950935e-05, 9.97654416e-05, 9.83755227e-05, 1.40936549e-04])

In [35]:
theta_star

array([0.95232579, 0.30508291])

In [84]:
np.dot(X.T,X)

array([[1.16656573e-06, 1.87761275e-06],
       [1.87761275e-06, 4.67131850e-06]])