# Lecture 7--Training

## 1. Programming: Linear/Quadratic Programming

When performing optimization, it is sometimes possible to reduce a given problem to a linear or quadratic programming problem. Such problems have extremely fast solutions, which we can use to speed up our code. The downside is that we have to do a little math to put them in this form.

### 1.1 Linear Programming

$$
\begin{align}
\min_{x,s}\;&c'x \\
\text{subject to}\;& Gx+s=h \\
& Ax=b \\
& s\geq0
\end{align}
$$

### 1.2 Quadratic Programming

$$
\begin{align}
\min_{x}\;&(1/2)x'Px+q'x \\
\text{subject to}\;& Gx\leq h \\
& Ax=b
\end{align}
$$

## 2. Data Science

### 2.1 $L_1$ Regularization

Given a model:
$$y=\mathbf X\beta+e$$
The $L_1$ regularization of the model is defined by:
$$
\begin{align}
\min_\beta \;& e'e \\
\text{subject to}\;&\Vert\beta\Vert_1 \leq T
\end{align}
$$
Which is equivalent to the summation notation version:
$$
\begin{align}
\min_\beta \;& \sum_{i=1}^n e_i^2\\
\text{subject to}\;&\sum_{j=1}^r\vert\beta_j\vert \leq T
\end{align}
$$
Solving for $\beta$ we obtain:
$$
\begin{align}
\min_\beta \;& (1/2)\beta'\mathbf X'\mathbf X \beta-y'\mathbf X \beta\\
\text{subject to}\;&\mathbf 1_n'\vert\beta\vert \leq T
\end{align}
$$ 
To put this in terms where we can use the quadratic programming format we need to split the betas into their positive and negative components: $\beta=\beta_+-\beta_-$ So we define $\beta_\pm = [\beta_+',\beta_-']'$
$$
\begin{align}
\min_{\beta_{\pm}} \;& (1/2)\beta_\pm'\left(
\begin{bmatrix}
1 & -1 \\
-1 & 1 
\end{bmatrix}
\otimes\mathbf X'\mathbf X \right)\beta_\pm-\left(
\begin{bmatrix}
1 \\
-1  
\end{bmatrix}
\otimes y'\mathbf X \right) \beta_{\pm}\\
\text{subject to}\;&\mathbf 1_{2n}'\beta_\pm \leq T\\
& -\mathbf I_{2n} \beta_{\pm}\leq 0
\end{align}
$$

In [3]:
!pip install --user cvxopt



In [1]:
import cvxopt as cvx

ImportError: DLL load failed: The specified module could not be found.

In [12]:
import numpy as np
import pandas as pd
from cleands import *
from scipy.stats import zscore

df = pd.read_csv('BWGHT.csv')
npx = zscore(df[['cigs','faminc','male','white']].values)
npy = zscore(df['bwght'].values)
#ones = np.ones((npx.shape[0],1))
#npx = np.hstack((ones,npx))

In [None]:
npx

In [None]:
import cvxopt  as cvx
thresh = 2.3
P = np.kron(np.array([[1,-1],[-1,1]]),npx.T@npx)
q = np.kron(np.array([[1],[-1]]),npx.T@npx)

G_1 = -np.eye(2*npx.shape[1])
h_1 = np.zeros((2*npx.shape[1],1))
G_2 = np.ones([1,2*npx.shape[1]])
h_2 = np.array([thresh])
G = np.vstack([G_1,G_2])
h = np.vstack([h_1,h_2])
cvx.solvers.qp(cvx.matrix(P), cvx.matrix(q), cvx.matrix(G), cvx.matrix(h))
opt = opt['x']
opt = np.array(opt)
np.vstack([np.eye(npx.shpae[1]),-np.eye(npx.shape[1])]).T@opt


In [None]:
import numpy as np
import pandas as pd
from cleands import *

bwght = pd.read_csv('BWGHT.csv')
npx = bwght[['cigs','faminc','male','white']].values
ones = np.ones([npx.shape[0],1])
npx = np.hstack([ones,npx])
npy = bwght['bwght'].values

In [None]:
import cvxopt as cvx
import cvxopt.solvers as solv

def solve_lasso(x,y,thresh:float):
    (n,r) = x.shape
    P = cvx.matrix(np.kron(np.matrix([[1,-1],[-1,1]]),x.T@x))
    q = cvx.matrix(np.kron(np.matrix([[1],[-1]]),x.T@y.reshape(-1,1)))
    G = cvx.matrix(np.vstack((-np.eye(2*r),np.ones((1,2*r)))))
    h = cvx.matrix(np.vstack((np.zeros((2*r,1)),thresh)))
    b = np.array(solv.qp(P,q,G,h)['x'])
    b = b[r:,0]-b[:r,0]
    if len(y.shape) == 1: return b.reshape(-1,)
    else: return b

In [None]:
solve_lasso(npx,npy,thresh=115)

In [13]:
from cleands import *
class l1_regularization_regressor(least_squares_regressor):
    def __init__(self,x,y,thresh:float,*args,**kwargs):
        super(l1_regularization_regressor,self).__init__(x,y,thresh=thresh,*args,**kwargs)
        self.threshold=thresh
    def __fit__(self,x,y,thresh:float,*args,**kwargs):
        if x[:,0].var()==0:
            dx = x[:,1:]-x[:,1:].mean(0)
            dy = y-y.mean(0)
            outp = solve_lasso(dx,dy,thresh)
            intc = y.mean(0)-x[:,1:].mean(0)@outp.reshape(-1,1)
            return np.concatenate([intc,outp])
        else:
            return solve_lasso(x,y,thresh)
        
l1_regularization_regressor(npx,npy,thresh=1.0).params

TypeError: __init__() got an unexpected keyword argument 'thresh'

### 2.2 Cross-validation

In [3]:
def mean_squared_error(model,x,y):
    ypred = model.predict(x)
    return ((y-ypred)**2).mean()

k=5
np.random.seed(90210)
n = npy.shape[0]
deck = np.arrange(n)
np.random.shuffle(deck)
test = deck[0:int((n/k))]
train = deck[int(n/k):n]
model = l1_regularization_regressor)(npx[train],npy[train],thresh=1)
mspe = mean_squared_error(model,npx[test], npy[test])

test = deck[int(n/k):int(n/k*2)]
train_l = deck[;int(n/k)]
train_u = deck[int(n/k*2):]
train = np.concatenate([train_l,train_u])
model = l1_regularization_regressor)(npx[train],npy[train],thresh=1)
mspe = mean_squared_error(model,npx[test], npy[test])


SyntaxError: invalid syntax (<ipython-input-3-0a29b8f3c56c>, line 12)

In [8]:
def mean_squared_error(model,x,y):
    res = y-model.predict(x)
    return res.T@res/y.shape[0]

def kfold_cross_validation(model,x,y,statistic=mean_squared_error,folds:int=5,seed=None):
    if seed!=None: np.random.seed(seed)
    n = y.shape[0]
    deck = np.arange(n)
    np.random.shuffle(deck)
    mse = []
    for k in range(folds):
        test = deck[int(n/folds)*k:int(n/folds)*(k+1)]
        train_lower = deck[:int(n/folds)*k]
        train_upper = deck[int(n/folds)*(k+1):]
        train = np.concatenate([train_lower,train_upper])
        mse += [statistic(model(x[train],y[train]),x[test],y[test])]
    return np.array(mse)

In [None]:
import matplotlib as mlp

x = np.range(1,10)
y - np.

In [9]:
class l1_cross_validation_regressor(l1_regularization_regressor):
    def __init__(self,x,y,max_thresh=None,folds:int=5,statistic=mean_squared_error,*args,**kwargs):
        default_state = solv.options['show_progress']
        solv.options['show_progress'] = False
        if max_thresh==None: max_thresh = least_squares_regressor(x,y).params[1:].sum()
        outp = []
        for lam in np.linspace(0,1,100):
            model = lambda x,y: l1_regularization_regressor(x,y,thresh=lam*max_thresh)
            mse = kfold_cross_validation(model,x,y,folds=folds,statistic=statistic,seed=5).mean()
            outp += [(mse,lam)]
        outp = np.array(outp)
        thresh = outp[outp[:,0].argmin(),1]*max_thresh
        solv.options['show_progress'] = default_state
        super(l1_cross_validation_regressor,self).__init__(x,y,thresh=thresh,*args,**kwargs)
model = l1_cross_validation_regressor(npx,npy)
print(model.params)
print(model.threshold)

NameError: name 'l1_regularization_regressor' is not defined

## 3. Programming challenges

### 3.1 Tree Simulation

Write a monte carlo simulation comparing the accuracy of tree based models to that of regression for categorical data.

### 3.2 Recursive partitioning until non-rejection

Modify our recursive partitioning code to test the null that the two groups are equal and stop splitting when the null cannot be rejected. Bonus points if you use the Bonferonni correction when making the decision to split.