In [1]:
import numpy as np
import time
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.utils import read_data_from_file, sign, sigmoid
from lib.logistic_reg import logistic_reg, logistic_reg_sgd
from sklearn.preprocessing import PolynomialFeatures

In [2]:
def ridge_reg(X, y, lambd=0.01):
    """
    Ridge (L2-regularized) linear regression, closed-form solution.

    Computes w_reg = (X^T X + lambd * I)^{-1} X^T y by solving the
    regularized normal equations (X^T X + lambd * I) w = X^T y.

    Args:
        X: data matrix, one example per row
        y: target values, one per row of X
        lambd: penalty (regularization) coefficient
    Returns:
        w_reg: feature weights
    """
    gram = (X.T).dot(X) + lambd * np.eye(X.shape[1])
    # Solve the linear system rather than forming the explicit inverse:
    # it is cheaper and numerically more accurate, which matters when lambd
    # is tiny and the regularized Gram matrix is close to singular.
    w_reg = np.linalg.solve(gram, (X.T).dot(y))
    return w_reg

In [3]:
def get_err_rate(X, y, w):
    """
    0/1 error rate of the linear model w on (X, y).

    Args:
        X: data matrix, one example per row
        y: labels to compare the predictions against
        w: weight vector
    Returns:
        Mean of the element-wise mismatch between sign(X.dot(w)) and y,
        where `sign` is the helper imported from lib.utils.
    """
    predictions = sign(X.dot(w))
    mismatches = predictions != y
    return mismatches.mean()

In [4]:
# Seed NumPy's global random number generator so that anything drawing from
# it produces the same values on every run of the notebook.
np.random.seed(0)

In [5]:
# Load the training and test sets
data_train = read_data_from_file('hw4_train.dat')
print('data_train shape: ', data_train.shape)
data_test = read_data_from_file('hw4_test.dat')
print('data_test shape: ', data_test.shape)

# The last column holds the label; the remaining columns are the features,
# to which a leading column of ones is prepended as the bias term.
X = np.hstack((np.ones((len(data_train), 1)), data_train[:, :-1]))
y = data_train[:, -1]
X_test = np.hstack((np.ones((len(data_test), 1)), data_test[:, :-1]))
y_test = data_test[:, -1]

# NOTE(review): `times` is not referenced by any cell visible here; kept in
# case something outside this view relies on it.
times = 2000

data_train shape:  (200, 3)
data_test shape:  (1000, 3)


13. Consider regularized linear regression (also called ridge regression) for classification: $$w_{reg}=\mathrm{argmin}_{w}\left(\frac{\lambda}{N}\|w\|^2+\frac{1}{N}\|Xw-y\|^2\right).$$
Run the algorithm on the following data set as $D$:<br/><br/>https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_algo/hw4_train.dat<br/><br/>and the following set for evaluating $E_{out}$<br/><br/>https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_algo/hw4_test.dat<br/><br/>Because the data sets are for classification, please consider only the 0/1 error for all Questions below.<br/>Let $\lambda = 10$, which of the following are the corresponding $E_{in}$ and $E_{out}$?

In [6]:
# Fit on the full training set with lambda = 10, then report the 0/1 error
# on the training set (Ein) and on the test set (Eout).
w_reg = ridge_reg(X, y, lambd=10)
err_in = get_err_rate(X, y, w_reg)
err_out = get_err_rate(X_test, y_test, w_reg)
print('Ein: ', err_in, 'Eout: ', err_out)

Ein:  0.05 Eout:  0.045


14. Following the previous Question, among $\log_{10} \lambda= \left\{2, 1, 0, -1, \ldots, -8, -9, -10 \right\}$. What is the $\lambda$ with the minimum $E_{in}$? Compute $\lambda$ and its corresponding $E_{in}$ and $E_{out}$ then select the closest answer. Break the tie by selecting the largest $\lambda$.

In [7]:
# Candidate penalty coefficients 10^2, 10^1, ..., 10^-10, largest first.
exponents = np.arange(2, -11, -1)
lambds = 10. ** exponents
print(lambds)

[1.e+02 1.e+01 1.e+00 1.e-01 1.e-02 1.e-03 1.e-04 1.e-05 1.e-06 1.e-07
 1.e-08 1.e-09 1.e-10]


In [8]:
# Walk through the candidate lambdas in the order they appear in `lambds`
# and keep the one with the lowest Ein. Only a strict improvement replaces
# the current best, so on a tie the earlier candidate is kept.
cor_lambda = 0
e_in = e_out = np.inf
for lambd in lambds:
    w_reg = ridge_reg(X, y, lambd=lambd)
    e_in_tmp = get_err_rate(X, y, w_reg)
    if not (e_in > e_in_tmp):
        continue
    cor_lambda = lambd
    e_in = e_in_tmp
    # Eout is only evaluated for a new best candidate.
    e_out = get_err_rate(X_test, y_test, w_reg)
print('lambda: ', cor_lambda, 'Ein: ', e_in, 'Eout: ', e_out)

lambda:  1e-08 Ein:  0.015 Eout:  0.02


15. Following the previous Question, among $\log_{10} \lambda= \left\{2, 1, 0, -1, \ldots, -8, -9, -10 \right\}$. What is the $\lambda$ with the minimum $E_{out}$? Compute $\lambda$ and the corresponding $E_{in}$ and $E_{out}$ then select the closest answer. Break the tie by selecting the largest $\lambda$.

In [9]:
# Walk through the candidate lambdas in the order they appear in `lambds`
# and keep the one with the lowest Eout. Only a strict improvement replaces
# the current best, so on a tie the earlier candidate is kept.
cor_lambda = 0
e_in = e_out = np.inf
for lambd in lambds:
    w_reg = ridge_reg(X, y, lambd=lambd)
    e_out_tmp = get_err_rate(X_test, y_test, w_reg)
    if not (e_out > e_out_tmp):
        continue
    cor_lambda = lambd
    e_out = e_out_tmp
    # Ein is only evaluated for a new best candidate.
    e_in = get_err_rate(X, y, w_reg)
print('lambda: ', cor_lambda, 'Ein: ', e_in, 'Eout: ', e_out)

lambda:  1e-07 Ein:  0.03 Eout:  0.015


16. Now split the given training examples in $D$ to the first $120$ examples for $D_{train}$ and $80$ for $D_{val}$. {Ideally, you should randomly do the $120/80$ split. Because the given examples are already randomly permuted, however, we would use a fixed split for the purpose of this problem.}<br/><br/>Run the algorithm on $D_{train}$ to get $g^{-}_{\lambda}$, and validate $g^{-}_{\lambda}$ with $D_{val}$. Among $\log_{10} \lambda= \left\{2, 1, 0, -1, \ldots, -8, -9, -10 \right\}$. What is the $\lambda$ with the minimum $E_{train}$($g^{-}_{\lambda}$)? Compute $\lambda$ and the corresponding $E_{train}$($g^{-}_{\lambda}$), $E_{val}$($g^{-}_{\lambda}$) and $E_{out}$($g^{-}_{\lambda}$) then select the closest answer. Break the tie by selecting the largest $\lambda$.

In [10]:
# Fixed split: the first 120 examples are used for fitting, the last 80 for
# validation.
X_fit, y_fit = X[:120], y[:120]
X_val, y_val = X[-80:], y[-80:]

# One record per lambda: (lambda, E_train, E_val, E_out)
result = []
for lambd in lambds:
    w_reg = ridge_reg(X_fit, y_fit, lambd=lambd)
    err_train = get_err_rate(X_fit, y_fit, w_reg)
    err_val = get_err_rate(X_val, y_val, w_reg)
    err_test = get_err_rate(X_test, y_test, w_reg)
    result.append((lambd, err_train, err_val, err_test))

In [11]:
# Pick the record with the smallest E_train; ties go to the largest lambda
# (hence the -lambda secondary key). min() is used instead of an in-place
# sort so that `result` keeps its order: re-ordering it here would change
# how ties fall in any later stable sort of the same list.
best_by_train = min(result, key=lambda rec: (rec[1], -rec[0]))
print('lambda: {}, E_train: {}, E_val: {}, E_out: {}'.format(*best_by_train))

lambda: 1e-08, E_train: 0.0, E_val: 0.05, E_out: 0.025


17. Following the previous Question, among $\log_{10} \lambda= \left\{2, 1, 0, -1, \ldots, -8, -9, -10 \right\}$. What is the $\lambda$ with the minimum $E_{val}$($g^{-}_{\lambda}$)? Compute $\lambda$ and the corresponding $E_{train}$($g^{-}_{\lambda}$), $E_{val}$($g^{-}_{\lambda}$) and $E_{out}$($g^{-}_{\lambda}$) then select the closest answer. Break the tie by selecting the largest $\lambda$.

In [12]:
# Order the records by E_val, breaking ties explicitly by the largest lambda
# (the -lambda secondary key). Sorting on E_val alone is stable and would
# therefore resolve ties by whatever order `result` happens to be in at this
# point, which is not necessarily largest-lambda-first.
# The sort stays in place so that result[0] is the selected record for
# anything that reads it afterwards.
result.sort(key=lambda rec: (rec[2], -rec[0]))
print('lambda: {}, E_train: {}, E_val: {}, E_out: {}'.format(*result[0]))

lambda: 1.0, E_train: 0.03333333333333333, E_val: 0.0375, E_out: 0.028


18. Run the algorithm with the optimal $\lambda$ of the previous Question on the whole $D$ to get $g_{\lambda}$. Compute $E_{in}$($g_{\lambda}$) and $E_{out}$($g_{\lambda}$) then select the closest answer.

In [13]:
# Select the lambda with the smallest E_val (ties -> largest lambda) directly
# from the (lambda, E_train, E_val, E_out) records, instead of assuming that
# `result` is currently sorted by E_val. This keeps the cell correct no
# matter which selection cell was executed last.
best_lambd = min(result, key=lambda rec: (rec[2], -rec[0]))[0]

# Refit on the whole training set with that lambda and report both errors.
w_reg = ridge_reg(X, y, lambd=best_lambd)
print('E_in: ', get_err_rate(X, y, w_reg), 'E_out: ', get_err_rate(X_test, y_test, w_reg))

E_in:  0.035 E_out:  0.02


19. For Questions 19-20, split the given training examples in $D$ to five folds, the first $40$ being fold $1$, the next $40$ being fold $2$, and so on. Again, we take a fixed split because the given examples are already randomly permuted.<br/><br/>Among $\log_{10} \lambda= \left\{2, 1, 0, -1, \ldots, -8, -9, -10 \right\}$. What is the $\lambda$ with the minimum $E_{cv}$, where $E_{cv}$ comes from the five folds defined above? Compute $\lambda$ and the corresponding $E_{cv}$ then select the closest answer. Break the tie by selecting the largest $\lambda$.

In [14]:
# Fixed 5-fold split: fold i is the i-th consecutive run of 40 examples.
n_folds, fold_size = 5, 40
all_index = np.arange(len(X))

# One record per lambda: (lambda, E_cv)
result = []
for lambd in lambds:
    e_cv = 0
    for i in range(n_folds):
        cv_index = np.arange(i * fold_size, (i + 1) * fold_size)
        # np.setdiff1d returns the remaining indices in sorted order, so the
        # training rows are always taken in their original order (iterating
        # a Python set gives no such ordering guarantee).
        train_index = np.setdiff1d(all_index, cv_index)

        w_reg = ridge_reg(X[train_index], y[train_index], lambd=lambd)
        e_cv += get_err_rate(X[cv_index], y[cv_index], w_reg)
    # E_cv is the average validation error over the folds.
    result.append((lambd, e_cv / n_folds))

In [15]:
# Stable in-place sort of the (lambda, E_cv) records by E_cv; records with
# equal E_cv keep their existing relative order. result[0] is then the
# selected record.
result.sort(key=lambda rec: rec[1])
print('lambd: {}, E_cv: {}'.format(result[0][0], result[0][1]))

lambd: 1e-08, E_cv: 0.03


20. Run the algorithm with the optimal $\lambda$ of the previous problem on the whole $D$ to get $g_{\lambda}$. Compute $E_{in}$($g_{\lambda}$) and $E_{out}$($g_{\lambda}$) then select the closest answer.

In [16]:
# Select the lambda with the smallest E_cv (ties -> largest lambda) directly
# from the (lambda, E_cv) records, instead of assuming that `result` is
# currently sorted by E_cv.
best_lambd = min(result, key=lambda rec: (rec[1], -rec[0]))[0]

# Refit on the whole training set with that lambda and report both errors.
w_reg = ridge_reg(X, y, lambd=best_lambd)
print('E_in: ', get_err_rate(X, y, w_reg), 'E_out: ', get_err_rate(X_test, y_test, w_reg))

E_in:  0.015 E_out:  0.02
