In [1]:
import numpy as np
from scipy import linalg


In [2]:
def solve_sym(xtx, xty):
    L = linalg.cholesky(xtx)
    return linalg.lapack.dpotrs(L, xty)[0]

def turnbits_rec(p):
    if (p==1):
        return np.array([[True, False], [True, False]])
    else:
        up = np.c_[turnbits_rec(p-1), np.array([False]*(2**(p-1))).reshape(2**(p-1), 1)]
        down = np.c_[turnbits_rec(p-1), np.array([True]*(2**(p-1))).reshape(2**(p-1), 1)]
        return np.r_[up, down]


 $C_p$, $AIC$和交叉验证的最优子集回归

In [3]:
class BestSubsetsReg(object):
    def __init__(self, x = 0, y = 0, inter = True, isCp = True, isAIC = True, isCV = True):
        self.n, self.p = x.shape
        if inter:
            self.x = np.c_[np.ones((self.n, 1)), x]
        else:
            self.x = x
        self.y = y
        self.xx = np.dot(self.x.T, self.x)
        self.xy = np.dot(self.x.T, self.y)
        self.ind_var = turnbits_rec(self.p)
        self.b = []
        if isCp:
            self.Cp = 0
        if isAIC:
            self.AIC = 0
        if isCV:
            self.CVerr = 0
            
    def reg(self):
        self.b = [solve_sym(self.xx[ind][:,ind], self.xy[ind]) for ind in self.ind_var]
        #return self.b
        
    def Cp_AIC(self,isCp = True, isAIC = True):
        mse_tmp = [np.sum(np.dot(self.xx[ind][:,ind], beta) * beta) for ind, beta in zip(self.ind_var, self.b)]
        rss = np.sum(self.y * self.y) - mse_tmp
        d = np.sum(self.ind_var, axis = 1)
        if isCp:
            self.Cp = [rss + 2 * d * rss[-1]/(self.n - self.p - 1)]
            min_Cp = np.argmin(self.Cp)
            print([self.ind_var[min_Cp][1:]])
        if isAIC:
            self.AIC = self.n * np.log(rss) + 2 * d
            min_AIC = np.argmin(self.AIC)
            print([self.ind_var[min_AIC][1:]])
        
    def CVreg(self):
        k = 10
        indexs = np.array_split(np.random.permutation(np.arange(0, self.n)), k)
        def cvk(ind, index):
            txx = self.xx[ind][:,ind] - np.dot(self.x[index][:, ind].T, self.x[index][:, ind])
            txy = self.xy[ind] - np.dot(self.x[index][:, ind].T, self.y[index])
            tcoe = solve_sym(txx, txy)
            return np.sum((self.y[index] - np.dot(self.x[index][:, ind], tcoe)) ** 2)
        self.CVerr = np.sum(np.array([cvk(ind, index) for ind in self.ind_var for index in indexs]).reshape(2**self.p, k), axis = 1)/self.n
        min_CV = np.argmin(self.CVerr)
        return [self.ind_var[min_CV][1:]]

测试使用类的输出结果

In [6]:
x = np.random.rand(1000*10).reshape(1000, 10)
y = np.random.rand(1000)
ex_1 = BestSubsetsReg(x, y)
ex_1.reg()
ex_1.Cp_AIC()
ex_1.CVreg()

[array([False, False, False, False, False, False,  True, False, False,
       False])]
[array([False, False, False, False, False, False,  True, False, False,
       False])]


[array([False, False, False, False, False, False,  True, False, False,
        False])]

In [5]:
import os
import sys
os.chdir("E:/Data Mining/0306/Best Subset Regression")
ex_2_x = np.loadtxt("./prostate/x.txt", delimiter=",")
ex_2_y = np.loadtxt("./prostate/y.txt", delimiter=",")

ex_2 = BestSubsetsReg(ex_2_x, ex_2_y)
ex_2.reg()
ex_2.Cp_AIC()
ex_2.CVreg()

[array([False,  True, False, False,  True,  True,  True, False])]
[array([False,  True, False, False,  True,  True,  True, False])]


[array([False,  True, False, False,  True,  True,  True, False])]