### 梯度式子的推导过程

![%E6%A2%AF%E5%BA%A6%E5%90%91%E9%87%8F%E5%8C%961.png](attachment:%E6%A2%AF%E5%BA%A6%E5%90%91%E9%87%8F%E5%8C%961.png)

### 梯度化简的最终结果

![%E6%A2%AF%E5%BA%A6%E5%90%91%E9%87%8F%E5%8C%963.png](attachment:%E6%A2%AF%E5%BA%A6%E5%90%91%E9%87%8F%E5%8C%963.png)

### 推导结果

In [4]:
import numpy as np
from sklearn import datasets

In [5]:
boston = datasets.load_boston()
X = boston.data
y = boston.target

X = X[y<50.0]
y  = y[y<50.0]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 666)

In [44]:
import numpy as np
from sklearn.metrics import r2_score

class LinearRegression:
    
    def __init__(self):
        """初始化Linear Regression"""
        self.coef_ = None            #系数,theta 1-n
        self.interception_ = None  #截距,theta 0
        self._theta = None           #theta 0-n

    def fit_normal(self, X_train, y_train):
        """根据训练数据集X_train, y_train训练Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        
        X_b = np.hstack( [np.ones( (len(X_train), 1) ), X_train] )      # len() 矩阵横向长度；hstack 横向维度连接
        self._theta = np.linalg.inv( X_b.T.dot(X_b) ).dot(X_b.T).dot(y_train)  #linalg.inv 逆矩阵
        
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        
        return self
    
    
    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """根据训练数据集X_train, y_train, 使用梯度下降法训练 Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y): 
            """损失函数"""
            try:
                return np.sum( ( y -  X_b.dot(theta) ) ** 2 ) / len(X_b)
            except:
                return float('inf')

        def dJ(theta, X_b, y): 
            """损失函数的梯度（偏导）, 向量化"""
#             res = np.empty(len(theta))
#             res[0] = np.sum(X_b.dot(theta) - y)
#             for i in range(1, len(theta)):
#                 res[i] = np.sum( ( X_b.dot(theta) - y ).dot( X_b[ : , i ] ) )
#             return res * 2 / len(X_b)
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)
#     return (2./len(X_b)) * X_b.T.dot( X_b.dot(theta) - y ) #自己写的

        def gredient_descent(X_b, y, initial_theta, eta, epsilon=1e-8, n_iters = 1e4):
            """依据梯度搜索的过程"""
            theta = initial_theta
            cur_iter = 0

            while cur_iter < n_iters:      ### i_iter 执行次数，n_iter 次数上限
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):   #浮点数判断 == 0
                    break

                cur_iter += 1

            return theta

        X_b = np.hstack( [np.ones((len(X_train), 1)), X_train] )    # np.ones((3,5))传进去的是元组，3行5列的1矩阵；或者np.ones(10)
        initial_theta = np.zeros( X_b.shape[1] )

        self._theta = gredient_descent(X_b, y_train, initial_theta, eta)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self
    
    
    def predict(self, X_predict):
        """给定待预测数据集X_predict, 返回X_predict的结果向量"""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict !"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        
        X_b = np.hstack( [np.ones( (len(X_predict), 1) ), X_predict] )
        return X_b.dot(self._theta)
    
    def score(self, X_test, y_test):
        """根据测试数据集X_test 和 y_test确定当前模型的准确度"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)
    
    def __repr__(self):
        return "LinearRegression()"

In [8]:
# 使用线性回归正规化方程(向量化)

lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
lin_reg1.score(X_test, y_test)

CPU times: user 16.6 ms, sys: 18 ms, total: 34.6 ms
Wall time: 69.1 ms


0.8129794056212832

In [82]:
# 使用线性回归梯度下降(向量化)

lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


LinearRegression()

In [10]:
lin_reg2.coef_   # 上述运行会waning的原因就是
                      # eta太大了，导致搜索时走的步子太大了，会恰好跨过合适的点。。
                      # 但是若选用的eta太小，会导致在搜索次数内搜不到最合适的点，导致score值很低，此时可以改n_iter,多搜几次

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [85]:
# 使用线性回归梯度下降(向量化)，改用合适的 eta 和 n_iter

lin_reg2 = LinearRegression()
%time lin_reg2.fit_gd(X_train, y_train, eta=0.0000031, n_iters=1e6)

CPU times: user 3.01 s, sys: 331 ms, total: 3.34 s
Wall time: 3.79 s


LinearRegression()

In [86]:
lin_reg2.score(X_test, y_test)       #### error 这里不知道为什么还是概率很低？？？？？？？？？,感觉需要选择不同起始点搜索
# lin_reg2.score(X_test, y_test)
# 0.4121184591318168

0.4121184591318168

### 数据归一化(可以缓解eta过大过小导致搜索不准的情况)

In [87]:
from sklearn.preprocessing import StandardScaler

In [90]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [92]:
X_train_standard = standardScaler.transform(X_train)  #得到标准化后的X_train_standard

In [93]:
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)

CPU times: user 1.24 s, sys: 125 ms, total: 1.37 s
Wall time: 1.42 s


LinearRegression()

In [94]:
X_test_standard = standardScaler.transform(X_test)  #得到标准化后的X_train_standard

In [95]:
lin_reg3.score(X_test_standard, y_test)

0.8129873310487505

In [None]:
### 上述说明 找到了合适的theta，而且运行速度加快了

### 梯度下降法比正规化方程的优势

In [98]:
m = 1000
n = 5000

big_X = np.random.normal(size = (m,n)) #按照正态分布随机生成，不需要归一化
true_theta = np.random.uniform(0.0, 100.0, size = n+1) #随机生成
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)   #最后是噪音

In [99]:
#正规方程法

big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)

CPU times: user 31.1 s, sys: 1.87 s, total: 33 s
Wall time: 26.1 s


LinearRegression()

In [102]:
#梯度下降法

big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)

CPU times: user 15.5 s, sys: 607 ms, total: 16.1 s
Wall time: 12.1 s


LinearRegression()

In [None]:
## 所以对于正规方程，当mn的矩阵比较大时（因为要进行很多矩阵乘法），比较慢
## 对于梯度下降，当m（样本数）比较大时（因为要每一个样本参与计算？？？），比较慢