
# 1 模型过拟合与欠拟合

## 1.1 基础代码

导入工具包，用于模型验证和数据处理。

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # 可视化 用于创建各种类型的统计图形

from scipy import stats # 用于统计分析

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression  #线性回归
from sklearn.neighbors import KNeighborsRegressor  #K近邻回归
from sklearn.tree import DecisionTreeRegressor     #决策树回归
from sklearn.ensemble import RandomForestRegressor #随机森林回归
from sklearn.svm import SVR  #支持向量回归
import lightgbm as lgb #lightGbm模型

from sklearn.model_selection import train_test_split # 切分数据
from sklearn.metrics import mean_squared_error #评价指标

from sklearn.linear_model import SGDRegressor # 随机梯度下降线性回归

读取数据

In [13]:
# Locations of the tab-separated training and test files.
train_data_file = "./data/zhengqi_train.txt"
test_data_file = "./data/zhengqi_test.txt"

# Both files are parsed with identical settings (tab separator, UTF-8).
train_data, test_data = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in (train_data_file, test_data_file)
)

In [14]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V29,V30,V31,V32,V33,V34,V35,V36,V37,target
0,0.566,0.016,-0.143,0.407,0.452,-0.901,-1.812,-2.36,-0.436,-2.114,...,0.136,0.109,-0.615,0.327,-4.627,-4.789,-5.101,-2.608,-3.508,0.175
1,0.968,0.437,0.066,0.566,0.194,-0.893,-1.566,-2.36,0.332,-2.114,...,-0.128,0.124,0.032,0.6,-0.843,0.16,0.364,-0.335,-0.73,0.676
2,1.013,0.568,0.235,0.37,0.112,-0.797,-1.367,-2.36,0.396,-2.114,...,-0.009,0.361,0.277,-0.116,-0.843,0.16,0.364,0.765,-0.589,0.633
3,0.733,0.368,0.283,0.165,0.599,-0.679,-1.2,-2.086,0.403,-2.114,...,0.015,0.417,0.279,0.603,-0.843,-0.065,0.364,0.333,-0.112,0.206
4,0.684,0.638,0.26,0.209,0.337,-0.454,-1.073,-2.086,0.314,-2.114,...,0.183,1.078,0.328,0.418,-0.843,-0.215,0.364,-0.28,-0.028,0.384


归一化处理

In [15]:
from sklearn import preprocessing

# 1. Every column except the label is treated as a feature.
features_columns = [col for col in train_data.columns if col != 'target']

# 2-3. Learn the per-feature min/max on the training set only.
min_max_scaler = preprocessing.MinMaxScaler().fit(train_data[features_columns])

# 4-5. Apply the training-set scaling to both sets and wrap the resulting
#      arrays back into DataFrames that carry the feature names.
train_data_scaler = pd.DataFrame(
    min_max_scaler.transform(train_data[features_columns]),
    columns=features_columns,
)
test_data_scaler = pd.DataFrame(
    min_max_scaler.transform(test_data[features_columns]),
    columns=features_columns,
)

# 6. Re-attach the (unscaled) label column to the scaled training frame.
train_data_scaler['target'] = train_data['target']

PCA处理，特征降维

In [16]:
from sklearn.decomposition import PCA  # principal component analysis

# Reduce the scaled features to 16 principal components.
# The projection is fitted on the training features only (every column of
# train_data_scaler except the trailing `target`) and then reused unchanged
# for the test features.
pca = PCA(n_components=16)
new_train_pca_16 = pd.DataFrame(pca.fit_transform(train_data_scaler.iloc[:, :-1]))
new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))

# Put the label back next to the reduced training features.
new_train_pca_16['target'] = train_data_scaler['target']

In [17]:
# Use the 16-component PCA data; any missing value is replaced by 0.
new_train_pca_16 = new_train_pca_16.fillna(0)

# Label column, and the feature columns (exactly the columns that the PCA
# test frame has, i.e. everything except `target`).
target = new_train_pca_16['target']
train = new_train_pca_16[new_test_pca_16.columns]

# Hold-out split: 80% for fitting, 20% for validation.
# NOTE: this rebinds `train_data` / `test_data`, which until now held the
# raw frames read from disk.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0
)

## 1.2 欠拟合

In [18]:
# Plain linear model fitted by SGD on the PCA features, with a loose
# tolerance and a low iteration cap.
clf = SGDRegressor(max_iter=500, tol=1e-2).fit(train_data, train_target)

# MSE on the data used for fitting vs. on the held-out validation split.
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))

print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.15120805945139779
SGDRegressor test MSE:    0.15557353657705472


#### 代码解释
SGDRegressor 是一种基于梯度下降的线性回归模型，使用随机梯度下降算法进行参数估计，适用于大规模数据集和高维特征。
与传统的批量梯度下降不同，随机梯度下降每次迭代只使用一个样本或一小批样本来更新模型参数，从而减少了内存消耗和计算复杂度。

主要参数：
- loss: 损失函数的类型。可选参数有 'squared_error'（平方损失，默认；scikit-learn 1.0 之前名为 'squared_loss'）、'huber'（Huber 损失）、'epsilon_insensitive'（ϵ-insensitive 损失）等。
- penalty: 正则化项的类型。可选参数有 'l2'（L2 正则化，默认）、'l1'（L1 正则化）、'elasticnet'（弹性网正则化）等。
- alpha: 正则化项的惩罚力度。默认为0.0001。
- max_iter: 最大迭代次数。默认为1000。
- learning_rate: 学习率的调度策略（学习率的初始大小由 eta0 控制）。可选参数有 'constant'（恒定学习率）、'optimal'（最优学习率）、'invscaling'（逆标度学习率，默认）等。
- eta0: 初始学习率。默认为0.01。

主要方法：
- fit(X, y): 使用训练数据训练模型。
- predict(X): 对新的输入数据进行预测。
- score(X, y): 返回模型在给定测试数据上的 R^2 分数。


在 SGDRegressor 中，`tol` 是用来控制迭代停止条件的参数。tol（tolerance）表示容忍度：如果连续 `n_iter_no_change` 轮（默认 5 轮）迭代中，损失都没有比历史最优损失至少改善 tol（即 loss > best_loss - tol），算法就会认为模型已经收敛，并且提前结束迭代，不再继续优化。`tol=1e-2` 表示容忍度为 0.01，也就是说，当损失的改善幅度持续小于 0.01 时训练会提前停止。

## 1.3 过拟合

In [19]:
from sklearn.preprocessing import PolynomialFeatures  # polynomial feature expansion

# Expand the PCA features with all polynomial terms up to degree 5; the
# expansion is fitted on the training split and reused for validation.
poly = PolynomialFeatures(degree=5).fit(train_data)
train_data_poly = poly.transform(train_data)
test_data_poly = poly.transform(test_data)

# Same linear SGD model as before, now on the expanded feature space.
clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data_poly, train_target)

score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))

print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.1331534338938716
SGDRegressor test MSE:    0.14569967267109368


## 1.4 正常拟合

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial expansion, fitted on the training split only.
poly = PolynomialFeatures(degree=3).fit(train_data)
train_data_poly = poly.transform(train_data)
test_data_poly = poly.transform(test_data)

# Linear SGD model on the expanded features.
clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data_poly, train_target)

# Compare the error on the fitting data with the error on held-out data.
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))

print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.13411423459723712
SGDRegressor test MSE:    0.14259016962618046


#### 代码解释
`PolynomialFeatures` 是 scikit-learn（sklearn）库中的一个预处理类，用于生成多项式特征。它可以将原始特征转换为高阶多项式特征，从而扩展特征空间，使模型能够更好地拟合非线性关系。

`PolynomialFeatures` 的主要作用是通过对原始特征进行多项式扩展，引入多项式交互项，从而增加模型的表示能力。对于给定的一组原始特征 x1, x2, ..., xn，`PolynomialFeatures` 将创建由这些特征的所有可能的多项式组合组成的新特征矩阵。

`PolynomialFeatures` 可以生成包括以下几种特征的多项式：

- 指数项：x^d （d 为指定的度数）
- 交叉项：x1^i * x2^j * ... * xn^k （i + j + ... + k 不大于指定的度数）

使用 `PolynomialFeatures` 的步骤如下：

1. 创建 `PolynomialFeatures` 实例，并指定所需的度数。
2. 使用 `fit_transform` 方法将原始特征数据集转换为多项式特征数据集。


# 2 模型正则化

正则化(Regularization)是给需要训练的目标函数加上一些规则（限制），目的是为了防止过拟合。

## 2.1 L2范数正则化
$$
\parallel x\parallel_2=\left(\sum_{i=1}^n\mid x_i\mid^2\right)^{\frac{1}{2}}
$$
又叫欧几里得(Euclid)范数，即向量元素绝对值平方和再进行开方


In [21]:
# Degree-3 polynomial features, fitted on the training split and reused for
# the validation split.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

# L2-regularised SGD regression; `alpha` is the regularisation strength.
# The penalty name must be lowercase ('l2'): recent scikit-learn releases
# validate it case-sensitively and reject 'L2'.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly, train_target)

# MSE on the fitting data vs. the held-out validation data.
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.13405366167472654
SGDRegressor test MSE:    0.1425968901276022


## 2.2 L1范数正则化
$$
\parallel x\parallel_1=\sum_{i=1}^n\mid x_i\mid 
$$
即向量元素绝对值之和

In [22]:
# Degree-3 polynomial features, fitted on the training split and reused for
# the validation split.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

# L1-regularised SGD regression; `alpha` is the regularisation strength.
# The penalty name must be lowercase ('l1'): recent scikit-learn releases
# validate it case-sensitively and reject 'L1'.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001)
clf.fit(train_data_poly, train_target)

# MSE on the fitting data vs. the held-out validation data.
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.1341760756452732
SGDRegressor test MSE:    0.14289689287669896


## 2.3 ElasticNet 联合 L1和L2范数加权正则化

In [23]:
# Degree-3 polynomial expansion, fitted on the training split only.
poly = PolynomialFeatures(degree=3).fit(train_data)
train_data_poly = poly.transform(train_data)
test_data_poly = poly.transform(test_data)

# Elastic-net penalty: a weighted mix of L1 and L2, here with 90% of the
# weight on the L1 term.
clf = SGDRegressor(
    max_iter=1000,
    tol=1e-3,
    penalty='elasticnet',
    l1_ratio=0.9,
    alpha=1e-5,
).fit(train_data_poly, train_target)

score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))

print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.13411468070761007
SGDRegressor test MSE:    0.14267147152205584


#### 代码解释

1. `elasticnet`：是 `SGDRegressor` 的正则化方法之一。Elastic Net 是一种结合了 L1 正则化（Lasso）和 L2 正则化（Ridge）的线性回归模型正则化方法。通过引入两种正则化项，Elastic Net 可以在处理高维数据时具有特征选择的能力，并且可以克服 Lasso 存在的某些限制。默认情况下，`penalty` 参数被设置为 `'l2'`，即使用 L2 正则化；而设置为 `'elasticnet'` 则表示同时使用 L1 和 L2 正则化。

2. `l1_ratio`：这是 Elastic Net 的混合参数，取值范围为 0 到 1 之间。它控制着 L1 正则化在 Elastic Net 中的比例。当 `l1_ratio` 为 0 时，相当于只使用 L2 正则化，而当 `l1_ratio` 为 1 时，相当于只使用 L1 正则化。在 0 和 1 之间的值表示混合使用两种正则化方法。在给定的示例中，`l1_ratio=0.9` 表示 Elastic Net 正则化主要使用 L1 正则化，较少使用 L2 正则化。

# 3 模型交叉验证

## 3.1 简单交叉验证 Hold-out method

In [24]:
# Simple (hold-out) validation
from sklearn.model_selection import train_test_split  # data splitting

# One fixed split: 80% of the rows for fitting, 20% for validation.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0
)

clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)

# Error on the fitting rows vs. error on the held-out rows.
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))

print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.14153466248845575
SGDRegressor test MSE:    0.14704739394570965


## 3.2 K折交叉验证 K-fold CV

In [37]:
# 5-fold cross-validation
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # Features are selected by row position from the underlying array; the
    # label Series is indexed with the same integer arrays (presumably its
    # index is the default 0..n-1, so labels and positions coincide).
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target[train_index], target[test_index]

    # A fresh model per fold, so no fold sees another fold's fit.
    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)

    score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
    score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
    print(k, " 折", "SGDRegressor train MSE:   ", score_train)
    print(k, " 折", "SGDRegressor test MSE:   ", score_test, '\n')

0  折 SGDRegressor train MSE:    0.14981785673534742
0  折 SGDRegressor test MSE:    0.10625916839199373 

1  折 SGDRegressor train MSE:    0.133562712911519
1  折 SGDRegressor test MSE:    0.18214523287043694 

2  折 SGDRegressor train MSE:    0.14710883577575967
2  折 SGDRegressor test MSE:    0.13333375076428863 

3  折 SGDRegressor train MSE:    0.1407417857258413
3  折 SGDRegressor test MSE:    0.16241235381679514 

4  折 SGDRegressor train MSE:    0.13808450887575016
4  折 SGDRegressor test MSE:    0.1648578202917906 



#### 代码详解 -`KFold`函数
`KFold` 是 scikit-learn 库中的一个交叉验证方法，用于划分数据集为 k 折，并生成相应的训练集和测试集索引。

语法如下：

```python
sklearn.model_selection.KFold(n_splits=5, *, shuffle=False, random_state=None)
```

参数说明：
- `n_splits`：表示将数据集划分为几个折（即 k 值），默认为 5。
- `shuffle`（可选）：表示是否在划分之前对数据进行洗牌，默认为 False。如果设置为 True，则会在划分之前对数据进行洗牌以打乱顺序。
- `random_state`（可选）：表示随机数种子，用于指定洗牌时的随机性，仅在 `shuffle=True` 时生效。设置相同的随机数种子可以保证每次划分的结果一致。

常用方法和属性：

- `split(X[, y, groups])`：返回一个生成器对象，用于生成每个折的训练集和测试集索引。
- `get_n_splits([X, y, groups])`：返回划分的折数（即 k 值）。

例子：

```python
import numpy as np
from sklearn.model_selection import KFold

# 创建一个 KFold 对象
kf = KFold(n_splits=5)

# 模拟一个数据集
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# 使用 KFold 进行划分
for train_index, test_index in kf.split(data):
    print("Train:", train_index, "Test:", test_index)
    
for train_i,test_i in kf.split(data):
    print(data[train_i],data[test_i])
    print("---------")
```

输出结果：
```
Train: [2 3 4 5 6 7 8 9] Test: [0 1]
Train: [0 1 4 5 6 7 8 9] Test: [2 3]
Train: [0 1 2 3 6 7 8 9] Test: [4 5]
Train: [0 1 2 3 4 5 8 9] Test: [6 7]
Train: [0 1 2 3 4 5 6 7] Test: [8 9]

[3  4  5  6  7  8  9 10] [1 2]
---------
[1  2  5  6  7  8  9 10] [3 4]
---------
[1  2  3  4  7  8  9 10] [5 6]
---------
[1  2  3  4  5  6  9 10] [7 8]
---------
[1  2  3  4  5  6  7  8] [9 10]
---------
```

在这个例子中，我们先创建了一个 `KFold` 对象 `kf`，将数据集 `data` 划分为 5 折交叉验证。然后，在循环中，我们使用 `kf.split(data)` 生成了每个折的训练集索引 `train_index` 和测试集索引 `test_index`。通过打印这些索引，我们可以看到每个折的训练集和测试集索引。

#### 代码详解 -`enumerate`函数
用于将一个可迭代对象转换为一个枚举对象。它返回一个包含索引和元素的元组的迭代器。

语法如下：

```python
enumerate(iterable, start=0)
```

- `iterable`：表示要进行枚举的可迭代对象，可以是列表、元组、字符串、集合等。
- `start`（可选）：表示索引的起始值，默认为 0。

当对一个可迭代对象使用 `enumerate` 函数时，它会返回一个枚举对象（一种迭代器），每次迭代都会产生一个元组 `(index, element)`，其中 `index` 是当前元素的索引，从 `start` 开始递增，`element` 是对应的元素。

例子：

```python
fruits = ['apple', 'banana', 'orange']

for index, fruit in enumerate(fruits):
    print(index, fruit)
```

输出结果：
```
0 apple
1 banana
2 orange
```

在例子中，我们使用 `enumerate` 对列表 `fruits` 进行枚举。在每次迭代中，`index` 表示元素的索引，`fruit` 表示对应的水果名称。通过打印 `index` 和 `fruit`，我们可以看到每个元素的索引和对应的水果名称。


## 3.3 留一法 LOO CV

In [39]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
# Leave-one-out yields one split per sample; only the first 10 splits
# (k = 0..9) are evaluated here, which zip() with range(10) enforces.
for k, (train_index, test_index) in zip(range(10), loo.split(train)):
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target[train_index], target[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)

    # The test score is computed on the single left-out sample.
    score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
    score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
    print(k, " 个", "SGDRegressor train MSE:   ", score_train)
    print(k, " 个", "SGDRegressor test MSE:   ", score_test, '\n')

0  个 SGDRegressor train MSE:    0.1416580179524758
0  个 SGDRegressor test MSE:    0.010223999832396254 

1  个 SGDRegressor train MSE:    0.14163634051720275
1  个 SGDRegressor test MSE:    0.11740687467896349 

2  个 SGDRegressor train MSE:    0.14160255971754088
2  个 SGDRegressor test MSE:    0.038511164208972805 

3  个 SGDRegressor train MSE:    0.14171424665066576
3  个 SGDRegressor test MSE:    0.0032671144219933086 

4  个 SGDRegressor train MSE:    0.1415871206184853
4  个 SGDRegressor test MSE:    0.011626980208807725 

5  个 SGDRegressor train MSE:    0.1415859083059502
5  个 SGDRegressor test MSE:    0.13684250563109712 

6  个 SGDRegressor train MSE:    0.14146539859028415
6  个 SGDRegressor test MSE:    0.02520351588864328 

7  个 SGDRegressor train MSE:    0.1416141346350202
7  个 SGDRegressor test MSE:    0.0010761193833298227 

8  个 SGDRegressor train MSE:    0.1410039783015725
8  个 SGDRegressor test MSE:    0.08343181857185722 

9  个 SGDRegressor train MSE:    0.14153627511981529
9

## 3.4 留P法 LPO CV

In [40]:
from sklearn.model_selection import LeavePOut

# Each split holds out 10 samples as the test set.
lpo = LeavePOut(10)
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    train_data = train.values[train_index]
    test_data = train.values[test_index]
    train_target = target[train_index]
    test_target = target[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)

    score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
    score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
    print(k, " 10个", "SGDRegressor train MSE:   ", score_train)
    print(k, " 10个", "SGDRegressor test MSE:   ", score_test, '\n')

    # Only the first 10 splits (k = 0..9) are evaluated.
    if k >= 9:
        break

0  10个 SGDRegressor train MSE:    0.14202520800291124
0  10个 SGDRegressor test MSE:    0.04886498124378371 

1  10个 SGDRegressor train MSE:    0.14203752041349277
1  10个 SGDRegressor test MSE:    0.045521294440001324 

2  10个 SGDRegressor train MSE:    0.14198180297296764
2  10个 SGDRegressor test MSE:    0.047209109806969386 

3  10个 SGDRegressor train MSE:    0.14195569188036283
3  10个 SGDRegressor test MSE:    0.05423002521313495 

4  10个 SGDRegressor train MSE:    0.14185028991845955
4  10个 SGDRegressor test MSE:    0.06901799971546987 

5  10个 SGDRegressor train MSE:    0.14201273518218
5  10个 SGDRegressor test MSE:    0.0447478630166654 

6  10个 SGDRegressor train MSE:    0.1419891732237632
6  10个 SGDRegressor test MSE:    0.04953265833338921 

7  10个 SGDRegressor train MSE:    0.14193217655848653
7  10个 SGDRegressor test MSE:    0.05323042636859763 

8  10个 SGDRegressor train MSE:    0.1420557933128271
8  10个 SGDRegressor test MSE:    0.04742538062352217 

9  10个 SGDRegressor tra

#### 留P交叉验证 和 K折交叉验证
区别和特点如下：

- 样本划分：留 P 交叉验证按照固定数量 P 的样本划分为测试集，剩余的样本为训练集；K 折交叉验证按照 K 个折的划分将数据集划分为测试集和训练集。
- 迭代次数：留 P 交叉验证的迭代次数取决于样本组合的可能性，通常较大；K 折交叉验证的迭代次数为 K，通常较小。
- 样本重复：留 P 交叉验证会枚举所有大小为 P 的样本组合作为测试集，当 P > 1 时不同的测试集之间会相互重叠，同一个样本会多次出现在测试集中；K 折交叉验证的各折测试集互不重叠，每个样本恰好作为测试样本出现一次，避免了测试样本的重复。

# 4 模型超参空间及调参

## 4.1 穷举网格搜索

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split  # data splitting

# Rebuild the 80/20 hold-out split (the cross-validation loops above rebound
# these names to per-fold arrays).
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# Every combination of these values (3 x 3) is tried with 5-fold CV.
randomForestRegressor = RandomForestRegressor()
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[1, 2, 3],
)

clf = GridSearchCV(estimator=randomForestRegressor, param_grid=parameters, cv=5)
clf.fit(train_data, train_target)

# Score the fitted search object on the held-out split.
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
print("RandomForestRegressor GridSearchCV test MSE:   ", score_test)

# Names of the per-candidate results recorded by the search.
sorted(clf.cv_results_)

## 4.2 随机参数优化

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split  # data splitting

# 80/20 hold-out split of the PCA features.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# Candidate values; the randomized search samples combinations from this
# space instead of trying all of them.
randomForestRegressor = RandomForestRegressor()
parameters = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[1, 2, 3, 4, 5],
)

clf = RandomizedSearchCV(
    estimator=randomForestRegressor,
    param_distributions=parameters,
    cv=5,
)
clf.fit(train_data, train_target)

# Score the fitted search object on the held-out split.
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
print("RandomForestRegressor RandomizedSearchCV test MSE:   ", score_test)

# Names of the per-candidate results recorded by the search.
sorted(clf.cv_results_)

## 4.3 Lgb 调参

In [None]:
# Base LightGBM regressor whose hyper-parameters are tuned below. It gets its
# own name so that `clf` refers to one kind of object only (the search).
lgb_estimator = lgb.LGBMRegressor(num_leaves=31)

# Grid of candidate values: 3 learning rates x 2 ensemble sizes.
parameters = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split.
clf = GridSearchCV(lgb_estimator, parameters, cv=5)
clf.fit(train_data, train_target)

print('Best parameters found by grid search are:', clf.best_params_)

# Score the fitted search object on the held-out split. The label names
# GridSearchCV because that is the search actually used here (it previously
# said RandomizedSearchCV).
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("LGBMRegressor GridSearchCV test MSE:   ", score_test)

## 4.4 Lgb 线下验证

In [None]:
# Reload the raw (unscaled, non-PCA) data for the offline LightGBM check.
# The paths are the ones defined in the data-loading cell at the top of the
# notebook, so both loads read the same files (the previous hard-coded
# './zhengqi_*.txt' paths pointed outside the './data/' directory).
train_data2 = pd.read_csv(train_data_file, sep='\t')
test_data2 = pd.read_csv(test_data_file, sep='\t')

# Feature matrix: the training columns that also exist in the test file.
train_data2_f = train_data2[test_data2.columns].values
# Label vector.
train_data2_target = train_data2['target'].values

In [None]:
# LightGBM model, validated offline with K-fold cross-validation
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np


# 5-fold cross-validation with shuffling.
# KFold takes only the number of splits; the data itself is passed to
# kf.split() below. (Passing the sample count as the first positional
# argument collides with `n_splits` and raises a TypeError.)
Folds = 5
kf = KFold(n_splits=Folds, random_state=100, shuffle=True)

# Per-fold training and validation MSE.
MSE_DICT = {
    'train_mse': [],
    'test_mse': []
}

# Offline training and prediction, one model per fold.
for i, (train_index, test_index) in enumerate(kf.split(train_data2_f)):
    # Gradient-boosted tree regressor.
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=100,
        boosting_type='gbdt',
        random_state=100,
        objective='regression',
    )

    # Split this fold into its fitting part and its validation part.
    X_train_KFold, X_test_KFold = train_data2_f[train_index], train_data2_f[test_index]
    y_train_KFold, y_test_KFold = train_data2_target[train_index], train_data2_target[test_index]

    # Fit while monitoring both parts. Early stopping and periodic logging
    # are passed as callbacks: the `early_stopping_rounds=` / `verbose=`
    # keyword arguments of fit() were removed in LightGBM 4.
    lgb_reg.fit(
        X=X_train_KFold, y=y_train_KFold,
        eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
        eval_names=['Train', 'Test'],
        eval_metric='MSE',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )

    # Predict both parts using the best iteration found during fitting.
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

# Per-fold values followed by their mean.
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

# 5 学习曲线和验证曲线

## 5.1 学习曲线

In [None]:
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection 
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator, X, y, cv, n_jobs, train_sizes
        Forwarded unchanged to ``sklearn.model_selection.learning_curve``.
    title : str
        Title of the figure.
    ylim : tuple of (low, high), optional
        If given, unpacked into ``plt.ylim``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean over CV splits, std over CV splits, colour, label).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()

    # Shaded band of one standard deviation around each mean curve.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # The mean curves themselves, drawn on top of the bands.
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


# Raw feature matrix (training columns that also exist in the test file)
# and label vector.
X = train_data2[test_data2.columns].values
y = train_data2['target'].values


# The estimator evaluated below is SGDRegressor, so the figure is titled
# accordingly.
title = "SGDRegressor"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
# ShuffleSplit takes only the split configuration; the data is supplied when
# the splitter is used. (Passing the sample count as the first positional
# argument collides with `n_splits` and raises a TypeError.)
cv = model_selection.ShuffleSplit(n_splits=100,
                                  test_size=0.2, random_state=0)

estimator = SGDRegressor()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1)


## 5.2 验证曲线

In [None]:
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve

# Raw feature matrix (training columns that also exist in the test file)
# and label vector.
X = train_data2[test_data2.columns].values
y = train_data2['target'].values
# Reference configuration: max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001

# Regularisation strengths to compare.
param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

# R^2 on the fitting folds and on the validation folds for every alpha,
# using 10-fold cross-validation. The penalty name must be lowercase ('l1'):
# recent scikit-learn releases validate it case-sensitively and reject 'L1'.
train_scores, test_scores = validation_curve(
    SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'), X, y, param_name="alpha", param_range=param_range,
    cv=10, scoring='r2', n_jobs=1)

# Mean and spread across the CV folds for each alpha.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SGDRegressor")
plt.xlabel("alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
# alpha spans several orders of magnitude, hence the logarithmic x axis.
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()