In [71]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
plt.rcParams["figure.figsize"] = (20,5) # 设置pyplot绘制的图片大小

In [72]:
lgb.__version__

'4.0.0'

## 数据获取并处理

In [73]:
#load_dataset
lanes = pd.read_csv('../data/Lane.csv')  
light = pd.read_csv('../data/Light_status.csv')
roads = pd.read_csv('../data/Entrance_road.csv') 
flow = pd.read_csv('../data/Flow.csv')

## Light_status.csv

In [74]:
light.head()

Unnamed: 0,CYCLE_START_TIME,STAGE_START_TIME,STAGE_END_TIME,STAGE_LENGTH,GREEN_TIME,GREEN_FLASH_TIME,YELLOW_TIME,ALL_RED_TIME,CHANNELS,LANES,PHASES,LANE_FUNCS
0,2023-08-01 0:01:08,2023-08-01 00:01:08,2023-08-01 00:01:30,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113
1,2023-08-01 0:02:18,2023-08-01 00:02:18,2023-08-01 00:02:40,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113
2,2023-08-01 0:04:38,2023-08-01 00:04:38,2023-08-01 00:05:00,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113
3,2023-08-01 0:05:48,2023-08-01 00:05:48,2023-08-01 00:06:10,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113
4,2023-08-01 0:06:58,2023-08-01 00:06:58,2023-08-01 00:07:20,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113


In [75]:
# 转换成日期属性
light['CYCLE_START_TIME'] = pd.to_datetime(light['CYCLE_START_TIME'])
light['STAGE_START_TIME'] = pd.to_datetime(light['STAGE_START_TIME'])
light['STAGE_END_TIME'] = pd.to_datetime(light['STAGE_END_TIME'])

In [76]:
# 删除GREEN_FLASH_TIME YELLOW_TIME无用列
light.drop('GREEN_FLASH_TIME',axis=1,inplace=True)
light.drop('YELLOW_TIME',axis=1,inplace=True)

In [77]:
# 将LANES列转为对应的1W 1E 2W形式
light['released_lanes'] = light['LANES'].str.split(',') # 用，分割

In [78]:
# 将放行车道号“_”前代表的进口道，1，2，3，4替换为为W，N，E，S。

# 定义一个替换函数 replace_dir
def replace_lanes(lanes):
    replaced = []
    for i in lanes:
        i = i.replace('1_', 'W_')
        i = i.replace('2_', 'N_')
        i = i.replace('3_', 'E_')
        i = i.replace('4_', 'S_')
        
        replaced.append(i)
    
    return replaced

# apply替换
light['released_lanes'] = light['released_lanes'].apply(replace_lanes)

In [79]:
# 调换顺序，改为前面为车道号，后面为进口道，并去掉下划线
light['released_lanes'] = light['released_lanes'].apply(lambda lanes: [lane.replace('_', '')[-1] + lane.replace('_', '')[:-1] for lane in lanes])

In [80]:
# 按LANES分组
lanes_unique = light['LANES'].unique()
lanes_unique

array(['1_2,1_3,3_2,1_4,3_3,3_4', '1_1,3_1', '2_2,2_3,2_4,4_2,4_3,4_4',
       '2_1,4_1'], dtype=object)

In [81]:
# 按LANES分组
ligth_2W3W2E4W3E4E=light[light['LANES']=='1_2,1_3,3_2,1_4,3_3,3_4']
ligth_1W1E=light[light['LANES']=='1_1,3_1']
ligth_2N3N4N2S3S4S=light[light['LANES']=='2_2,2_3,2_4,4_2,4_3,4_4']
ligth_1N1S=light[light['LANES']=='2_1,4_1']

In [82]:
# 删除STAGE_END_TIME，LANES
ligth_2W3W2E4W3E4E.drop(['STAGE_END_TIME','LANES'],axis=1,inplace=True)
ligth_1W1E.drop(['STAGE_END_TIME','LANES'],axis=1,inplace=True)
ligth_2N3N4N2S3S4S.drop(['STAGE_END_TIME','LANES'],axis=1,inplace=True)
ligth_1N1S.drop(['STAGE_END_TIME','LANES'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [83]:
# ligth_2W3W2E4W3E4E
# 删除2023-08-01 00:05:00前的数据
ligth_2W3W2E4W3E4E=ligth_2W3W2E4W3E4E.drop(index=ligth_2W3W2E4W3E4E.index[:3])
ligth_2W3W2E4W3E4E=ligth_2W3W2E4W3E4E.reset_index(drop=True)
# 将CYCLE_START_TIME设置为时间索引
ligth_2W3W2E4W3E4E=ligth_2W3W2E4W3E4E.set_index('CYCLE_START_TIME')
# 对数据进行重新采样，以五分钟为一个时间段，并选择从00:05:00开始的数据：
start_time = pd.to_datetime("2023-08-01 00:05:00")
ligth_2W3W2E4W3E4E = ligth_2W3W2E4W3E4E.resample('5Min',label='left', closed='right').first().loc[start_time:] 

In [84]:
# ligth_1W1E
# 删除2023-08-01 00:05:00前的数据
ligth_1W1E=ligth_1W1E.drop(index=ligth_1W1E.index[:3])
ligth_1W1E=ligth_1W1E.reset_index(drop=True)
# 将CYCLE_START_TIME设置为时间索引
ligth_1W1E=ligth_1W1E.set_index('CYCLE_START_TIME')
# 对数据进行重新采样，以五分钟为一个时间段，并选择从00:05:00开始的数据：
start_time = pd.to_datetime("2023-08-01 00:05:00")
ligth_1W1E = ligth_1W1E.resample('5Min',label='left', closed='right').first().loc[start_time:] 

In [85]:
# ligth_2N3N4N2S3S4S
# 删除2023-08-01 00:05:00前的数据
ligth_2N3N4N2S3S4S=ligth_2N3N4N2S3S4S.drop(index=ligth_2N3N4N2S3S4S.index[:3])
ligth_2N3N4N2S3S4S=ligth_2N3N4N2S3S4S.reset_index(drop=True)
# 将CYCLE_START_TIME设置为时间索引
ligth_2N3N4N2S3S4S=ligth_2N3N4N2S3S4S.set_index('CYCLE_START_TIME')
# 对数据进行重新采样，以五分钟为一个时间段，并选择从00:05:00开始的数据：
start_time = pd.to_datetime("2023-08-01 00:05:00")
ligth_2N3N4N2S3S4S = ligth_2N3N4N2S3S4S.resample('5Min',label='left', closed='right').first().loc[start_time:] 

In [86]:
# ligth_1N1S
# 删除2023-08-01 00:05:00前的数据
ligth_1N1S=ligth_1N1S.drop(index=ligth_1N1S.index[:3])
ligth_1N1S=ligth_1N1S.reset_index(drop=True)
# 将CYCLE_START_TIME设置为时间索引
ligth_1N1S=ligth_1N1S.set_index('CYCLE_START_TIME')
# 对数据进行重新采样，以五分钟为一个时间段，并选择从00:05:00开始的数据：
start_time = pd.to_datetime("2023-08-01 00:05:00")
ligth_1N1S = ligth_1N1S.resample('5Min',label='left', closed='right').first().loc[start_time:] 

## Flow.csv

In [87]:
flow['START_TIME'] = pd.to_datetime(flow['START_TIME'])
flow['END_TIME'] = pd.to_datetime(flow['END_TIME'])

In [88]:
flow['LANE_ARM']=flow['LANE_ID'].astype(str)+flow['ARM_ID']

In [89]:
lane_arm = flow.pop("LANE_ARM")
flow.insert(2,'LANE_ARM',lane_arm)

In [90]:
flow.drop(['LANE_ID','ARM_ID'],axis = 1,inplace = True)

In [91]:
LANE_ARM_uniqe =flow['LANE_ARM'].unique()
LANE_ARM_uniqe

array(['1E', '1N', '1S', '1W', '2E', '2N', '2S', '2W', '3E', '3N', '3S',
       '3W', '4E', '4N', '4S', '4W'], dtype=object)

In [92]:
# 按LANE_ARM分组
flow_1E = flow[flow['LANE_ARM']=='1E']
flow_1N = flow[flow['LANE_ARM']=='1N']
flow_1S = flow[flow['LANE_ARM']=='1S']
flow_1W = flow[flow['LANE_ARM']=='1W']
flow_2E = flow[flow['LANE_ARM']=='2E']
flow_2N = flow[flow['LANE_ARM']=='2N']
flow_2S = flow[flow['LANE_ARM']=='2S']
flow_2W = flow[flow['LANE_ARM']=='2W']
flow_3E = flow[flow['LANE_ARM']=='3E']
flow_3N = flow[flow['LANE_ARM']=='3N']
flow_3S = flow[flow['LANE_ARM']=='3S']
flow_3W = flow[flow['LANE_ARM']=='3W']
flow_4E = flow[flow['LANE_ARM']=='4E']
flow_4N = flow[flow['LANE_ARM']=='4N']
flow_4S = flow[flow['LANE_ARM']=='4S']
flow_4W = flow[flow['LANE_ARM']=='4W']

In [93]:
flow_LANE_ARM = [flow_1E
,flow_1N
,flow_1S
,flow_1W
,flow_2E
,flow_2N
,flow_2S
,flow_2W
,flow_3E
,flow_3N
,flow_3S
,flow_3W
,flow_4E
,flow_4N
,flow_4S
,flow_4W]

In [94]:
flow_1E = flow_1E.set_index('START_TIME')
flow_1N = flow_1N.set_index('START_TIME')
flow_1S = flow_1S.set_index('START_TIME')
flow_1W = flow_1W.set_index('START_TIME')
flow_2E = flow_2E.set_index('START_TIME')
flow_2N = flow_2N.set_index('START_TIME')
flow_2S = flow_2S.set_index('START_TIME')
flow_2W = flow_2W.set_index('START_TIME')
flow_3E = flow_3E.set_index('START_TIME')
flow_3N = flow_3N.set_index('START_TIME')
flow_3S = flow_3S.set_index('START_TIME')
flow_3W = flow_3W.set_index('START_TIME')
flow_4E = flow_4E.set_index('START_TIME')
flow_4N = flow_4N.set_index('START_TIME')
flow_4S = flow_4S.set_index('START_TIME')
flow_4W = flow_4W.set_index('START_TIME')

In [95]:
merged_flow_1E = pd.merge(flow_1E,ligth_1W1E, left_index=True, right_index=True, how='left')
merged_flow_1N = pd.merge(flow_1N,ligth_1N1S, left_index=True, right_index=True, how='left')
merged_flow_1S = pd.merge(flow_1S,ligth_1N1S, left_index=True, right_index=True, how='left')
merged_flow_1W = pd.merge(flow_1W,ligth_1W1E, left_index=True, right_index=True, how='left')
merged_flow_2E = pd.merge(flow_2E,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')
merged_flow_2N = pd.merge(flow_2N,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_2S = pd.merge(flow_2S,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_2W = pd.merge(flow_2W,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')
merged_flow_3E = pd.merge(flow_3E,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')
merged_flow_3N = pd.merge(flow_3N,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_3S = pd.merge(flow_3S,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_3W = pd.merge(flow_3W,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')
merged_flow_4E = pd.merge(flow_4E,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')
merged_flow_4N = pd.merge(flow_4N,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_4S = pd.merge(flow_4S,ligth_2N3N4N2S3S4S, left_index=True, right_index=True, how='left')
merged_flow_4W = pd.merge(flow_4W,ligth_2W3W2E4W3E4E, left_index=True, right_index=True, how='left')

In [96]:
df_1E = merged_flow_1E.copy()
df_1N = merged_flow_1N.copy()
df_1S = merged_flow_1S.copy()
df_1W = merged_flow_1W.copy()
df_2E = merged_flow_2E.copy()
df_2N = merged_flow_2N.copy()
df_2S = merged_flow_2S.copy()
df_2W = merged_flow_2W.copy()
df_3E = merged_flow_3E.copy()
df_3N = merged_flow_3N.copy()
df_3S = merged_flow_3S.copy()
df_3W = merged_flow_3W.copy()
df_4E = merged_flow_4E.copy()
df_4N = merged_flow_4N.copy()
df_4S = merged_flow_4S.copy()
df_4W = merged_flow_4W.copy()

In [97]:
df_1E

Unnamed: 0_level_0,LANE_ARM,END_TIME,VOLUMN_5MIN,STAGE_START_TIME,STAGE_LENGTH,GREEN_TIME,ALL_RED_TIME,CHANNELS,PHASES,LANE_FUNCS,released_lanes
START_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-08-01 00:05:00,1E,2023-08-01 00:10:00,2,2023-08-01 00:06:10,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-01 00:10:00,1E,2023-08-01 00:15:00,1,2023-08-01 00:10:50,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-01 00:15:00,1E,2023-08-01 00:20:00,0,2023-08-01 00:15:30,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-01 00:20:00,1E,2023-08-01 00:25:00,1,2023-08-01 00:21:20,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-01 00:25:00,1E,2023-08-01 00:30:00,0,2023-08-01 00:26:00,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
...,...,...,...,...,...,...,...,...,...,...,...
2023-08-27 23:35:00,1E,2023-08-27 23:40:00,1,2023-08-27 23:35:33,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-27 23:40:00,1E,2023-08-27 23:45:00,0,2023-08-27 23:41:23,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-27 23:45:00,1E,2023-08-27 23:50:00,0,2023-08-27 23:46:03,13.0,9.0,1.0,19,49,12,"[1W, 1E]"
2023-08-27 23:50:00,1E,2023-08-27 23:55:00,0,2023-08-27 23:50:43,13.0,9.0,1.0,19,49,12,"[1W, 1E]"


## 合并车流量

In [98]:
LANE_ARM = [
flow_1N
,flow_1S
,flow_1W
,flow_2E
,flow_2N
,flow_2S
,flow_2W
,flow_3E
,flow_3N
,flow_3S
,flow_3W
,flow_4E
,flow_4N
,flow_4S
,flow_4W]

merged_flow = flow_1E['VOLUMN_5MIN']
merged_flow = pd.DataFrame(merged_flow)
for i in LANE_ARM:
    merged_flow['VOLUMN_5MIN'] = merged_flow['VOLUMN_5MIN']+i['VOLUMN_5MIN']
    
merged_flow

Unnamed: 0_level_0,VOLUMN_5MIN
START_TIME,Unnamed: 1_level_1
2023-08-01 00:05:00,7.0
2023-08-01 00:10:00,11.0
2023-08-01 00:15:00,6.0
2023-08-01 00:20:00,9.0
2023-08-01 00:25:00,2.0
...,...
2023-08-27 23:35:00,14.0
2023-08-27 23:40:00,6.0
2023-08-27 23:45:00,2.0
2023-08-27 23:50:00,8.0


# 特征工程

### 延迟特征


#### 局部特征
现在假设要预测的目标是7550。先抽取局部特征。使用历史数据中最后的7个数据构造特征

In [99]:
target_day = 7550

#使用历史数据中最后的7个构造特征
local_range = 7

# 由于使用前7549个的数据预测第7550个，历史数据与预测目标的距离只有1个单位，因此predict_distance=1
# 如果使用前7549个的数据预测第7550个，则历史数据与预测目标的距离有2个单位，因此predict_distance=2，以此类推
predict_distance = 1

In [100]:
def get_local_features(data,target_day, predict_distance):
    local_features = pd.DataFrame()
    for i in range(local_range):
        selected_data = data.iloc[target_day-predict_distance-i-1,0] #iloc基于行号，列号。注意索引-1
        local_features.loc['la_'+str(i+1),'flow']=selected_data # loc行标签，列标签
    return local_features

get_local_features(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
la_1,11.0
la_2,8.0
la_3,2.0
la_4,6.0
la_5,14.0
la_6,11.0
la_7,12.0


这里我们抽取了七个历史值。
对于历史值的聚合，我们还可以用一个小技巧得到更稳定的特征。
对于单个的历史值，或多或少都有些随机因素，具有较大的不确定性，例如某天天气不好，销量突然下降。
实际上，我们可以用连续几历史数据的加和（或均值），用于减缓不确定性带来的影响。
更具体来说，我们可以用前一个的历史值、前面两个的历史值的和、等等来作为局部特征。
用代码表示的话，即

In [101]:
def get_local_accumulated_features(data,target_day, predict_distance):
    local_accumulated_feature = pd.DataFrame()
    local_accumulated_feature.loc['la_1','flow'] = data.iloc[target_day-predict_distance-1,0]
    for i in range(1,local_range):
        selected_data = data.iloc[target_day-predict_distance-i-1,0] #iloc基于行号，列号。注意索引-1
        local_accumulated_feature.loc['la_'+str(i+1),'flow']=selected_data+local_accumulated_feature.loc['la_'+str(i),'flow'] # loc行标签，列标签
    return local_accumulated_feature

get_local_accumulated_features(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
la_1,11.0
la_2,19.0
la_3,21.0
la_4,27.0
la_5,41.0
la_6,52.0
la_7,64.0


注意到，我们现在只用上了近期的历史数据。还有很多远期的历史数据没用上。
实际上远期的历史数据也是需要的，只不过不需要那么精细，可以做一些聚合。
例如过去14个数据，过去30个数据的总和。为了更快的实现这个，我们先用cumsum 滚动累计每一个历史值。 然后抽取我们需要的数值。

In [102]:
def get_accumulated_features(data,target_day, predict_distance):
    used_history_distances = [1, 2, 3, 4, 5, 6, 7, 8,9,10,14, 21, 28, 42, 56,63,70]

    tx = data[target_day-predict_distance+1-max(used_history_distances)-1:target_day-predict_distance+1][::-1].cumsum(axis=0)
    #tx = tx[::-1]

    local_accumulated_feature = pd.DataFrame()
    for distance in used_history_distances:
        local_accumulated_feature.loc['la_'+str(distance),'flow']=tx.iloc[distance-1,0] # loc行标签，列标签
    return local_accumulated_feature

get_accumulated_features(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
la_1,11.0
la_2,19.0
la_3,21.0
la_4,27.0
la_5,41.0
la_6,52.0
la_7,64.0
la_8,70.0
la_9,81.0
la_10,91.0


这样我们就从历史序列里的最近的70个数据，构造出了上面的17个特征。

#### 周期特征
现在我们来看周期特征。我们主要考虑以天作为周期,即288个历史数据。并且，我们选用288\*7个历史值，也就是过去7天的数据，构造周期特征。因此，我们先取得和目标预测值**同周期**的历史数据。即往前第288\*1个，第288\*2个，第288\*3个.....当时的数据。

In [104]:
def get_period_sale(data,target_day, predict_distance):
    period = 288
    i_start = (predict_distance + period - 1) // period # 表示距离目标日期 predict_distance 天之前的第几个时间段，为1
    period_sale = pd.DataFrame()
    for i in range(7): # 用过去7天的数据
        cur_day = target_day - (i + i_start) * period
        period_sale.loc['p_'+str(i + 1),'flow'] = data.iloc[cur_day,0]
    return period_sale

get_period_sale(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
p_1,10.0
p_2,8.0
p_3,9.0
p_4,8.0
p_5,10.0
p_6,4.0
p_7,6.0


然后，一样的，我们也使用累计的历史值，来提高稳定性。
因此，用cumsum得到累计值

In [105]:
def get_period_features(data,target_day, predict_distance):
    tx_period = get_period_sale(data,target_day, predict_distance)
    tx_period = tx_period.cumsum(axis=0)
    return tx_period

get_period_features(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
p_1,10.0
p_2,18.0
p_3,27.0
p_4,35.0
p_5,45.0
p_6,49.0
p_7,55.0


#### 特征结合

综上，以下是我们基于历史数据构造出的所有特征。

In [106]:
def get_history_features(data,target_day, predict_distance):
    return pd.concat([get_accumulated_features(data,target_day, predict_distance),
                      get_period_features(data,target_day, predict_distance)], axis=0)

get_history_features(merged_flow,target_day, predict_distance)

Unnamed: 0,flow
la_1,11.0
la_2,19.0
la_3,21.0
la_4,27.0
la_5,41.0
la_6,52.0
la_7,64.0
la_8,70.0
la_9,81.0
la_10,91.0


# 构造训练数据

训练集X：窗口大小+构造的特征
训练集y：下一个

In [107]:
import pandas as pd
data = merged_flow.copy()
data = data.rename(columns={'VOLUMN_5MIN': 'flow'})

In [108]:

# 设置时间窗口大小
window_size = 1

In [109]:

# 滑动时间窗口生成样本
samples = []
for i in range(288*7,len(data) - window_size):
    window_data = data.iloc[i:i+window_size] # 
    feature_iwindow = get_history_features(data,i+window_size-1, 1)
    feature = pd.concat([window_data,feature_iwindow], axis=0)
    features = feature['flow'].values # 提取特征
    
    target = data.iloc[i+window_size-1]['flow'] # 提取目标值
    samples.append((features, target))

# 划分训练集和测试集
train_samples = samples[:int(0.8 * len(samples))] # 取80%作为训练集
test_samples = samples[int(0.8 * len(samples)):] # 剩余20%作为测试集

# 构建特征矩阵和目标向量
train_X = [sample[0] for sample in train_samples]
train_y = [sample[1] for sample in train_samples]
test_X = [sample[0] for sample in test_samples]
test_y = [sample[1] for sample in test_samples]


In [110]:
train_X=pd.DataFrame(train_X)

In [111]:
train_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,8.0,10.0,19.0,28.0,32.0,38.0,51.0,58.0,68.0,74.0,...,1242.0,1590.0,2229.0,13.0,27.0,38.0,50.0,50.0,56.0,63.0
1,14.0,8.0,18.0,27.0,36.0,40.0,46.0,59.0,66.0,76.0,...,1209.0,1536.0,2136.0,6.0,15.0,21.0,27.0,27.0,35.0,46.0
2,10.0,14.0,22.0,32.0,41.0,50.0,54.0,60.0,73.0,80.0,...,1185.0,1494.0,2044.0,8.0,22.0,32.0,37.0,37.0,43.0,49.0
3,10.0,10.0,24.0,32.0,42.0,51.0,60.0,64.0,70.0,83.0,...,1163.0,1460.0,1944.0,7.0,20.0,30.0,37.0,37.0,51.0,60.0
4,8.0,10.0,20.0,34.0,42.0,52.0,61.0,70.0,74.0,80.0,...,1145.0,1422.0,1876.0,7.0,16.0,25.0,32.0,32.0,38.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4420,2.0,3.0,6.0,6.0,8.0,10.0,12.0,12.0,13.0,14.0,...,317.0,460.0,668.0,0.0,2.0,4.0,4.0,9.0,12.0,18.0
4421,3.0,2.0,5.0,8.0,8.0,10.0,12.0,14.0,14.0,15.0,...,304.0,442.0,634.0,3.0,4.0,5.0,8.0,11.0,14.0,20.0
4422,1.0,3.0,5.0,8.0,11.0,11.0,13.0,15.0,17.0,17.0,...,296.0,420.0,603.0,1.0,4.0,5.0,7.0,8.0,9.0,11.0
4423,2.0,1.0,4.0,6.0,9.0,12.0,12.0,14.0,16.0,18.0,...,276.0,408.0,564.0,1.0,3.0,5.0,8.0,10.0,13.0,17.0


In [112]:
train_y=pd.DataFrame(train_y)
test_X =pd.DataFrame(test_X)
test_y =pd.DataFrame(test_y)

In [113]:
train_X.info()
train_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4425 entries, 0 to 4424
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4424 non-null   float64
 1   1       4424 non-null   float64
 2   2       4424 non-null   float64
 3   3       4424 non-null   float64
 4   4       4424 non-null   float64
 5   5       4424 non-null   float64
 6   6       4424 non-null   float64
 7   7       4424 non-null   float64
 8   8       4424 non-null   float64
 9   9       4424 non-null   float64
 10  10      4424 non-null   float64
 11  11      4424 non-null   float64
 12  12      4424 non-null   float64
 13  13      4424 non-null   float64
 14  14      4424 non-null   float64
 15  15      4424 non-null   float64
 16  16      4424 non-null   float64
 17  17      4424 non-null   float64
 18  18      4424 non-null   float64
 19  19      4424 non-null   float64
 20  20      4424 non-null   float64
 21  21      4424 non-null   float64
 22  

### 调参和训练

首先，导入LightGBM的python包。

接着，创建训练数据和测试数据，代码如下。导入数据集之后，LightGBM会根据超参数，在数据集中加入一些额外的结构信息，例如，哪些特征属于类别特征、特征值离散化的边界等等。创建test_set时，我们需要设置reference=train_set，这使得test_set的结构信息与train_set保持一致。

`feature_pre_filter`默认为`True`，lightgbm会根据min_data_in_leaf的值提前把一些不可能找到合法分割的特征过滤。由于后面会调整min_data_in_leaf，我们不希望反复构造数据集，我们将设置`feature_pre_filter=False`，这样`min_data_in_leaf`的选取不会影响到Dataset的构建。

In [114]:

#train_set = lgb.Dataset(train_X, label=train_y)
#test_set = lgb.Dataset(test_X, label=test_y)

In [115]:
params = {
    "feature_pre_filter": False
}

train_set = lgb.Dataset(train_X, label=train_y, params=params)
test_set = lgb.Dataset(test_X, label=test_y, reference=train_set)

首先使用一组默认的超参数训练一下，观察在测试集上的效果。

In [116]:
params = {
    'objective':'regression' #设置目标函数为regression，将会使用最小均方误差(MSE)作为目标函数。其他超参数保持默认。
}
model = lgb.train(params=params, train_set=train_set, valid_sets=[test_set], valid_names=["test"])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6033
[LightGBM] [Info] Number of data points in the train set: 4425, number of used features: 25
[LightGBM] [Info] Start training from score 47.997740




#### 手动调参

调参时，将train_set中最后一天288个数据作为验证集，其余作为训练集。

In [118]:
num_data_per_day = 288
val_train_data = train_X.iloc[: -num_data_per_day]
val_test_data = train_X.iloc[-num_data_per_day :]
val_train_label = train_y[: -num_data_per_day]
val_test_label = test_y[-num_data_per_day :]

val_train_set = lgb.Dataset(data=val_train_data, label=val_train_label)
val_test_set = lgb.Dataset(data=val_test_data, label=val_test_label, reference=val_train_set)

In [119]:
params = {
    'objective': 'regression',
    'num_trees': 1000
}
early_stopping_rounds = 10

为方便测试，先定义一个直接由超参数得到验证集上结果的函数。

In [120]:
def get_eval_result(params):
    evals_result = {}
    model = lgb.train(params=params,
                      train_set=val_train_set,
                      valid_sets=[val_test_set],
                      valid_names=["val_test"],
                      #verbose_eval=False
                      callbacks=[lgb.log_evaluation(period=100), lgb.early_stopping(stopping_rounds=early_stopping_rounds),lgb.record_evaluation(evals_result)]
                      )
    return evals_result["val_test"]["l2"][model.best_iteration - 1], model.best_iteration

首先，固定学习率和最大迭代次数，调整其他超参数。为了保证能够得到测试集上最优迭代的结果，一开始固定的迭代次数会比较大，这里选取了num_trees=500。这里为了方便展示，我们仅以grid search的方式调整决策树的规模和叶子上最少允许的数据量。读者可自行加入其他超参数一起搜索。

In [121]:
num_leaves_options = [4, 8, 16, 32, 64, 128, 256]
min_data_in_leaf_options = [20, 50, 100, 200, 300, 400, 500, 1000]
for num_leaves in num_leaves_options:
    for min_data_in_leaf in min_data_in_leaf_options:
        try_params = params.copy()
        try_params.update({'num_leaves': num_leaves, 'min_data_in_leaf': min_data_in_leaf})
        l2, best_iteration = get_eval_result(try_params)
        print("best l2 loss %.6f at iteration %d with num_leaves = %d and min_data_in_leaf = %d" % 
              (l2, best_iteration, num_leaves, min_data_in_leaf))



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[3]	val_test's l2: 1288.2
best l2 loss 1288.196288 at iteration 3 with num_leaves = 4 and min_data_in_leaf = 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[3]	val_test's l2: 1288.2
best l2 loss 1288.196288 at iteration 3 with num_leaves = 4 and min_data_in_leaf = 50
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info

Early stopping, best iteration is:
[4]	val_test's l2: 1293.73
best l2 loss 1293.725350 at iteration 4 with num_leaves = 256 and min_data_in_leaf = 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[4]	val_test's l2: 1294.78
best l2 loss 1294.780765 at iteration 4 with num_leaves = 256 and min_data_in_leaf = 50
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[3]	val_test's l2: 1298.66
best l2 loss 1298.661524 at iteration 3 

我们发现，num_leaves=4并且min_data_in_leaf=20的时候结果是最好的。接下来我们调整学习率。

In [122]:
params.update({'num_leaves': 4, 'min_data_in_leaf':400})
learning_rate_options = [0.01, 0.02, 0.03, 0.05, 0.1]
for learning_rate in learning_rate_options:
    try_params = params.copy()
    try_params.update({'learning_rate': learning_rate})
    l2, best_iteration = get_eval_result(try_params)
    print("best l2 loss %.6f at iteration %d with learning_rate = %f" % (l2, best_iteration, learning_rate))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	val_test's l2: 1289.6
best l2 loss 1289.601534 at iteration 37 with learning_rate = 0.010000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in the train set: 4137, number of used features: 25
[LightGBM] [Info] Start training from score 48.073000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[19]	val_test's l2: 1288.93
best l2 loss 1288.925424 at iteration 19 with learning_rate = 0.020000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6032
[LightGBM] [Info] Number of data points in 

可见learning_rate=0.1且迭代次数为4是最好的。我们使用搜索出的最优超参，在全部训练集上进行训练，并观察测试集的结果

In [123]:
# 定义评价指标 SMAPE 函数
ep = 0.0000000001
def smape(y_pred, y_true):
    return np.mean(np.abs(y_pred - y_true) / (  ( np.abs(y_pred) + np.abs(y_true) ) / 2  ) + ep )

In [124]:
params.update({'learning_rate':0.1, 'num_trees':3})
model = lgb.train(params=params,
                      train_set=train_set)
score = model.predict(test_X)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6033
[LightGBM] [Info] Number of data points in the train set: 4425, number of used features: 25
[LightGBM] [Info] Start training from score 47.997740


In [125]:
score

array([36.89403073, 36.89403073, 36.89403073, ..., 36.89403073,
       36.89403073, 36.89403073])

In [126]:
test_y

Unnamed: 0,0
0,2.0
1,3.0
2,2.0
3,3.0
4,2.0
...,...
1102,11.0
1103,14.0
1104,6.0
1105,2.0


In [127]:
test_y = pd.Series(test_y[0])

In [128]:
l2_loss = np.mean((score - test_y) ** 2)
l2_smape_loss = smape(score,test_y)


In [129]:
print("l2_loss:",l2_loss)
print("l2_smape_loss:",l2_smape_loss)

l2_loss: 727.0448864096091
l2_smape_loss: 0.6389564749310152
