# 1 导包和数据读取

In [1]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler

In [2]:
# Load the four raw tables (lanes, signal timing, entrance roads, 5-minute flow).
# The reads happen in the same order as the names on the left-hand side.
lanes_df, light_df, roads_df, flow_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/Lane.csv',
        './data/Light_status.csv',
        './data/Entrance_road.csv',
        './data/Flow.csv',
    )
)

# 2 处理数据

## Lane.csv

In [3]:
# Preview the first rows of the lane table as loaded.
lanes_df.head()

Unnamed: 0,LANE_ID,ARM_ID,LANE_NAME,DIR
0,1,E,左,L
1,1,N,左,L
2,1,S,左,L
3,1,W,左,L
4,2,E,直,S


In [4]:
# Lane.csv: retain only the lane id, arm id, lane name and direction columns
lanes_df = lanes_df.loc[:, ['LANE_ID', 'ARM_ID', 'LANE_NAME', 'DIR']]

In [5]:
# Preview the lane table again after the column selection.
lanes_df.head()

Unnamed: 0,LANE_ID,ARM_ID,LANE_NAME,DIR
0,1,E,左,L
1,1,N,左,L
2,1,S,左,L
3,1,W,左,L
4,2,E,直,S


In [6]:
# Summarize the lane table: column dtypes and non-null counts.
lanes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   LANE_ID    16 non-null     int64 
 1   ARM_ID     16 non-null     object
 2   LANE_NAME  16 non-null     object
 3   DIR        16 non-null     object
dtypes: int64(1), object(3)
memory usage: 640.0+ bytes


## Light_status.csv

In [7]:
# Read Light_status.csv again (the loading cell already read it once) --
# presumably so the processing below always starts from the raw file -- and
# summarize its column dtypes and non-null counts.
light_df = pd.read_csv('./data/Light_status.csv')
light_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105844 entries, 0 to 105843
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   CYCLE_START_TIME  105844 non-null  object
 1   STAGE_START_TIME  105844 non-null  object
 2   STAGE_END_TIME    105844 non-null  object
 3   STAGE_LENGTH      105844 non-null  int64 
 4   GREEN_TIME        105844 non-null  int64 
 5   GREEN_FLASH_TIME  105844 non-null  int64 
 6   YELLOW_TIME       105844 non-null  int64 
 7   ALL_RED_TIME      105844 non-null  int64 
 8   CHANNELS          105844 non-null  object
 9   LANES             105844 non-null  object
 10  PHASES            105844 non-null  object
 11  LANE_FUNCS        105844 non-null  object
dtypes: int64(5), object(7)
memory usage: 9.7+ MB


In [8]:
# Parse the three timestamp columns (cycle start, stage start, stage end) into datetimes
for time_col in ('CYCLE_START_TIME', 'STAGE_START_TIME', 'STAGE_END_TIME'):
    light_df[time_col] = pd.to_datetime(light_df[time_col])

In [19]:
# Drop the GREEN_FLASH_TIME and YELLOW_TIME columns.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell no longer
# raises KeyError on the second execution.
light_df = light_df.drop(columns=['GREEN_FLASH_TIME', 'YELLOW_TIME'], errors='ignore')
light_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105844 entries, 0 to 105843
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   CYCLE_START_TIME  105844 non-null  datetime64[ns]
 1   STAGE_START_TIME  105844 non-null  datetime64[ns]
 2   STAGE_END_TIME    105844 non-null  datetime64[ns]
 3   STAGE_LENGTH      105844 non-null  int64         
 4   GREEN_TIME        105844 non-null  int64         
 5   YELLOW_TIME       105844 non-null  int64         
 6   ALL_RED_TIME      105844 non-null  int64         
 7   CHANNELS          105844 non-null  object        
 8   LANES             105844 non-null  object        
 9   PHASES            105844 non-null  object        
 10  LANE_FUNCS        105844 non-null  object        
 11  released_lanes    105844 non-null  object        
dtypes: datetime64[ns](3), int64(4), object(5)
memory usage: 9.7+ MB


In [9]:
# Process the LANES column
light_df['released_lanes'] = light_df['LANES'].str.split(',') # split the comma-separated string into a list of lane codes

In [10]:
# In each released-lane code, the number before the "_" identifies the entrance
# arm; rewrite 1, 2, 3, 4 as W, N, E, S.

def replace_lanes(lanes):
    """Return a new list with numeric arm prefixes replaced by compass letters.

    Every occurrence of '1_', '2_', '3_' and '4_' in each code is replaced, in
    that order, by 'W_', 'N_', 'E_' and 'S_' (e.g. '1_2' -> 'W_2'). Codes
    containing none of these patterns are returned unchanged.

    Parameters
    ----------
    lanes : iterable of str
        Lane codes such as '3_4'.

    Returns
    -------
    list of str
        The converted codes, in the same order as the input.
    """
    arm_codes = (('1_', 'W_'), ('2_', 'N_'), ('3_', 'E_'), ('4_', 'S_'))

    renamed = []
    for lane in lanes:
        for digit_prefix, letter_prefix in arm_codes:
            lane = lane.replace(digit_prefix, letter_prefix)
        renamed.append(lane)

    return renamed

# Apply the replacement to each row's list of lane codes
light_df['released_lanes'] = light_df['released_lanes'].apply(replace_lanes)

In [11]:
# Inspect the released-lane lists after the arm-number replacement.
light_df['released_lanes']

0         [W_2, W_3, E_2, W_4, E_3, E_4]
1         [W_2, W_3, E_2, W_4, E_3, E_4]
2         [W_2, W_3, E_2, W_4, E_3, E_4]
3         [W_2, W_3, E_2, W_4, E_3, E_4]
4         [W_2, W_3, E_2, W_4, E_3, E_4]
                       ...              
105839                        [N_1, S_1]
105840                        [N_1, S_1]
105841                        [N_1, S_1]
105842                        [N_1, S_1]
105843                        [N_1, S_1]
Name: released_lanes, Length: 105844, dtype: object

In [12]:
# Reorder each code so the lane number comes first and the entrance arm last,
# dropping the underscore (e.g. 'W_2' -> '2W').
# The code is split on the underscore instead of rotating its last character:
# multi-digit lane numbers stay intact ('W_12' -> '12W', not '2W1') and codes
# that were already converted pass through unchanged, so re-running this cell
# does not flip them back.
def swap_arm_and_lane(lane):
    """Return '<lane number><arm>' for a code of the form '<arm>_<lane number>'.

    A code without an underscore is returned unchanged.
    """
    arm, _sep, lane_number = lane.partition('_')
    return lane_number + arm


light_df['released_lanes'] = light_df['released_lanes'].apply(
    lambda lanes: [swap_arm_and_lane(lane) for lane in lanes]
)

In [13]:
# Preview the first rows of the signal-timing table after the lane-code processing.
light_df.head()

Unnamed: 0,CYCLE_START_TIME,STAGE_START_TIME,STAGE_END_TIME,STAGE_LENGTH,GREEN_TIME,GREEN_FLASH_TIME,YELLOW_TIME,ALL_RED_TIME,CHANNELS,LANES,PHASES,LANE_FUNCS,released_lanes
0,2023-08-01 00:01:08,2023-08-01 00:01:08,2023-08-01 00:01:30,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113,"[2W, 3W, 2E, 4W, 3E, 4E]"
1,2023-08-01 00:02:18,2023-08-01 00:02:18,2023-08-01 00:02:40,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113,"[2W, 3W, 2E, 4W, 3E, 4E]"
2,2023-08-01 00:04:38,2023-08-01 00:04:38,2023-08-01 00:05:00,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113,"[2W, 3W, 2E, 4W, 3E, 4E]"
3,2023-08-01 00:05:48,2023-08-01 00:05:48,2023-08-01 00:06:10,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113,"[2W, 3W, 2E, 4W, 3E, 4E]"
4,2023-08-01 00:06:58,2023-08-01 00:06:58,2023-08-01 00:07:20,22,19,0,3,0,112310,"1_2,1_3,3_2,1_4,3_3,3_4",3510,1113,"[2W, 3W, 2E, 4W, 3E, 4E]"


## Entrance_road.csv

In [14]:
# Preview the entrance-road table as loaded.
roads_df.head()

Unnamed: 0,ARM_ID,ARM_NAME,DIR
0,E,东进口道,E
1,N,北进口道,N
2,S,南进口道,S
3,W,西进口道,W


In [15]:
# Entrance_road.csv: only the arm id and the arm name are kept
roads_df = roads_df.loc[:, ['ARM_ID', 'ARM_NAME']]
roads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ARM_ID    4 non-null      object
 1   ARM_NAME  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


## Flow.csv

In [16]:
# Summarize the flow table: column dtypes and non-null counts.
flow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120780 entries, 0 to 120779
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   LANE_ID      120780 non-null  int64 
 1   ARM_ID       120780 non-null  object
 2   START_TIME   120780 non-null  object
 3   END_TIME     120780 non-null  object
 4   VOLUMN_5MIN  120780 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.6+ MB


In [17]:
# Parse the interval boundaries of every flow record into datetimes
for time_col in ('START_TIME', 'END_TIME'):
    flow_df[time_col] = pd.to_datetime(flow_df[time_col])

In [18]:
# Preview the flow table after the timestamp conversion.
flow_df.head()

Unnamed: 0,LANE_ID,ARM_ID,START_TIME,END_TIME,VOLUMN_5MIN
0,1,E,2023-08-01 00:05:00,2023-08-01 00:10:00,2
1,1,E,2023-08-01 00:10:00,2023-08-01 00:15:00,1
2,1,E,2023-08-01 00:15:00,2023-08-01 00:20:00,0
3,1,E,2023-08-01 00:20:00,2023-08-01 00:25:00,1
4,1,E,2023-08-01 00:25:00,2023-08-01 00:30:00,0


## 合并数据

In [34]:
# Attach the lane attributes (name, direction) to every flow record by joining
# on the lane number and its entrance arm.
# Only lanes_df and flow_df are combined here; light_df and roads_df are not
# joined in yet.
merged_df = lanes_df.merge(flow_df, on=['LANE_ID', 'ARM_ID'])

In [35]:
# Look at the first 100,000 merged rows; the rendered table is abbreviated to
# the leading and trailing rows of that slice.
merged_df.head(100000)

Unnamed: 0,LANE_ID,ARM_ID,LANE_NAME,DIR,START_TIME,END_TIME,VOLUMN_5MIN
0,1,E,左,L,2023-08-01 00:05:00,2023-08-01 00:10:00,2
1,1,E,左,L,2023-08-01 00:10:00,2023-08-01 00:15:00,1
2,1,E,左,L,2023-08-01 00:15:00,2023-08-01 00:20:00,0
3,1,E,左,L,2023-08-01 00:20:00,2023-08-01 00:25:00,1
4,1,E,左,L,2023-08-01 00:25:00,2023-08-01 00:30:00,0
...,...,...,...,...,...,...,...
99995,4,N,右,R,2023-08-07 11:15:00,2023-08-07 11:20:00,1
99996,4,N,右,R,2023-08-07 11:20:00,2023-08-07 11:25:00,2
99997,4,N,右,R,2023-08-07 11:25:00,2023-08-07 11:30:00,2
99998,4,N,右,R,2023-08-07 11:30:00,2023-08-07 11:35:00,2


# 3 初步训练模型

In [None]:
# Data preprocessing
#
# Goal: predict a lane's next 5-minute volume from its previous SEQ_LEN volumes.
# The input is merged_df (Lane.csv joined with Flow.csv), which stacks one time
# series per (LANE_ID, ARM_ID) pair. Splitting and windowing are therefore done
# lane by lane, so a window never mixes two lanes or two splits.
# NOTE(review): assumes each lane's rows are consecutive 5-minute intervals
# without gaps -- TODO confirm against Flow.csv.

from keras.models import Sequential
from keras.layers import LSTM, Dense

TARGET_COL = 'VOLUMN_5MIN'         # target column, spelled as in Flow.csv
LANE_KEYS = ['LANE_ID', 'ARM_ID']  # identify one lane, i.e. one time series
VAL_PCT = 0.2    # validation percent
TEST_PCT = 0.2   # test percent
SEQ_LEN = 12     # sequence length: 12 x 5 min = one hour of history
EPOCHS = 100


def split_chronologically(values, val_pct, test_pct):
    """Split a time-ordered 1-D array into (train, val, test) without shuffling.

    The last ``test_pct`` share of the rows becomes the test part, the
    ``val_pct`` share before it the validation part, and the rest is training
    data. Explicit end positions are used so that a zero-sized test part yields
    an empty slice rather than the whole array.
    """
    n_rows = len(values)
    val_size = int(n_rows * val_pct)
    test_size = int(n_rows * test_pct)
    val_end = n_rows - test_size
    train_end = val_end - val_size
    return values[:train_end], values[train_end:val_end], values[val_end:]


def make_windows(values, seq_len):
    """Build supervised samples from one series.

    Returns ``(X, y)`` where ``X`` has shape ``(n, seq_len, 1)`` and ``y[i]`` is
    the value that directly follows window ``X[i]``. A series that is not longer
    than ``seq_len`` yields zero samples.
    """
    values = np.asarray(values, dtype='float32')
    n_windows = max(len(values) - seq_len, 0)
    X = np.empty((n_windows, seq_len, 1), dtype='float32')
    for start in range(n_windows):
        X[start, :, 0] = values[start:start + seq_len]
    y = values[seq_len:seq_len + n_windows]
    return X, y


def build_dataset(segments, fitted_scaler, seq_len):
    """Scale every lane segment and stack the windows of all lanes."""
    X_parts, y_parts = [], []
    for segment in segments:
        if len(segment) <= seq_len:
            continue  # too short to produce a single window
        scaled = fitted_scaler.transform(segment.reshape(-1, 1)).ravel()
        X_lane, y_lane = make_windows(scaled, seq_len)
        X_parts.append(X_lane)
        y_parts.append(y_lane)
    if not X_parts:
        raise ValueError('no lane segment is longer than SEQ_LEN; cannot build sequences')
    return np.concatenate(X_parts), np.concatenate(y_parts)


# One chronologically ordered volume series per lane
lane_series = [
    lane_df.sort_values('START_TIME')[TARGET_COL].to_numpy(dtype='float32')
    for _, lane_df in merged_df.groupby(LANE_KEYS)
]
assert lane_series, 'merged_df has no rows to train on'

# Train/val/test split (per lane, in time order)
lane_splits = [split_chronologically(series, VAL_PCT, TEST_PCT) for series in lane_series]
train_segments = [train for train, _val, _test in lane_splits]
val_segments = [val for _train, val, _test in lane_splits]
test_segments = [test for _train, _val, test in lane_splits]

# Scale target column; the scaler is fitted on training data only so that no
# information from the validation/test periods leaks into it
scaler = MinMaxScaler()
scaler.fit(np.concatenate(train_segments).reshape(-1, 1))

# Create sequences for all three splits
X_train, y_train = build_dataset(train_segments, scaler, SEQ_LEN)
X_val, y_val = build_dataset(val_segments, scaler, SEQ_LEN)
X_test, y_test = build_dataset(test_segments, scaler, SEQ_LEN)

# Build RNN model: input is (timesteps, features) = (SEQ_LEN, 1)
model = Sequential()
model.add(LSTM(64, input_shape=(SEQ_LEN, 1)))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_val, y_val))

# Evaluate on test set
y_pred = model.predict(X_test)
# Inverse transform predictions AND targets so the error is measured in
# vehicles per 5 minutes rather than between mismatched scales
y_pred = scaler.inverse_transform(y_pred)
y_true = scaler.inverse_transform(y_test.reshape(-1, 1))

mse = mean_squared_error(y_true, y_pred)
print('Test MSE:', mse)