## 下一跳预测

In [None]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [None]:
def process_data(file_path, output_path, weather_path='./data/weather.csv'):
    """Enrich a trajectory CSV with time, coordinate, holiday and weather columns.

    The input CSV must contain a ``time`` column and a ``coordinates`` column
    whose cells are Python-literal pairs. The weather CSV must contain the
    ``Date`` and ``Day`` columns.

    Args:
        file_path: Path of the trajectory CSV to read.
        output_path: Path of the CSV to write; a missing parent directory is
            created when this is a filesystem path.
        weather_path: Path of the weather CSV joined on the calendar date.

    Returns:
        None. The processed table is written to ``output_path``.

    Raises:
        ValueError, SyntaxError: If a ``coordinates`` cell is not a plain
            Python literal.
    """
    # Local imports keep this function self-contained.
    import ast
    import os

    def _parse_coordinates(raw):
        """Parse one coordinate literal; missing cells become [None, None]."""
        if pd.isna(raw):
            return [None, None]
        # literal_eval only accepts literals, so CSV content cannot execute
        # arbitrary code the way eval() would.
        return ast.literal_eval(raw)

    df = pd.read_csv(file_path)

    # 1. Express the timestamp as seconds elapsed since the earliest record.
    df['time'] = pd.to_datetime(df['time'])
    base_time = df['time'].min()
    df['time_offset'] = (df['time'] - base_time).dt.total_seconds()

    # 2. Split the coordinate pair into longitude / latitude columns.
    df[['longitude', 'latitude']] = df['coordinates'].apply(_parse_coordinates).tolist()

    # 3. Flag records dated October 1st-7th as holiday (1) and all others as 0.
    in_october = df['time'].dt.month == 10
    in_first_week = df['time'].dt.day.between(1, 7)
    df['holiday'] = (in_october & in_first_week).astype(int)

    # 4. Attach the weather of the same calendar day (left join keeps every record).
    weather_df = pd.read_csv(weather_path)
    weather_df['Date'] = pd.to_datetime(weather_df['Date'])
    df['date'] = pd.to_datetime(df['time'].dt.date)
    df = pd.merge(df, weather_df, left_on='date', right_on='Date', how='left')

    # Drop the join helpers and any index columns left over from earlier exports.
    df.drop(columns=['Date', 'Day', 'date'], inplace=True)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Make sure the destination directory exists before writing.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(output_path, index=False)
    
# Run the same preprocessing on both input files: the trajectory file is written to
# task4_train_data.csv and the jump-task file to task4_todo.csv.
process_data('./data/traj.csv', './product_data/task4_train_data.csv')
process_data('./data/jump_task.csv', './product_data/task4_todo.csv')

In [None]:
# Seed NumPy and PyTorch with one shared value so that runs are repeatable.
seed = 114514
np.random.seed(seed)
torch.manual_seed(seed)

# Use the GPU when one is available and make it the default device for new tensors.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
torch.set_default_device(device)

In [None]:
class BiLSTMPredictor(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear read-out.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per direction.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence hidden_size * 2.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Predict from the LSTM output at the last time step.

        Args:
            x: Batch-first input tensor, indexed as (batch, time, feature).

        Returns:
            Tensor with one row of ``output_size`` values per batch element.
        """
        # One initial hidden/cell state per direction. They are created on the
        # input's own device and dtype, so the module does not depend on a
        # module-level `device` variable.
        state_shape = (2, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # NOTE(review): at the last time step the backward direction has only
        # processed the final element of the sequence.
        return self.fc(out[:, -1, :])

In [None]:
from transformers import BertModel, BertConfig

# Load the configuration published under the name "bert-base-uncased".
config = BertConfig.from_pretrained("bert-base-uncased")
# NOTE(review): BertModel(config) builds the network from the configuration alone; it looks like
# no pretrained weights are loaded here (that would be BertModel.from_pretrained) — confirm this is intended.
bert_model = BertModel(config)

class BertPredictor(nn.Module):
    """Linear input projection, an encoder, and a linear read-out of the last position.

    Args:
        pretrained_model: Encoder called with ``inputs_embeds=``; it must expose
            ``config.hidden_size`` and return an object with ``last_hidden_state``.
        output_dim: Number of values predicted per sequence.
        input_dim: Number of features per time step (2 by default, i.e. [x, y]).
        embedding_dim: Width of the projected inputs handed to the encoder.
    """

    def __init__(self, pretrained_model, output_dim, input_dim=2, embedding_dim=768):
        super().__init__()
        # Project each raw feature vector to the width consumed by the encoder.
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.encoder = pretrained_model
        encoder_width = self.encoder.config.hidden_size
        self.fc = nn.Linear(encoder_width, output_dim)

    def forward(self, x):
        """Return the prediction computed from the encoder state of the last position."""
        projected = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        encoded = self.encoder(inputs_embeds=projected)
        final_state = encoded.last_hidden_state[:, -1, :]
        return self.fc(final_state)

In [None]:
def createSequence(df, features, window_size):
    """Cut every trajectory into sliding-window (input sequence, next point) samples.

    Each window covers ``window_size`` consecutive rows of one trajectory: the
    first ``window_size - 1`` rows form the input sequence and the last row is
    the label. Trajectories shorter than ``window_size`` yield no sample.

    Args:
        df: DataFrame with a ``trajectory_id`` column and the feature columns.
        features: Names of the feature columns, in the order used in the tensors.
        window_size: Number of rows per window (inputs plus label), at least 2.

    Returns:
        Tuple ``(seq, label)`` of float32 tensors on ``device`` with shapes
        ``(n_samples, window_size - 1, n_features)`` and ``(n_samples, n_features)``.

    Raises:
        ValueError: If ``window_size`` is smaller than 2.
    """
    if window_size < 2:
        raise ValueError("window_size must be at least 2 (one input step plus one label)")

    n_steps = window_size - 1
    seq = []
    label = []
    # groupby visits each trajectory once, in sorted id order, instead of
    # re-scanning the whole frame with a boolean mask per trajectory.
    for _, trajectory in df.groupby('trajectory_id'):
        # A trajectory with exactly window_size rows still yields one sample.
        if len(trajectory) < window_size:
            continue
        points = np.asarray(trajectory[features].values, dtype=np.float32)
        for start in range(len(points) - window_size + 1):
            seq.append(points[start:start + n_steps])
            label.append(points[start + n_steps])

    if seq:
        seq_array = np.array(seq, dtype=np.float32)
        label_array = np.array(label, dtype=np.float32)
    else:
        # Keep the documented rank even when no trajectory is long enough.
        seq_array = np.empty((0, n_steps, len(features)), dtype=np.float32)
        label_array = np.empty((0, len(features)), dtype=np.float32)

    seq = torch.tensor(seq_array, dtype=torch.float32).to(device)
    label = torch.tensor(label_array, dtype=torch.float32).to(device)
    return seq, label

In [None]:
def calc_rmse(predictions, targets):
    """Return the root-mean-square error over the first three columns.

    Only columns 0-2 of both 2-D tensors are compared; inputs with fewer than
    three columns are compared on all the columns they have.
    """
    residual = predictions[:, :3] - targets[:, :3]
    return residual.pow(2).mean().sqrt()

In [None]:
def trainModel(train_X, train_Y, val_X, val_Y, model,
               lr=1e-2, epoch_num=20, logging_steps=5):
    """Train ``model`` with AdamW on an MSE loss, then score it on the validation set.

    Args:
        train_X: Training input tensor.
        train_Y: Training label tensor.
        val_X: Validation input tensor.
        val_Y: Validation label tensor.
        model: The ``nn.Module`` to optimise in place.
        lr: Learning rate handed to AdamW.
        epoch_num: Number of passes over the training set.
        logging_steps: The mean epoch loss is printed every ``logging_steps`` epochs.

    Returns:
        Tuple ``(model, val_loss, val_rmse)``: the trained model (left in eval
        mode), the validation MSE and the validation RMSE from ``calc_rmse``,
        both as Python floats.
    """
    # Batch the data. The generators are created on `device` so that shuffling
    # works when that device is also the default tensor device.
    train_set = TensorDataset(train_X, train_Y)
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True, generator=torch.Generator(device=device))
    val_set = TensorDataset(val_X, val_Y)
    val_loader = DataLoader(val_set, batch_size=32, shuffle=False, generator=torch.Generator(device=device))

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    for epoch in tqdm(range(epoch_num)):
        # Enable training-only layers such as dropout.
        model.train()
        # Accumulate in a separate float: reusing the batch-loss variable as
        # the accumulator made the logged epoch loss meaningless.
        running_loss = 0.0
        for index, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()

            if index % 150 == 0:
                print(f"iter: {index}")

            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        if epoch % logging_steps == (logging_steps - 1):
            print(f"Epoch {epoch + 1}, loss: {running_loss / len(train_loader):.6f}")

    # Evaluate on the validation set with dropout etc. disabled and no autograd graph.
    model.eval()
    preds = []
    with torch.no_grad():
        for inputs, _ in val_loader:
            preds.append(model(inputs))

    preds = torch.cat(preds, dim=0)
    loss = criterion(preds, val_Y)
    rmse = calc_rmse(preds, val_Y)

    return model, loss.item(), rmse.item()

### 训练

In [None]:
# All candidate feature combinations; each inner list names the columns fed to the model.
all_features = [
    ['longitude', 'latitude', 'holiday', 'High Temp', 'Low Temp', 'Rain', 'Wind Force'],
    ['longitude', 'latitude'],
]

# All candidate model classes. The search loop constructs models by position
# (index 0 -> BertPredictor, index 1 -> BiLSTMPredictor), so the order matters.
# The first entry used to be `TrajectoryPredictor`, a name that is not defined
# in this notebook and raised a NameError.
all_model_types = [
    BertPredictor,
    BiLSTMPredictor,
]

In [None]:
# Split into train : validation : test == 6 : 2 : 2
def split_dataset(df, features, window_size):
    """Build the sliding-window samples and split them 60 / 20 / 20.

    Returns:
        ``(train_seq, train_label, val_seq, val_label, test_seq, test_label)``.
    """
    seq, label = createSequence(df, features, window_size)

    # Stage 1: keep 60 % for training. Splitting both arrays in one call applies
    # the same shuffled indices to the sequences and to their labels.
    train_seq, held_seq, train_label, held_label = train_test_split(
        seq, label, test_size=0.4, random_state=seed)

    # Stage 2: halve the held-out 40 % into validation and test.
    val_seq, test_seq, val_label, test_label = train_test_split(
        held_seq, held_label, test_size=0.5, random_state=seed)

    return train_seq, train_label, val_seq, val_label, test_seq, test_label

In [None]:
# Rows per sliding window: window_size - 1 input steps plus one label step.
window_size = 15
# Learning rate handed to trainModel.
lr = 1e-2
# Number of training epochs per model.
epoch_num = 200
# trainModel prints the training loss every `logging_steps` epochs.
logging_steps = 40

In [None]:
# Try every feature set / model combination and remember the best one.
best_model = None
best_features = []
min_rmse = 1e+5

df = pd.read_csv('./product_data/task4_train_data.csv')
# Iterate over the feature sets.
for features in all_features:
    train_seq, train_label, val_seq, val_label, test_seq, test_label = split_dataset(df, features, window_size)
    # Size the models from the data: a fixed 2-in / 2-out model cannot process
    # the larger feature set.
    input_dim = train_seq.shape[2]
    output_dim = train_label.shape[1]

    # Iterate over the models (constructed by position in all_model_types).
    for i in range(len(all_model_types)):
        if i == 0:
            # Build a fresh encoder for every run so that weights trained in a
            # previous combination do not leak into this one.
            model = BertPredictor(
                BertModel(config),
                output_dim=output_dim,
                input_dim=input_dim,
                embedding_dim=config.hidden_size,
            ).to(device)
        elif i == 1:
            model = BiLSTMPredictor(input_dim, 108, output_dim).to(device)
        else:
            raise ValueError(f'no constructor registered for model index {i}')

        print(f'current features: {features}', flush=True)
        print(f'current model_type: {type(model)}', flush=True)

        model, val_loss, val_rmse = trainModel(
            train_seq, train_label, val_seq, val_label, model, lr, epoch_num, logging_steps)

        # Evaluate on the held-out test split with dropout etc. disabled.
        model.eval()
        predictions = []
        test_set = TensorDataset(test_seq, test_label)
        test_loader = DataLoader(
            test_set, batch_size=32, shuffle=False, generator=torch.Generator(device=device))

        with torch.no_grad():
            for inputs, _ in test_loader:
                predictions.append(model(inputs))

        predictions = torch.cat(predictions, dim=0)
        # Store a plain float rather than a tensor.
        test_rmse = calc_rmse(predictions, test_label).item()
        print(f'test rmse: {test_rmse:.5f}')
        print('=' * 80, end='\n\n', flush=True)

        # Remember the best model so far.
        # NOTE(review): selection uses the test split, while val_loss / val_rmse
        # go unused; selecting on the validation score would keep the test
        # split unbiased. calc_rmse also compares up to three columns, so
        # scores of the 7-feature and 2-feature sets are not computed over the
        # same columns — confirm this is intended.
        if test_rmse < min_rmse:
            min_rmse = test_rmse
            best_model = model
            best_features = features