In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# 加载数据
data_path = "./merged_output.csv"  # 替换为实际文件路径
data = pd.read_csv(data_path)

# 确保数据类型正确
data["datetime_x"] = pd.to_datetime(data['datetime_x'], format='%Y/%m/%d %H:%M', errors='coerce')
data["datetime_x"] = data['datetime_x'].astype('int64') // 10**9  # 转换为秒级时间戳

# 创建二分类标签：延误为 1，否则为 0
data["delayed"] = (data["arrival_delay"] > 0).astype(int)

# 添加滞后特征
data["lag_arrival_delay"] = data.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")["arrival_delay"].shift(1)
data["lag_departure_delay"] = data.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")["departure_delay"].shift(1)

data = data.dropna(subset=["lag_arrival_delay", "lag_departure_delay"])

# 定义输入特征和目标变量
input_columns = [
    "stop_id", "datetime_x", "temperature", "precipitation", "snowfall",
    "snow_depth", "wind_speed", "cloud_cover", "lag_arrival_delay", "lag_departure_delay"
]
X = data[input_columns]
y = data["delayed"]

# 对类别型特征 stop_id 进行独热编码
X = pd.get_dummies(X, columns=["stop_id"], drop_first=True)

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
data.head()

Unnamed: 0,id,trip_id,stop_sequence,stop_id,arrival_delay,arrival_time,departure_delay,departure_time,datetime_x,date_hour,...,group_id,temperature,precipitation,snowfall,snow_depth,wind_speed,cloud_cover,delayed,lag_arrival_delay,lag_departure_delay
1,14010514476255669,1.401e+16,17,9022001006071004,-86.0,2022-12-02 22:48:34,6.0,2022-12-02 22:50:06,-9223372037,2022-12-02 22,...,90.0,-0.1,0.0,0.0,0.01,11.5,100.0,0,654.0,731.0
2,14010514538271681,1.401e+16,40,9022001041437001,145.0,2022-12-02 22:50:01,145.0,2022-12-02 22:50:01,-9223372037,2022-12-02 22,...,267.0,1.0,0.0,0.0,0.0,18.9,98.0,1,544.0,544.0
5,14010514402573013,1.401e+16,18,9022001004507002,110.0,2022-12-02 22:49:20,150.0,2022-12-02 22:50:00,-9223372037,2022-12-02 22,...,90.0,-0.1,0.0,0.0,0.01,11.5,100.0,1,238.0,281.0
6,14010514492176186,1.401e+16,19,9022001015311001,130.0,2022-12-02 22:49:48,139.0,2022-12-02 22:49:57,-9223372037,2022-12-02 22,...,209.0,-0.4,0.0,0.0,0.01,12.1,100.0,1,394.0,394.0
10,14010514528605893,1.401e+16,31,9022001063953001,209.0,2022-12-02 22:50:03,209.0,2022-12-02 22:50:03,-9223372037,2022-12-02 22,...,494.0,-1.2,0.0,0.0,0.01,11.2,91.0,1,238.0,238.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226529 entries, 1 to 232745
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   226529 non-null  int64  
 1   trip_id              226529 non-null  float64
 2   stop_sequence        226529 non-null  int64  
 3   stop_id              226529 non-null  int64  
 4   arrival_delay        226223 non-null  float64
 5   arrival_time         226223 non-null  object 
 6   departure_delay      226212 non-null  float64
 7   departure_time       226212 non-null  object 
 8   datetime_x           226529 non-null  int64  
 9   date_hour            226529 non-null  object 
 10  stop_name            226472 non-null  object 
 11  stop_lat             226472 non-null  float64
 12  stop_lon             226472 non-null  float64
 13  group_id             226472 non-null  float64
 14  temperature          226472 non-null  float64
 15  precipitation        2

In [4]:
unique_count = data["trip_id"].nunique()
print(f"该列有 {unique_count} 种唯一值")

该列有 2749 种唯一值


In [5]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'  # 使用 logloss 作为评估指标
)
xgb_model.fit(X_train, y_train)

# 预测
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

Parameters: { "use_label_encoder" } are not used.



In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# 计算并输出评估指标
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.80
ROC-AUC: 0.84

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.62      0.67     14696
           1       0.83      0.89      0.86     30610

    accuracy                           0.80     45306
   macro avg       0.78      0.75      0.77     45306
weighted avg       0.80      0.80      0.80     45306



In [10]:
# 使用训练好的模型进行预测
y_pred = xgb_model.predict(X_test)  # 预测结果（0 或 1）
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # 预测延误的概率


# 输出前 10 个预测结果、预测概率和真实值
print("Predicted Results (first 10):", y_pred[:10])
print("Predicted Probabilities (first 10):", y_pred_proba[:10])
print("Actual Values (first 10):", y_test[:10].values)  # 转换为 NumPy 数组输出


Predicted Results (first 10): [1 0 0 1 1 0 0 1 1 1]
Predicted Probabilities (first 10): [0.7881701  0.41524538 0.33577976 0.9301664  0.78095466 0.44782668
 0.12477484 0.89268345 0.8834838  0.9189533 ]
Actual Values (first 10): [1 0 0 1 1 0 0 1 1 1]


In [8]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# 获取特征重要性并排序
importance = xgb_model.get_booster().get_score(importance_type='weight')
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)

# 打印前 10 个重要特征
print("Top 10 Important Features:")
for feature, score in sorted_importance[:10]:
    print(f"{feature}: {score}")


Top 10 Important Features:
lag_arrival_delay: 634.0
lag_departure_delay: 529.0
temperature: 483.0
wind_speed: 378.0
cloud_cover: 285.0
snow_depth: 104.0
precipitation: 22.0
stop_id_9022001044000019: 18.0
snowfall: 16.0
stop_id_9022001004503001: 16.0


In [9]:
# 别跑内存会爆
# from sklearn.model_selection import GridSearchCV

# # 定义参数网格
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [4, 6, 8],
#     'learning_rate': [0.01, 0.1, 0.2]
# }

# # 使用网格搜索和交叉验证
# grid_search = GridSearchCV(
#     estimator=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
#     param_grid=param_grid,
#     scoring='roc_auc',
#     cv=3,
#     verbose=1,
#     n_jobs=-1
# )

# grid_search.fit(X_train, y_train)

# # 输出最佳参数和得分
# print("Best Parameters:", grid_search.best_params_)
# print("Best ROC-AUC Score:", grid_search.best_score_)
