<h3>导入包</h3>

In [74]:
import os
import re
import logging
from datetime import datetime, date
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost import DMatrix
import lightgbm as lgb
from lightgbm import Dataset
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm


# Root logger: timestamped records, INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

<h3>读取数据</h3>

In [75]:
# Read the three raw CSV files in a fixed order:
# rainfall/water-level (train), inflow (train), rainfall/water-level (test).
train_rain, train_label, test_train = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/A榜/train/A-雨量水位（2014-2019）.csv",
        "../data/A榜/train/A-入库流量（2014-2019）.csv",
        "../data/A榜/test/A-雨量水位（2020-2021）.csv",
    )
]

# Log the (rows, columns) shape of every table that was just loaded.
logging.info(f"训练集雨量水位数据{train_rain.shape}")
logging.info(f"训练集入库流量数据{train_label.shape}")
logging.info(f"测试集雨量水位数据{test_train.shape}")

2025-03-04 10:11:16,852 : INFO : 雨量水位数据(3252327, 14)
2025-03-04 10:11:16,852 : INFO : 入库流量数据(52561, 14)


<h3>数据预处理</h3>

<h4>时间预处理</h4>

In [76]:
def _add_time_parts(frame):
    """Parse ``frame["TIME"]`` to datetime in place and append calendar columns.

    Adds ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second``
    (in that order), each read from the parsed ``TIME`` column.
    """
    frame["TIME"] = pd.to_datetime(frame["TIME"])
    for part in ("year", "month", "day", "hour", "minute", "second"):
        frame[part] = getattr(frame["TIME"].dt, part)


# Station readings: calendar columns, then order by station and time.
_add_time_parts(train_rain)
train_rain = train_rain.sort_values(["SENID", "TIME"])

# Label table: calendar columns, then order by time.
_add_time_parts(train_label)
train_label = train_label.sort_values("TIME")

# Test table: no calendar columns are added; TIME is only parsed and sorted.
test_train["TIME"] = pd.to_datetime(test_train["TIME"])
test_train = test_train.sort_values("TIME")

<h4>处理异常值</h4>

In [77]:
# Measurement columns in which a negative value is treated as an abnormal reading.
measure_cols = ["V", "AVGV", "MAXV", "MINV", "S", "AVGS", "MAXS", "MINS"]

# Replace negatives with NaN, table by table. Each frame is masked with a
# boolean frame built from *its own* values; the test table is `test_train`
# (there is no `test_rain` variable, so indexing with it raised NameError).
for frame in (train_rain, train_label, test_train):
    frame[frame[measure_cols] < 0] = np.nan

<h3>特征工程</h3>

<h4>不同站点横向拼接</h4>

In [78]:
# Columns that are not carried into the per-station feature frames.
non_feature_cols = ["NAME", "SENID", "MAXT", "MINT", "year", "month", "day", "hour", "minute", "second"]

train_rain_group_by_senids = []
for senid, station_rows in tqdm(train_rain.groupby("SENID")):
    aligned = (
        # Outer-join with the label timestamps so every label time has a row.
        pd.merge(station_rows, train_label[["TIME"]], how="outer", on="TIME")
        .sort_values("TIME")
        .set_index("TIME", drop=True)
        .drop(columns=non_feature_cols)
        # Fill gaps by time-weighted interpolation between the station's own readings.
        .interpolate(method="time")
        # Keep exactly the label timestamps, in label order.
        .reindex(train_label["TIME"])
        # Tag every column with the station id, e.g. V -> V_<senid>.
        .add_suffix(f"_{senid}")
    )
    train_rain_group_by_senids.append(aligned)

  0%|          | 0/109 [00:00<?, ?it/s]

In [79]:
# Put the per-station frames side by side (one block of columns per station).
train_rain_group_by_senid = pd.concat(train_rain_group_by_senids, axis="columns")
train_rain_group_by_senid

Unnamed: 0_level_0,V_210254,AVGV_210254,MAXV_210254,MINV_210254,S_210254,AVGS_210254,MAXS_210254,MINS_210254,SPAN_210254,V_210454,...,SPAN_1510454,V_1610254,AVGV_1610254,MAXV_1610254,MINV_1610254,S_1610254,AVGS_1610254,MAXS_1610254,MINS_1610254,SPAN_1610254
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00,,,,,,,,,,,...,,0.0,,,,2.0,2.0,2.0,2.0,1.0
2014-01-01 01:00:00,,,,,,,,,,,...,,0.0,,,,2.0,2.0,2.0,2.0,1.0
2014-01-01 02:00:00,,,,,,,,,,,...,,0.0,,,,2.0,2.0,2.0,2.0,1.0
2014-01-01 03:00:00,,,,,,,,,,,...,,0.0,,,,2.0,2.0,2.0,2.0,1.0
2014-01-01 04:00:00,,,,,,,,,,,...,,0.0,,,,2.0,2.0,2.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,...,1.0,0.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0
2019-12-31 20:00:00,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,...,1.0,0.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0
2019-12-31 21:00:00,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,...,1.0,0.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0
2019-12-31 22:00:00,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,...,1.0,0.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0


<h4>整理标签</h4>

In [80]:
# Index the label table by TIME so it can be joined to the features on the index.
# Guarded so that re-running this cell is a no-op instead of a KeyError
# (after the first run TIME is the index and no longer a column).
if train_label.index.name != "TIME":
    train_label = train_label.set_index("TIME", drop=True)

<h4>合并标签和特征</h4>

In [81]:
# Attach the label column V to the features; rows are matched on the index
# and only index values present on both sides are kept.
train_data = train_rain_group_by_senid.merge(
    train_label[["V"]],
    left_index=True,
    right_index=True,
    how="inner",
)

<h3>训练模型</h3>

<h4>评测指标</h4>

In [82]:
def score(y_true, y_pred):
    """Return ``1 / (1 + RMSE)`` for the given targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        A value in (0, 1]; 1 means the predictions match the targets exactly.
    """
    root_mean_sq_err = np.sqrt(mean_squared_error(y_true, y_pred))
    return 1 / (1 + root_mean_sq_err)

<h4>lightgbm模型</h4>

In [83]:
# LightGBM settings handed to lgb.train() in the cross-validation cell.
params_lgb = dict(
    num_boost_round=1000,
    learning_rate=0.02,
    boosting_type="gbdt",
    objective="mse",
    metric="rmse",
    num_leaves=127,
    verbose=-1,
    seed=42,
    n_jobs=-1,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=4,
    early_stopping_round=100,
)
# Collects one trained LightGBM booster per fold.
model_lgb = list()

<h4>xgboost模型</h4>

In [84]:
# XGBoost settings used by the cross-validation cell. "num_boost_round" and
# "early_stopping_rounds" are also read from this dict by key there.
params_xgb = dict(
    num_boost_round=500,
    learning_rate=0.02,
    booster="gbtree",
    objective="reg:squarederror",
    eval_metric="rmse",
    max_leaves=127,
    verbosity=1,
    seed=42,
    nthread=-1,
    colsample_bytree=0.6,
    subsample=0.7,
    early_stopping_rounds=100,
)
# Collects one trained XGBoost booster per fold.
model_xgb = list()

<h4>交叉验证</h4>

In [85]:
# 5-fold cross-validation of a LightGBM + XGBoost average.
# NOTE(review): shuffle=True mixes time-adjacent rows between train and validation
# folds; if the local score looks optimistic, a time-ordered split may be worth trying.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

# Missing features and missing labels are both replaced by 0 before training.
x = train_data.fillna(0).astype(np.float32)
y = x.pop("V")

# xgb.train() takes these two settings as keyword arguments. Left inside the
# params dict they are ignored and produce a "Parameters ... are not used" warning,
# so the booster gets a copy of the dict without them.
xgb_train_only_keys = ("num_boost_round", "early_stopping_rounds")
xgb_booster_params = {k: v for k, v in params_xgb.items() if k not in xgb_train_only_keys}

# Make sure the output directory exists before the first save_model() call.
os.makedirs("../models", exist_ok=True)

# Start from empty lists so re-running this cell does not stack duplicate boosters.
model_lgb.clear()
model_xgb.clear()

mse = 0
for fold, (train_index, val_index) in enumerate(kfold.split(x, y)):
    logging.info(f'############ fold: {fold} ###########')
    x_train, x_val = x.iloc[train_index], x.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # --- LightGBM ---
    lgb_trainset = Dataset(x_train, y_train)
    lgb_valset = Dataset(x_val, y_val)
    model = lgb.train(
        params_lgb,
        lgb_trainset,
        valid_sets=[lgb_trainset, lgb_valset],
        callbacks=[lgb.log_evaluation(1000)],
    )
    model.save_model("../models/lgb_%d.txt" % fold)
    model_lgb.append(model)
    lgb_pred = Series(model.predict(x_val, num_iteration=model.best_iteration), index=y_val.index).fillna(0)

    # --- XGBoost ---
    xgb_trainset = DMatrix(x_train, y_train, enable_categorical=True, nthread=-1)
    xgb_valset = DMatrix(x_val, y_val, enable_categorical=True, nthread=-1)
    model = xgb.train(
        xgb_booster_params,
        xgb_trainset,
        num_boost_round=params_xgb["num_boost_round"],
        evals=[(xgb_trainset, 'train'), (xgb_valset, 'eval')],
        early_stopping_rounds=params_xgb["early_stopping_rounds"],
        verbose_eval=1000,
    )
    model.save_model("../models/xgb_%d.json" % fold)
    model_xgb.append(model)
    # best_iteration is 0-based and iteration_range is half-open, hence the +1.
    # (Booster.best_ntree_limit no longer exists in XGBoost >= 2.0.)
    xgb_pred = Series(
        model.predict(xgb_valset, iteration_range=(0, model.best_iteration + 1)),
        index=y_val.index,
    ).fillna(0)

    # Blend the two models with equal weights and accumulate the fold MSE.
    val_pred = (lgb_pred + xgb_pred) / 2
    mse += mean_squared_error(y_val.fillna(0), val_pred)

# RMSE from the mean of the per-fold MSEs, mapped to the 1 / (1 + RMSE) metric.
rmse = np.sqrt(mse / kfold.n_splits)
# Stored under its own name so the score() function defined earlier is not overwritten.
local_score = 1 / (1 + rmse)
logging.info(f"--------------本地分数 {local_score}--------------")

2025-03-04 10:11:32,581 : INFO : ############ fold: 0 ###########


[1000]	training's rmse: 454.074	valid_1's rmse: 681.159
Parameters: { "early_stopping_rounds", "num_boost_round" } are not used.

[0]	train-rmse:2757.43373	eval-rmse:2749.91546
[499]	train-rmse:739.10956	eval-rmse:827.97103


2025-03-04 10:13:12,353 : INFO : ############ fold: 1 ###########


[1000]	training's rmse: 453.181	valid_1's rmse: 675.713
Parameters: { "early_stopping_rounds", "num_boost_round" } are not used.

[0]	train-rmse:2746.91308	eval-rmse:2787.50116
[499]	train-rmse:740.63957	eval-rmse:809.90219


2025-03-04 10:14:58,161 : INFO : ############ fold: 2 ###########


[1000]	training's rmse: 454.601	valid_1's rmse: 681.448
Parameters: { "early_stopping_rounds", "num_boost_round" } are not used.

[0]	train-rmse:2750.46100	eval-rmse:2775.75243
[499]	train-rmse:738.78601	eval-rmse:836.30121


2025-03-04 10:16:47,366 : INFO : ############ fold: 3 ###########


[1000]	training's rmse: 453.032	valid_1's rmse: 677.088
Parameters: { "early_stopping_rounds", "num_boost_round" } are not used.

[0]	train-rmse:2766.98441	eval-rmse:2708.42582
[499]	train-rmse:743.27528	eval-rmse:824.46869


2025-03-04 10:18:35,689 : INFO : ############ fold: 4 ###########


[1000]	training's rmse: 454.539	valid_1's rmse: 669.662
Parameters: { "early_stopping_rounds", "num_boost_round" } are not used.

[0]	train-rmse:2755.11470	eval-rmse:2756.25535
[499]	train-rmse:740.53061	eval-rmse:819.78828


2025-03-04 10:20:27,542 : INFO : --------------本地分数 0.0013756916315924193--------------


<h3>探索性数据分析</h3>

In [36]:
# Print the index range, column names, dtypes and memory usage of the station-readings table.
train_rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3252327 entries, 0 to 3252326
Data columns (total 20 columns):
 #   Column  Dtype         
---  ------  -----         
 0   NAME    object        
 1   SENID   int64         
 2   TIME    datetime64[ns]
 3   V       float64       
 4   AVGV    float64       
 5   MAXV    float64       
 6   MAXT    object        
 7   MINV    float64       
 8   MINT    object        
 9   S       float64       
 10  AVGS    float64       
 11  MAXS    float64       
 12  MINS    float64       
 13  SPAN    int64         
 14  year    int32         
 15  month   int32         
 16  day     int32         
 17  hour    int32         
 18  minute  int32         
 19  second  int32         
dtypes: datetime64[ns](1), float64(8), int32(6), int64(2), object(3)
memory usage: 421.8+ MB


In [37]:
# Rows of station 210254 recorded on 2015-09-26, used as a small sample to inspect.
train_rain_one = train_rain.loc[
    (train_rain["SENID"] == 210254)
    & (train_rain["year"] == 2015)
    & (train_rain["month"] == 9)
    & (train_rain["day"] == 26)
]

In [38]:
# Dimensions of the selected slice as (rows, columns).
train_rain_one.shape

(1, 20)

In [39]:
# Show the selected slice in chronological order (the sorted copy is only displayed, not stored).
train_rain_one.sort_values(by="TIME")

Unnamed: 0,NAME,SENID,TIME,V,AVGV,MAXV,MAXT,MINV,MINT,S,AVGS,MAXS,MINS,SPAN,year,month,day,hour,minute,second
15828,下洋坂雨量,210254,2015-09-26 03:00:00,1.0,1.0,1.0,2015/9/26 3:00:00,1.0,2015/9/26 3:00:00,1.0,1.0,2.0,2.0,1,2015,9,26,3,0,0
