In [188]:
from pathlib import Path

import pandas as pd
import xgboost as xgb
import meteva as mem
from meteva.method import pc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [189]:
# Training callback that reports boosting progress through a tqdm bar.
class TQDMCallback(xgb.callback.TrainingCallback):
    """Advance a tqdm progress bar by one tick per boosting round.

    Parameters
    ----------
    num_boost_round : int
        Value used as the ``total`` of the progress bar.
    """

    def __init__(self, num_boost_round):
        super().__init__()
        self.num_boost_round = num_boost_round
        # Created in before_training; kept as None until then so the other
        # hooks never touch a missing attribute.
        self.pbar = None

    def before_training(self, model):
        """Open a fresh progress bar and hand the model back unchanged."""
        self.pbar = tqdm(total=self.num_boost_round, desc="Training")
        return model

    def after_iteration(self, model, epoch, evals_log):
        """Tick the bar once; always returns False (never requests a stop)."""
        if self.pbar is not None:
            self.pbar.update(1)
        return False

    def after_training(self, model):
        """Close the bar (if one was opened) and hand the model back unchanged."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None
        return model


In [190]:
def select_features(df, thus):
    """Pick the columns whose correlation with ``fire`` is strong enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``fire`` column; correlations are taken from
        ``df.corr()``.
    thus : float
        Minimum absolute correlation with ``fire`` for a column to be kept.

    Returns
    -------
    list of str
        The fixed columns ``['fire', 'date', 'year', 'month', 'day', 'lon',
        'lat']`` followed by the selected feature names, ordered from the
        strongest to the weakest absolute correlation.
    """
    base_cols = ['fire', 'date', 'year', 'month', 'day', 'lon', 'lat']

    # Absolute correlation of every column with the label.
    strength = df.corr()['fire'].abs()

    # Keep the strong ones and actually apply the descending sort; a stable
    # sort keeps ties in column order so the result is reproducible.
    # NaN correlations compare False and are therefore dropped.
    selected = strength[strength >= thus].sort_values(ascending=False, kind='stable')

    # Remove the always-included columns and de-duplicate while preserving
    # order (a set would scramble the order between runs).
    feats = list(dict.fromkeys(name for name in selected.index if name not in base_cols))
    print(f'corr>{thus}:{feats}')

    return base_cols + feats

In [191]:
def process_outlier(df, feats):
    """Remove rows that hold a three-sigma outlier in any of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feats : list of str
        Columns checked for outliers.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) in which no checked column lies
        more than three standard deviations away from that column's mean.
    """
    checked = df[feats[:]]

    # Per-column centre and allowed deviation (three standard deviations).
    center = checked.mean()
    margin = 3 * checked.std()
    lower = center - margin
    upper = center + margin

    # A cell is flagged when it falls strictly outside [lower, upper].
    flagged = (checked < lower) | (checked > upper)

    # Keep only the rows without a single flagged cell.
    keep = ~flagged.any(axis=1)
    return df[keep]

特征工程
---

In [192]:
# Load the raw fire-point samples.
data = pd.read_csv('fire_point_data.csv')

# Turn the timestamp into a standardized numeric feature. The datetime ->
# number cast is done explicitly instead of relying on StandardScaler to
# coerce datetime64 values, and the (n, 1) scaler output is flattened before
# it is written back into a single column.
data['date'] = pd.to_datetime(data['date'])
scaler = StandardScaler()
date_numeric = data['date'].astype('int64').to_numpy(dtype='float64').reshape(-1, 1)
data['date'] = scaler.fit_transform(date_numeric).ravel()

# Drop the 'Unnamed: 0' column (presumably a leftover CSV index — TODO confirm).
data = data.drop(columns=['Unnamed: 0'])
# One-hot encode the categorical 'area' column.
data = pd.get_dummies(data, columns=['area'])

feats_name = select_features(data, 0.1)
# NOTE(review): feats_name also contains 'fire' and any selected one-hot
# 'area_*' columns, so the 3-sigma filter below is applied to the label and to
# 0/1 indicator columns as well — confirm this is intended.
data = process_outlier(data, feats_name)

corr>0.1:['TEM_Max', 'area_辽宁省', 'area_黑龙江省', 'TEM_Min', 'area_吉林省', 'Alti']


In [193]:
# Year whose rows are held out for the inference / evaluation section.
HOLDOUT_YEAR = 2017

is_holdout = data['year'] == HOLDOUT_YEAR
# Take explicit copies so that adding columns to either frame later on does
# not operate on a slice of `data` (avoids SettingWithCopyWarning).
train_data = data[~is_holdout].copy()
pred_data = data[is_holdout].copy()

训练
---

In [194]:
# Target is the 'fire' column; every other column is used as a feature.
y = train_data['fire']
X = train_data.drop(columns=['fire'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,date,year,month,day,lon,lat,Alti,TEM_Max,TEM_Min,RHU_Min,...,DC,FWI,ISI,BUI,DSR,FFDI,ic,area_吉林省,area_辽宁省,area_黑龙江省
1285,-0.785566,2010,10,28,128.6092,48.6794,404.5,2.6,-6.7,29.0,...,338.42,22.77,12.86,36.63,6.87,0.38,9999.0,0,0,1
2952,1.216765,2015,7,19,122.2737,53.1638,438.5,32.4,7.2,25.0,...,481.95,60.59,20.62,184.18,38.85,0.8,33.0,0,0,1
1473,-0.599842,2011,4,6,121.7221,41.8346,167.8,14.8,9.3,21.0,...,117.21,106.84,81.62,85.74,106.03,0.82,49.0,0,1,0
2030,-0.162232,2012,4,17,120.5602,40.9935,45.447554,22.162376,1.092143,26.242468,...,436.909393,127.396797,88.642745,122.847838,145.211788,0.819448,45.190648,0,1,0
2897,1.214443,2015,7,17,121.6832,52.4246,438.5,33.2,8.1,16.0,...,463.13,63.22,22.45,174.15,41.89,0.8,42.0,0,0,1


In [195]:
# Wrap the train / test splits in DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster parameters.
params = {
    'objective': 'binary:hinge',
    'eval_metric': 'error',
    'eta': 0.1,
    'max_depth': 5,
}
es = xgb.callback.EarlyStopping(
    rounds=10,
    save_best=True,
    metric_name="error",
)

# Train with early stopping and a progress bar. The callback list is built
# once and is the one actually passed to xgb.train.
evals = [(dtrain, 'train'), (dtest, 'eval')]
num_boost_round = 100
callbacks = [es, TQDMCallback(num_boost_round)]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    callbacks=callbacks,
)

# Predict on the test split and turn the scores into 0/1 labels. The labels
# are computed here so the cell no longer depends on a variable left over
# from an earlier kernel session.
y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)

# NOTE(review): dtest is also the set watched in `evals`, so this accuracy is
# measured on data that influenced when training stopped.
accuracy = accuracy_score(y_test, y_pred_binary)
accuracy


  0%|                                                   | 0/100 [00:00<?, ?it/s][A

Training:   0%|                                         | 0/100 [00:00<?, ?it/s][A[A

[0]	train-error:0.21934	eval-error:0.20767
[1]	train-error:0.21934	eval-error:0.20767
[2]	train-error:0.21934	eval-error:0.20767
[3]	train-error:0.21934	eval-error:0.20767
[4]	train-error:0.21934	eval-error:0.20767
[5]	train-error:0.20416	eval-error:0.19808
[6]	train-error:0.17459	eval-error:0.16134
[7]	train-error:0.17659	eval-error:0.16613
[8]	train-error:0.17619	eval-error:0.16294
[9]	train-error:0.15821	eval-error:0.15176
[10]	train-error:0.14782	eval-error:0.14377
[11]	train-error:0.14503	eval-error:0.13898
[12]	train-error:0.14423	eval-error:0.13738
[13]	train-error:0.14263	eval-error:0.13578
[14]	train-error:0.13863	eval-error:0.13259
[15]	train-error:0.13464	eval-error:0.13419
[16]	train-error:0.12385	eval-error:0.12141
[17]	train-error:0.12066	eval-error:0.11981
[18]	train-error:0.11466	eval-error:0.11342
[19]	train-error:0.11506	eval-error:0.11502
[20]	train-error:0.10787	eval-error:0.10863
[21]	train-error:0.10228	eval-error:0.10863
[22]	train-error:0.09948	eval-error:0.1006



Training:  41%|████████████▋                  | 41/100 [00:00<00:00, 409.20it/s][A[A

[41]	train-error:0.04634	eval-error:0.08466
[42]	train-error:0.04195	eval-error:0.08147
[43]	train-error:0.03915	eval-error:0.07827
[44]	train-error:0.03396	eval-error:0.07827
[45]	train-error:0.03156	eval-error:0.07508
[46]	train-error:0.03076	eval-error:0.07029
[47]	train-error:0.03036	eval-error:0.07029
[48]	train-error:0.02837	eval-error:0.07029
[49]	train-error:0.02717	eval-error:0.06869
[50]	train-error:0.02477	eval-error:0.06709
[51]	train-error:0.02277	eval-error:0.06550
[52]	train-error:0.02237	eval-error:0.06709
[53]	train-error:0.01838	eval-error:0.06390
[54]	train-error:0.01798	eval-error:0.06070
[55]	train-error:0.01798	eval-error:0.05911
[56]	train-error:0.01758	eval-error:0.05911
[57]	train-error:0.01678	eval-error:0.05751
[58]	train-error:0.01478	eval-error:0.05751
[59]	train-error:0.01398	eval-error:0.05751
[60]	train-error:0.01318	eval-error:0.05911
[61]	train-error:0.01278	eval-error:0.05751
[62]	train-error:0.01278	eval-error:0.05431
[63]	train-error:0.01119	eval-er

Training:  78%|████████████████████████▏      | 78/100 [00:00<00:00, 443.04it/s]


0.950479233226837

In [196]:
model_path = 'save/fire_risk_model.xgb'

# Create the target folder first; a missing 'save/' directory would
# otherwise make the write below fail on a fresh checkout.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Persist the trained booster.
model.save_model(model_path)


推理过程
---

In [197]:
# Separate the features from the held-out rows. A prediction column written
# by an earlier run of this cell is removed as well, so re-running the cell
# does not feed 'fire_prediction' to the model as a feature.
X_pred = (
    pred_data
    .drop(columns='fire')
    .drop(columns='fire_prediction', errors='ignore')
)

# Wrap the features in a DMatrix.
dpred = xgb.DMatrix(X_pred)

# Predict with the in-memory booster `model` (no model file is loaded here).
predictions = model.predict(dpred)

# Convert the scores to 0/1 labels.
predictions_binary = (predictions > 0.5).astype(int)

# assign() returns a new frame, so no column is written into a slice.
pred_data = pred_data.assign(fire_prediction=predictions_binary)

pred_data.to_csv('result.csv')

In [200]:
# Observed labels and model predictions for the held-out rows.
obs, fo = pred_data['fire'], pred_data['fire_prediction']
obs_array, fo_array = obs.to_numpy(), fo.to_numpy()

# Score with meteva's `pc` (presumably the percent-correct score — TODO confirm).
accuracy = pc(obs_array, fo_array)
accuracy


0.6722222222222223