In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import time
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold

import warnings
warnings.filterwarnings('ignore')

LABEL = '装载量'

df_train = pd.read_csv('./data/船舶装卸货量预测-训练集-20240611.csv', encoding='gbk')
df_test = pd.read_csv('./data/船舶装卸货量预测-测试集X-20240611.csv', encoding='gbk')

df = pd.concat([df_train, df_test])
df['离泊时间'] = df['离泊时间'].replace({' None': np.nan}).astype(float)
df['time_diff'] = df['离泊时间'] - df['进泊时间']

df['进泊时间'] = pd.to_datetime(df['进泊时间'], unit='s')
df['离泊时间'] = pd.to_datetime(df['离泊时间'], unit='s')

In [2]:
for f in ['进泊时间', '离泊时间']:
    # df[f+'_year'] = df[f].dt.year
    # df[f+'_month'] = df[f].dt.month
    # df[f+'_day'] = df[f].dt.day
    df[f+'_hour'] = df[f].dt.hour
    df[f+'_dayofweek'] = df[f].dt.dayofweek
    df[f+'_quarter'] = df[f].dt.quarter

df['A_le'] = df['船舶类型代码A'].factorize()[0]
df['B_le'] = df['船舶类型代码B'].factorize()[0]

df['AB_le'] = (df['船舶类型代码A'] + '_' + df['船舶类型代码B']).factorize()[0]
df['面积'] = df['船长'] * df['船宽']

for num_f in ['载重吨', 'time_diff',]:
    for cat_f in ['A_le', 'B_le','AB_le']:
        df[cat_f + '_' + num_f + '_mean'] = df.groupby(cat_f)[num_f].transform('mean')
        df[cat_f + '_' + num_f + '_std'] = df.groupby(cat_f)[num_f].transform('std')
        df[cat_f + '_' + num_f + '_max'] = df.groupby(cat_f)[num_f].transform('max')
        df[cat_f + '_' + num_f + '_min'] = df.groupby(cat_f)[num_f].transform('min')

        
        df[num_f + '_' + cat_f + '_mean'] = df.groupby(num_f)[cat_f].transform('mean')
        df[num_f + '_' + cat_f  + '_std'] = df.groupby(num_f)[cat_f].transform('std')
        df[num_f + '_' + cat_f  + '_max'] = df.groupby(num_f)[cat_f].transform('max')
        df[num_f + '_' + cat_f + '_min'] = df.groupby(num_f)[cat_f].transform('min')



df['loc0'] = df['泊位位置'].map(lambda x: float(x.split(' ')[0]))
df['loc1'] = df['泊位位置'].map(lambda x: float(x.split(' ')[1]))

# for f in ['载重吨','time_diff']:
#     df['进泊_MDH_{}_medi'.format(f)] = df.groupby(['进泊时间_year','进泊时间_month','进泊时间_day'])[f].transform('median')
    # df['进泊_MDH_{}_mean'.format(f)] = df.groupby(['进泊时间_year','进泊时间_month','进泊时间_day'])[f].transform('mean')
#     df['进泊_MDH_{}_max'.format(f)] = df.groupby(['进泊时间_year','进泊时间_month','进泊时间_day'])[f].transform('max')


df['船舶ID_航次ID_count'] = df.groupby('船舶ID')['航次ID'].transform('count')
df['船舶ID_船舶类型代码A_count'] = df.groupby('船舶ID')['船舶类型代码A'].transform('count')
df['船舶ID_船舶类型代码B_count'] = df.groupby('船舶ID')['船舶类型代码B'].transform('count')

df['船舶ID_group_count'] = df['船舶ID'].map(df['船舶ID'].value_counts())
df['船宽_group_count'] = df['船宽'].map(df['船宽'].value_counts())
df['载重吨_group_count'] = df['载重吨'].map(df['载重吨'].value_counts())

df['longitude/Latitude'] = df['loc0']/df['loc1']


#替换穷值
df = df.replace([-np.inf,np.inf],0)

df_train = df[df[LABEL].notna()]
df_test = df[df[LABEL].isna()]

feats = [f for f in df_test if f not in [LABEL, '航次ID', '船舶类型代码A', '船舶类型代码B', '泊位位置','进泊时间', '离泊时间' ,'进泊时间_year','离泊时间_year']]

print(df_train[feats].shape, df_test[feats].shape)

(2107, 72) (903, 72)


In [3]:
#lgb
def lgb_train():
    params_lgb = {'learning_rate': 0.03,'boosting_type': 'gbdt','objective': 'rmse','metric': 'rmse',
                  'num_leaves': 32,'verbose': -1,'seed': 2222,'n_jobs': -1,'feature_fraction': 0.8,
                  'bagging_fraction': 0.8,'bagging_freq': 4}
    fold_num = 10
    seeds = [22222]
    lgb_oof = np.zeros(len(df_train))
    importance = 0
    pred_y = pd.DataFrame()

    for seed in seeds:
        # kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
        kf = KFold(n_splits=fold_num, shuffle=False)
        for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
            print('-----------', fold)
            train = lgb.Dataset(df_train.loc[train_idx, feats],
                                (df_train.loc[train_idx, LABEL]))
            val = lgb.Dataset(df_train.loc[val_idx, feats],
                            (df_train.loc[val_idx, LABEL]))
            model = lgb.train(params_lgb, train, valid_sets=[val], num_boost_round=20000,
                            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)])

            lgb_oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
            pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
            importance += model.feature_importance(importance_type='gain') / fold_num
    #result        
    df_train['lgb_oof'] = lgb_oof
    score = mse(df_train[LABEL], df_train['lgb_oof'], squared=False)
    print('train_result',score)
    feats_importance = pd.DataFrame()
    feats_importance['name'] = feats
    feats_importance['importance'] = importance
    feats_importance.sort_values('importance', ascending=False)[:30]

    return lgb_oof,pred_y.mean(axis=1).values,feats_importance

lgb_oof,lgb_pre,lgb_imp = lgb_train()

----------- 0
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[67]	valid_0's rmse: 19422.6
----------- 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[100]	valid_0's rmse: 25189.8
----------- 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[179]	valid_0's rmse: 20575.4
----------- 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[150]	valid_0's rmse: 21410.9
----------- 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[148]	valid_0's rmse: 18828.3
----------- 5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[391]	valid_0's rmse: 19557.6
----------- 6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[412]	valid_0's rmse: 21120.5
----------- 7


In [4]:
#xgb
def xgb_train():
    params_xgb = {'booster': 'gbtree','eval_metric': 'rmse','min_child_weight': 5,'max_depth': 8,
                  'subsample': 0.5,'colsample_bytree': 0.5,'eta': 0.03,'seed': 2222,'nthread': 36,
                  'silent': True}
    fold_num = 10
    seeds = [22222]
    xgb_oof = np.zeros(len(df_train))
    # importance = 0
    pred_y = pd.DataFrame()

    for seed in seeds:
        # kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
        kf = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
        for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
            print('-----------', fold)

            
            train = xgb.DMatrix(df_train.loc[train_idx, feats] , label=df_train.loc[train_idx, LABEL], missing=np.nan)
            val = xgb.DMatrix(df_train.loc[val_idx, feats] , label=df_train.loc[val_idx, LABEL], missing=np.nan)
            test  = xgb.DMatrix(df_test[feats], missing=np.nan)
            watchlist = [(train, 'train'),(val, 'eval')]

            model = xgb.train(params_xgb,train,num_boost_round=20000, evals=watchlist, verbose_eval=500,early_stopping_rounds=100)

            xgb_oof[val_idx] += model.predict(val,iteration_range=(0, model.best_iteration)) / len(seeds)
            pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(test,iteration_range=(0, model.best_iteration))
            # importance += model.feature_importance() / fold_num
    #result        
    df_train['xgb_oof'] = xgb_oof
    score = mse(df_train[LABEL], df_train['xgb_oof'], squared=False)
    print('train_result',score)
    # feats_importance = pd.DataFrame()
    # feats_importance['name'] = feats
    # feats_importance['importance'] = importance
    # feats_importance.sort_values('importance', ascending=False)[:30]
    
    return xgb_oof,pred_y.mean(axis=1).values

xgb_oof,xgb_pre = xgb_train()

----------- 0
[0]	train-rmse:52840.89103	eval-rmse:46838.40100


[229]	train-rmse:9045.06598	eval-rmse:17790.06501
----------- 1
[0]	train-rmse:52007.68892	eval-rmse:54903.05306
[257]	train-rmse:8273.89844	eval-rmse:20362.02563
----------- 2
[0]	train-rmse:52828.90204	eval-rmse:46831.35470
[178]	train-rmse:10211.11529	eval-rmse:19277.89645
----------- 3
[0]	train-rmse:51980.61709	eval-rmse:54659.29287
[329]	train-rmse:7103.23131	eval-rmse:19357.74870
----------- 4
[0]	train-rmse:52535.40889	eval-rmse:49435.10756
[422]	train-rmse:5828.52569	eval-rmse:14705.14265
----------- 5
[0]	train-rmse:52187.97902	eval-rmse:52598.36257
[182]	train-rmse:9827.82907	eval-rmse:23463.67122
----------- 6
[0]	train-rmse:51860.53083	eval-rmse:56259.81905
[316]	train-rmse:7515.73780	eval-rmse:20845.69843
----------- 7
[0]	train-rmse:52111.73201	eval-rmse:53415.59745
[316]	train-rmse:7193.65745	eval-rmse:19824.68637
----------- 8
[0]	train-rmse:52310.86945	eval-rmse:51460.36641
[356]	train-rmse:6732.96587	eval-rmse:19487.80900
----------- 9
[0]	train-rmse:51914.35142	eval

In [20]:
#平均融合
score = mse(df_train[LABEL], (df_train['xgb_oof']+df_train['lgb_oof'])/2, squared=False)
print('平均加权：',score)
df_train['oof'] = (df_train['xgb_oof']+df_train['lgb_oof'])/2

平均加权： 19426.332430943


In [10]:
#对装载量较大的船舶进行独立训练
top_50_id = list(set(df_train.sort_values(by=['装载量','船舶ID',],ascending=False)[:50]['船舶ID']))
df_train_copy = df_train.copy()
lgb_model = lgb.LGBMRegressor(verbose=-1,n_estimators=1200,learning_rate=0.18).fit(df_train_copy.loc[df_train_copy['船舶ID'].isin(top_50_id),feats],df_train_copy.loc[df_train_copy['船舶ID'].isin(top_50_id),'装载量'])
pre1 = lgb_model.predict(df_train_copy.loc[df_train_copy['船舶ID'].isin(top_50_id),feats])
df_train.loc[df_train['船舶ID'].isin(top_50_id),'oof'] = pre1
mse(df_train[LABEL], df_train['oof'], squared=False)

14279.37449446108

In [11]:
#对异常LABEL进行后处理
import matplotlib.pyplot as plt

outlier_df = df_train[['载重吨','装载量','oof']].copy()
outlier_index = outlier_df.loc[outlier_df['载重吨']<outlier_df['oof']].index
scores = []; Weights = []
best_m = 0
best_rmse=np.inf
for m in np.arange(0.1,0.9,0.01):
    df_train.loc[outlier_index,'oof'] = outlier_df.loc[outlier_df['载重吨']<outlier_df['oof'],'载重吨']*m
    rmse_score = mse(df_train[LABEL], df_train['oof'], squared=False)
    scores.append(rmse_score)
    Weights.append(m)
    if rmse_score < best_rmse:
        best_rmse = rmse_score
        best_m = m
#替换
df_train.loc[outlier_index,'oof'] = outlier_df.loc[outlier_df['载重吨']<outlier_df['oof'],'载重吨']*best_m
# PLOT
plt.figure(figsize=(20,5))
plt.plot(Weights,scores,'-o',color='blue')
plt.scatter([best_m], [best_rmse], color='blue', s=300, alpha=1)
plt.xlabel('Threshold',size=14)
plt.ylabel('Validation F1 Score',size=14)
plt.title(f'Weights vs. rmse with Best score = {best_rmse:.3f} at Best Threshold = {best_m:.3}',size=18)
plt.show()

In [12]:
#对gap较大船舶进行后处理
rmse_list = []
for i in range(len(df_train)):
    score_i = mse([df_train.loc[i,LABEL]], [df_train.loc[i,'oof']], squared=False)
    rmse_list.append(score_i)

rmse_df = pd.DataFrame({'rmse':rmse_list})

df_train_copy = df_train.copy()
gap_id = list(set(df_train.loc[rmse_df.sort_values(by='rmse',ascending=False)[:160].index]['船舶ID']))

scores = []; Weights = []
best_m3 = 0
best_rmse3=np.inf
for m in np.arange(0.8,1.2,0.01):
    df_train.loc[df_train['船舶ID'].isin(gap_id),'oof'] =df_train_copy.loc[df_train_copy['船舶ID'].isin(gap_id),'oof']*m
    rmse_score = mse(df_train[LABEL], df_train['oof'], squared=False)
    scores.append(rmse_score)
    Weights.append(m)
    if rmse_score < best_rmse3:
        best_rmse3 = rmse_score
        best_m3 = m

#替换
df_train.loc[df_train['船舶ID'].isin(gap_id),'oof'] =df_train_copy.loc[df_train_copy['船舶ID'].isin(gap_id),'oof']*best_m3

# PLOT
plt.figure(figsize=(20,5))
plt.plot(Weights,scores,'-o',color='blue')
plt.scatter([best_m3], [best_rmse3], color='blue', s=300, alpha=1)
plt.xlabel('Threshold',size=14)
plt.ylabel('Validation F1 Score',size=14)
plt.title(f'Weights vs. rmse with Best score = {best_rmse3:.3f} at Best Weights = {best_m3:.3}',size=18)
plt.show()

In [None]:
df_test[LABEL] = (xgb_pre+lgb_pre)/2
#对装载量较大的船舶进行独立预测
lgb_test_pre = lgb_model.predict(df_test.loc[df_test['船舶ID'].isin(top_50_id),feats])
df_test.loc[df_test['船舶ID'].isin(top_50_id),LABEL] = lgb_test_pre
# #对异常LABEL进行后处理
df_test.loc[df_test['载重吨']<df_test[LABEL],LABEL] = df_test['载重吨']*best_m
#对量级较大的船舶进行后处理
Large_cap_id = list(set(list(df_train[df_train['装载量']>=38239.000000]['船舶ID'])))
df_test.loc[df_test['船舶ID'].isin(Large_cap_id),LABEL] = df_test.loc[df_test['船舶ID'].isin(Large_cap_id),LABEL]*1.055
#对gap较大的船舶进行后处理
df_test.loc[df_test['船舶ID'].isin(gap_id),LABEL] =df_test.loc[df_test['船舶ID'].isin(gap_id),LABEL]*best_m3
#保存
df_test[LABEL].to_csv('./sub/lgb_'+time.strftime('%Y%m%d-%H%M%S')+'_%d.txt'%score, index=False, header=None)