In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
# from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import gc

import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the labelled training data and the unlabelled evaluation data.
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('evaluation_public.csv')

# Per-row share of missing values, computed over every column except the first.
# NOTE(review): for `train` this slice presumably also covers Label1/Label2,
# which `test` would not have, so the two ratios may not be computed over the
# same set of columns -- confirm against the CSV schemas.
for frame in (train, test):
    frame['isnull'] = frame.iloc[:, 1:].isnull().mean(axis=1)

# Stack train on top of test so the time features are derived once for both.
data = pd.concat([train, test], ignore_index=True)

# Calendar / clock features taken from the parsed timestamp.
stamp = pd.to_datetime(data['time'])
data['time'] = stamp
data['hour'] = stamp.dt.hour
data['minute'] = stamp.dt.minute
data['weekday'] = stamp.dt.weekday
# Minutes elapsed since midnight.
data['ts'] = data['hour'] * 60 + data['minute']

In [3]:
# Split the combined frame back into train and test. `test` is never filtered,
# so its length is stable and the train row count can be derived from it
# instead of being hard-coded; this also keeps the cell safe to re-run.
n_train = len(data) - len(test)
train = data.iloc[:n_train].reset_index(drop=True)
test = data.iloc[n_train:].reset_index(drop=True)

# Keep only training rows whose per-row missing ratio is similar to what the
# test set shows.
train = train[train['isnull'] < 0.15].reset_index(drop=True)

# Models are fitted on log1p-transformed targets.
train['Label1_log'] = np.log1p(train['Label1'])
train['Label2_log'] = np.log1p(train['Label2'])

# An earlier experiment removed the following seven features based on feature
# importance, adversarial validation and feature correlation; the selection may
# have been wrong or incomplete, so it is kept here only for reference:
#   B_QY_ORP, JS_TN, CS_SW, MCCS_NH4, N_HYC_JS_DO, MCCS_NO3, JS_SW

# Columns that are never used as model inputs.
non_feature_cols = ['time', 'Label1', 'Label2', 'Label1_log', 'Label2_log',
                    'hour', 'weekday', 'isnull']
# Measurements that exist once per prefix (B_* and N_*); each model only sees
# the columns carrying its own prefix.
prefixed_cols = ['HYC_NH4', 'HYC_XD', 'HYC_MLSS', 'HYC_JS_DO', 'HYC_DO',
                 'CS_MQ_SSLL', 'QY_ORP']

# feas1 drops the N_* columns (inputs of the Label1 / "B" model),
# feas2 drops the B_* columns (inputs of the Label2 / "N" model).
excluded_for_b = set(non_feature_cols) | {f'N_{c}' for c in prefixed_cols}
excluded_for_n = set(non_feature_cols) | {f'B_{c}' for c in prefixed_cols}
feas1 = [f for f in train.columns if f not in excluded_for_b]
feas2 = [f for f in train.columns if f not in excluded_for_n]

# Rows without both targets cannot be used for training.
train = train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)

# Explicit copies: ratio features are added to these frames later, and writing
# into a slice of `train` / `test` would be chained assignment.
x_trainB = train[feas1].copy()
x_trainN = train[feas2].copy()
x_testB = test[feas1].copy()
x_testN = test[feas2].copy()



# Ratio features: inlet/outlet ratios plus a few prefix-specific ratios.
def add_ratio_feats(df, type_='B'):
    """Return a copy of ``df`` with nine ratio columns appended.

    The input frame is left untouched, so the function can safely be applied
    to a column slice of a larger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``JS_NH3``, ``CS_NH3``, ``JS_TN``, ``CS_TN``, ``JS_LL``,
        ``CS_LL``, ``MCCS_NH4``, ``MCCS_NO3``, ``JS_COD``, ``CS_COD``,
        ``JS_SW``, ``CS_SW`` and the prefixed columns ``{type_}_HYC_JS_DO``,
        ``{type_}_HYC_DO`` and ``{type_}_CS_MQ_SSLL``.
    type_ : str, default 'B'
        Column prefix selecting which prefixed columns are used
        (the notebook calls this with 'B' and 'N').

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the ratio columns.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    eps = 1e-3  # keeps the denominators away from zero
    out = df.copy()

    out['JS_CS_NH3_ratio'] = out['JS_NH3'] / (out['CS_NH3'] + eps)
    out['JS_CS_TN_ratio'] = out['JS_TN'] / (out['CS_TN'] + eps)
    out['JS_CS_LL_ratio'] = out['JS_LL'] / (out['CS_LL'] + eps)
    out['MCCS_NH4_NH3_ratio'] = out['MCCS_NH4'] / (out['CS_NH3'] + eps)
    out['MCCS_NO3_NH3_ratio'] = out['MCCS_NO3'] / (out['CS_NH3'] + eps)
    out['JS_CS_COD_ratio'] = out['JS_COD'] / (out['CS_COD'] + eps)
    out['JS_CS_SW_ratio'] = out['JS_SW'] / (out['CS_SW'] + eps)
    out['HYC_DO_ratio'] = out[f'{type_}_HYC_JS_DO'] / (out[f'{type_}_HYC_DO'] + eps)
    out['CS_MQ_LL_ratio'] = out[f'{type_}_CS_MQ_SSLL'] / (out['CS_LL'] + eps)

    return out
# Append the ratio features to every design matrix, using the prefix that
# matches the model each matrix feeds.
x_trainB, x_testB = (add_ratio_feats(frame, type_='B') for frame in (x_trainB, x_testB))
x_trainN, x_testN = (add_ratio_feats(frame, type_='N') for frame in (x_trainN, x_testN))

In [4]:
def lgb_model(clf, train_x, train_y, test_x, folds=10, seed=2222):
    """Train a LightGBM regressor with shuffled K-fold CV and average the test predictions.

    Parameters
    ----------
    clf : module
        The ``lightgbm`` module (or an object exposing ``Dataset`` and ``train``).
    train_x : pandas.DataFrame
        Training features.
    train_y : array-like
        Training targets, aligned with ``train_x`` by position. The CV score
        applies ``np.expm1`` to targets and predictions, i.e. the targets are
        expected to be ``log1p``-transformed.
    test_x : pandas.DataFrame
        Features to predict; must have the same columns as ``train_x``.
    folds : int, default 10
        Number of CV folds.
    seed : int, default 2222
        Seed for the fold shuffling.

    Returns
    -------
    oof_pred : numpy.ndarray
        Out-of-fold predictions for every training row (log scale).
    test_pred : numpy.ndarray
        Mean of the per-fold test predictions (log scale).
    Feass : pandas.DataFrame
        Per-fold feature importances, columns ``feas`` and ``sorce``.
    mean_score : float
        Mean over folds of the RMSE measured after ``expm1``.
    """
    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'mse',
        'metric': 'mse',
        'verbose': -1,
        'seed': 2020,
        'n_jobs': -1,
        'max_depth': -1,
        'min_child_weight': 4,
        'num_leaves': 2 ** 4,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
    }

    # LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds`
    # keyword arguments of `train`; prefer the callback API when it exists and
    # fall back to the keywords on older releases.
    use_callbacks = hasattr(clf, 'early_stopping') and hasattr(clf, 'log_evaluation')

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    feature_names = train_x.columns.tolist()
    # KFold yields positional indices; a plain array guarantees positional
    # lookup regardless of the index carried by `train_y`.
    y_values = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    cv_scores = []
    test_preds = []
    fold_importances = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        if use_callbacks:
            # Fresh callbacks per fold so no early-stopping state is shared.
            train_kwargs = {'callbacks': [clf.early_stopping(200), clf.log_evaluation(1000)]}
        else:
            train_kwargs = {'verbose_eval': 1000, 'early_stopping_rounds': 200}

        model = clf.train(params, train_matrix, num_boost_round=20000,
                          valid_sets=[train_matrix, valid_matrix],
                          categorical_feature=[], **train_kwargs)

        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_preds.append(model.predict(test_x, num_iteration=model.best_iteration))

        importance = model.feature_importance()
        fold_importances.append(pd.DataFrame({'feas': feature_names, 'sorce': importance}))
        # Show the 20 most important features of this fold.
        print(list(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))[:20])

        oof_pred[valid_index] = val_pred
        # RMSE on the original (expm1) target scale.
        cv_scores.append(np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(val_pred))))

        print(cv_scores)

    # Average the fold models' test predictions.
    test_pred = sum(test_preds) / len(test_preds)
    Feass = pd.concat(fold_importances, axis=0)

    print("s_scotrainre_list:" , cv_scores)
    print("s_score_mean:" , np.mean(cv_scores))
    print("s_score_std:", np.std(cv_scores))

    return oof_pred, test_pred, Feass, np.mean(cv_scores)

In [5]:
# Fit one model per target. Results are bound with explicit names instead of
# being written into `locals()`, which only works at module level and hides
# the variables from readers and linters.
# Model 0: B-prefixed features -> Label1 (log scale).
lgb_train0, lgb_test0, Feass0, scores0 = lgb_model(lgb, x_trainB, train['Label1_log'], x_testB)
# Model 1: N-prefixed features -> Label2 (log scale).
lgb_train1, lgb_test1, Feass1, scores1 = lgb_model(lgb, x_trainN, train['Label2_log'], x_testN)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[1000]	training's l2: 0.00935668	valid_1's l2: 0.015895
[2000]	training's l2: 0.0064533	valid_1's l2: 0.0143328
[3000]	training's l2: 0.00499404	valid_1's l2: 0.0135379
[4000]	training's l2: 0.00406647	valid_1's l2: 0.0131103
[5000]	training's l2: 0.00341676	valid_1's l2: 0.0128115
[6000]	training's l2: 0.0029235	valid_1's l2: 0.012601
[7000]	training's l2: 0.00253126	valid_1's l2: 0.0124498
[8000]	training's l2: 0.00221772	valid_1's l2: 0.012304
[9000]	training's l2: 0.00195561	valid_1's l2: 0.0122112
[10000]	training's l2: 0.00174002	valid_1's l2: 0.0121135
[11000]	training's l2: 0.0015549	valid_1's l2: 0.0120804
Early stopping, best iteration is:
[10999]	training's l2: 0.00155507	valid_1's l2: 0.0120803
[('ts', 9458), ('B_HYC_DO', 8824), ('MCCS_NH4', 8076), ('MCCS_NO3', 7919), ('B_HYC_MLSS', 7391), ('B_QY_ORP', 7321), ('JS_CS_SW_ratio', 7149), ('J

In [6]:
# Feature importance: mean importance per feature across folds, sorted from
# most to least important, one table per model.
tuple(
    fold_imp.groupby(['feas'])['sorce'].mean().sort_values(ascending=False).reset_index()
    for fold_imp in (Feass0, Feass1)
)

(                  feas   sorce
 0                   ts  9268.0
 1             B_HYC_DO  8455.5
 2             MCCS_NH4  7842.1
 3             MCCS_NO3  7793.7
 4             B_QY_ORP  7342.5
 5           B_HYC_MLSS  7160.1
 6       JS_CS_SW_ratio  6882.8
 7               minute  6558.0
 8                JS_SW  6487.2
 9          B_HYC_JS_DO  6465.0
 10               JS_LL  6319.4
 11        B_CS_MQ_SSLL  6299.7
 12            B_HYC_XD  6150.7
 13        HYC_DO_ratio  5922.6
 14  MCCS_NH4_NH3_ratio  5914.4
 15               CS_SW  5094.7
 16  MCCS_NO3_NH3_ratio  4695.4
 17      JS_CS_LL_ratio  4311.0
 18               CS_TN  4288.5
 19              CS_COD  4263.4
 20              JS_NH3  3981.2
 21              JS_COD  3952.6
 22               CS_LL  3939.5
 23      JS_CS_TN_ratio  3721.0
 24               JS_TN  3409.0
 25     JS_CS_COD_ratio  3404.2
 26      CS_MQ_LL_ratio  3183.0
 27     JS_CS_NH3_ratio  2665.0
 28              CS_NH3  2441.9
 29           B_HYC_NH4  2400.4,
       

In [7]:
# Offline (local CV) score: the two models' mean RMSE values are averaged and
# mapped through 1 / (1 + rmse) * 1000.
mean_rmse = (scores0 + scores1) / 2
1 / (1 + mean_rmse) * 1000

0.8317025106550375

In [8]:
# Fill the submission template with predictions converted back from the
# log1p scale, then write it out.
sub = pd.read_csv('sample_submission.csv').assign(
    Label1=np.expm1(lgb_test0),
    Label2=np.expm1(lgb_test1),
)
sub.to_csv('1.csv', index=False)