<h1>导入数据</h1>

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime
from tqdm import tqdm
from time import strftime, localtime
from scipy import stats
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from joblib import dump, load
from tensorflow.keras.backend import epsilon
from itertools import combinations
from imblearn.over_sampling import SMOTE, ADASYN

<h1>数据处理</h1>

<h2>数据预处理</h2>

<h3>conf_info数据预处理</h3>

In [4]:
cons_info = pd.read_csv("../data/训练组_比特币挖矿_档案明细（20211220）.csv", index_col=0, quotechar='"', encoding="gbk")
cons_info

Unnamed: 0,ID,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CANCEL_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,TMP_DATE,IS_FLAG
,,,,,,,,,,,,,,
1,329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005/6/18,,240.0,2005/6/18,非临时用电,,0
2,329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005/6/18,,240.0,2005/6/18,非临时用电,,0
3,329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005/6/18,,240.0,2005/6/18,非临时用电,,0
4,329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005/6/18,,240.0,2005/6/18,非临时用电,,0
5,329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005/6/18,,240.0,2005/6/18,非临时用电,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8836,2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019/12/2 16:12:00,,120.0,2019/12/2 16:12:00,非临时用电,,0
8837,2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,,2019/12/3 16:33:00,,120.0,2019/12/3 16:33:00,非临时用电,,0
8838,2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019/12/18 14:42:00,,120.0,2019/12/18 14:42:00,非临时用电,,0


In [5]:
cons_info = cons_info.set_index("ID").drop(columns=["CANCEL_DATE", "TMP_DATE"])
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,IS_FLAG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005/6/18,240.0,2005/6/18,非临时用电,0
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005/6/18,240.0,2005/6/18,非临时用电,0
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005/6/18,240.0,2005/6/18,非临时用电,0
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005/6/18,240.0,2005/6/18,非临时用电,0
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005/6/18,240.0,2005/6/18,非临时用电,0
...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019/12/2 16:12:00,120.0,2019/12/2 16:12:00,非临时用电,0
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,,2019/12/3 16:33:00,120.0,2019/12/3 16:33:00,非临时用电,0
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019/12/18 14:42:00,120.0,2019/12/18 14:42:00,非临时用电,0
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019/12/20 10:36:00,120.0,2019/12/20 10:36:00,非临时用电,0


In [6]:
cons_info["BUILD_DATE"] = pd.to_datetime(cons_info["BUILD_DATE"])
cons_info["LAST_CHK_DATE"] = pd.to_datetime(cons_info["LAST_CHK_DATE"])
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,IS_FLAG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019-12-02 16:12:00,120.0,2019-12-02 16:12:00,非临时用电,0
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,,2019-12-03 16:33:00,120.0,2019-12-03 16:33:00,非临时用电,0
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019-12-18 14:42:00,120.0,2019-12-18 14:42:00,非临时用电,0
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,,2019-12-20 10:36:00,120.0,2019-12-20 10:36:00,非临时用电,0


In [7]:
cons_info["SHIFT_NO"] = cons_info["SHIFT_NO"].fillna(0)
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,IS_FLAG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0
...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-02 16:12:00,120.0,2019-12-02 16:12:00,非临时用电,0
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,0.0,2019-12-03 16:33:00,120.0,2019-12-03 16:33:00,非临时用电,0
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-18 14:42:00,120.0,2019-12-18 14:42:00,非临时用电,0
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-20 10:36:00,120.0,2019-12-20 10:36:00,非临时用电,0


In [8]:
cons_info["now"] = "2021-12-31"
cons_info["now"] = pd.to_datetime(cons_info["now"])
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,IS_FLAG,now
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-02 16:12:00,120.0,2019-12-02 16:12:00,非临时用电,0,2021-12-31
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,0.0,2019-12-03 16:33:00,120.0,2019-12-03 16:33:00,非临时用电,0,2021-12-31
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-18 14:42:00,120.0,2019-12-18 14:42:00,非临时用电,0,2021-12-31
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-20 10:36:00,120.0,2019-12-20 10:36:00,非临时用电,0,2021-12-31


In [9]:
cons_info["live_days"] = (cons_info["now"] - cons_info["BUILD_DATE"]).map(lambda x: x.days)
cons_info["check_days"] = (cons_info["now"] - cons_info["LAST_CHK_DATE"]).map(lambda x: x.days)
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,IS_FLAG,now,live_days,check_days
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31,6040,6040
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31,6040,6040
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31,6040,6040
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31,6040,6040
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,2005-06-18 00:00:00,240.0,2005-06-18 00:00:00,非临时用电,0,2021-12-31,6040,6040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-02 16:12:00,120.0,2019-12-02 16:12:00,非临时用电,0,2021-12-31,759,759
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,0.0,2019-12-03 16:33:00,120.0,2019-12-03 16:33:00,非临时用电,0,2021-12-31,758,758
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-18 14:42:00,120.0,2019-12-18 14:42:00,非临时用电,0,2021-12-31,743,743
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,2019-12-20 10:36:00,120.0,2019-12-20 10:36:00,非临时用电,0,2021-12-31,741,741


In [10]:
cons_info = cons_info.drop(columns=["BUILD_DATE", "LAST_CHK_DATE", "now"])
cons_info

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,TMP_NAME,IS_FLAG,live_days,check_days
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
329465205,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,240.0,非临时用电,0,6040,6040
329465245,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,4.0,4.0,1.0,240.0,非临时用电,0,6040,6040
329465357,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,240.0,非临时用电,0,6040,6040
329465360,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,2.0,2.0,1.0,240.0,非临时用电,0,6040,6040
329465439,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,8.0,8.0,1.0,240.0,非临时用电,0,6040,6040
...,...,...,...,...,...,...,...,...,...,...,...
2862711473,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,120.0,非临时用电,0,759,759
2864274650,乡村居民生活用电,交流220V,居民合表电价(不满1千伏）,13.0,13.0,0.0,120.0,非临时用电,0,758,758
2872313739,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,120.0,非临时用电,0,743,743
2872314719,乡村居民生活用电,交流380V,居民合表电价(不满1千伏）,39.0,39.0,0.0,120.0,非临时用电,0,741,741


In [11]:
cons_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8840 entries, 329465205 to 2529558569
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ELEC_TYPE_NAME  8840 non-null   object 
 1   VOLT_NAME       8840 non-null   object 
 2   PRC_NAME        8840 non-null   object 
 3   CONTRACT_CAP    8840 non-null   float64
 4   RUN_CAP         8840 non-null   float64
 5   SHIFT_NO        8840 non-null   float64
 6   CHK_CYCLE       8838 non-null   float64
 7   TMP_NAME        8840 non-null   object 
 8   IS_FLAG         8840 non-null   int64  
 9   live_days       8840 non-null   int64  
 10  check_days      8840 non-null   int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 828.8+ KB


In [12]:
cons_info = pd.get_dummies(cons_info)
cons_info

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,IS_FLAG,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,ELEC_TYPE_NAME_居民生活用电,ELEC_TYPE_NAME_普通工业,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民合表电价(不满1千伏）,TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329465205,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,0,0,0,0,0,1,0,0,1,1
329465245,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,0,0,0,0,0,1,0,0,1,1
329465357,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0,0,0,0,0,1,0,0,1,1
329465360,2.0,2.0,1.0,240.0,0,6040,6040,1,0,0,...,0,0,0,0,0,1,0,0,1,1
329465439,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862711473,39.0,39.0,0.0,120.0,0,759,759,1,0,0,...,0,0,0,0,0,0,1,0,1,1
2864274650,13.0,13.0,0.0,120.0,0,758,758,1,0,0,...,0,0,0,0,0,1,0,0,1,1
2872313739,39.0,39.0,0.0,120.0,0,743,743,1,0,0,...,0,0,0,0,0,0,1,0,1,1
2872314719,39.0,39.0,0.0,120.0,0,741,741,1,0,0,...,0,0,0,0,0,0,1,0,1,1


In [13]:
cons_info["CHK_CYCLE"] = cons_info["CHK_CYCLE"].fillna(cons_info["CHK_CYCLE"].median())
cons_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8840 entries, 329465205 to 2529558569
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CONTRACT_CAP                8840 non-null   float64
 1   RUN_CAP                     8840 non-null   float64
 2   SHIFT_NO                    8840 non-null   float64
 3   CHK_CYCLE                   8840 non-null   float64
 4   IS_FLAG                     8840 non-null   int64  
 5   live_days                   8840 non-null   int64  
 6   check_days                  8840 non-null   int64  
 7   ELEC_TYPE_NAME_乡村居民生活用电     8840 non-null   uint8  
 8   ELEC_TYPE_NAME_商业用电         8840 non-null   uint8  
 9   ELEC_TYPE_NAME_城镇居民生活用电     8840 non-null   uint8  
 10  ELEC_TYPE_NAME_学校教学和学生生活用电  8840 non-null   uint8  
 11  ELEC_TYPE_NAME_居民生活用电       8840 non-null   uint8  
 12  ELEC_TYPE_NAME_普通工业         8840 non-null   uint8  
 13  ELEC_TYPE_NAME_非居民照

<h3>elec_month数据预处理</h3>

In [14]:
cons_info["pq_z_mean"] = 0
cons_info["pq_z_median"] = 0
cons_info["pq_z_std"] = 0
cons_info["pq_z_std/mean"] = 0
cons_info["pq_z_entropy"] = 0
cons_info["pq_fg_distance"] = 0
cons_info["pq_gp_diatance"] = 0
cons_info["pq_fp_distance"] = 0
cons_info["pq_fg_cosine"] = 0
cons_info["pq_gp_cosine"] = 0
cons_info["pq_fp_cosine"] = 0
cons_info["pq_fz_cosine"] = 0
cons_info["pq_gz_cosine"] = 0
cons_info["pq_pz_cosine"] = 0
cons_info["variation_month"] = 0
cons_info["pq_z_mean_4500"] = 0
cons_info["variation_month_0085"] = 0

cons_info["pq_z_max"] = 0
cons_info["pq_z_min"] = 0

cons_info["entropy_month"] = 0
cons_info["std_month"] = 0

cons_info["diff_month"] = 0
cons_info["devide_month"] = 0

In [15]:
def entropy(data, bins: int = 10) -> float:
    """
    计算一个序列的熵\n
    """
    nums = np.histogram(data, bins=bins)[0]
    probs = nums / sum(nums)
    return stats.entropy(probs)

In [16]:
def entropy_df(df: pd.DataFrame, bins: int = 10) -> float:
    """
    计算一段时间子电量的熵\n
    """
    entropys = df.apply(lambda x: entropy(x), axis=1)
    return entropys.mean()

In [17]:
def std_df(df: pd.DataFrame) -> float:
    """
    计算一段时间子电量的标准差\n
    """
    return df.std(axis=1).mean()

In [18]:
def cosine(a, b) -> float:
    """
    计算余弦距离\n
    """
    if np.linalg.norm(a) != 0 and np.linalg.norm(b) != 0:
        return distance.cosine(a, b)
    elif np.linalg.norm(a) == 0 and np.linalg.norm(b) == 0:
        return 0
    else:
        return 1

In [19]:
def diff_median(df: pd.DataFrame) -> float:
    """
    计算一个df的两列数值差的中位数\n
    """
    results = []
    for i, j in combinations(list(range(df.shape[1])), r=2):
        result = (df.iloc[:, i] - df.iloc[:, j]).apply(lambda x: abs(x)).median()
        results.append(result)
    return sum(results)

In [20]:
def devide_median(df: pd.DataFrame) -> float:
    """
    计算两个列的相除的中位数\n
    """
    results = []
    for i, j in combinations(list(range(df.shape[1])), r=2):
        result = (df.iloc[:, 0] / (df.iloc[:, 1] + epsilon())).median()
        results.append(result)
    return sum(results)

In [21]:
def variation(df: pd.DataFrame) -> float:
    """
    计算变异系数\n
    """
    tt =  (epsilon() + df.std(axis=1)) / (epsilon() + df.mean(axis=1))
    tt[tt.apply(lambda x: np.isinf(x))] = 0
    return tt.mean(axis=0)

In [22]:
elec_month = pd.read_csv("../data/训练组_比特币挖矿_月用电明细（20211217）.csv")
elec_month

Unnamed: 0,id,ym,pq_f,pq_g,pq_p,pq_z
0,329769614,202001,615,161,0,776
1,329769614,202002,492,154,-1,645
2,329769614,202003,457,114,1,572
3,329769614,202004,522,171,0,693
4,329769614,202005,519,181,0,700
...,...,...,...,...,...,...
194475,2878334859,202106,107,67,-1,173
194476,2878334859,202107,170,112,1,283
194477,2878334859,202108,102,67,0,169
194478,2878334859,202109,112,65,-1,176


In [23]:
elec_month["ym"] = elec_month["ym"].astype(str)
elec_month

Unnamed: 0,id,ym,pq_f,pq_g,pq_p,pq_z
0,329769614,202001,615,161,0,776
1,329769614,202002,492,154,-1,645
2,329769614,202003,457,114,1,572
3,329769614,202004,522,171,0,693
4,329769614,202005,519,181,0,700
...,...,...,...,...,...,...
194475,2878334859,202106,107,67,-1,173
194476,2878334859,202107,170,112,1,283
194477,2878334859,202108,102,67,0,169
194478,2878334859,202109,112,65,-1,176


In [24]:
for cons_id in tqdm(cons_info.index):
    df = elec_month[elec_month["id"] == cons_id]
    df = df.drop(columns=["id"]).sort_values("ym").set_index("ym")
    df = df.fillna(method="ffill").fillna(method="bfill")
    cons_info.loc[cons_id, "pq_z_mean"] = df["pq_z"].mean()
    cons_info.loc[cons_id, "pq_z_median"] = df["pq_z"].median()
    cons_info.loc[cons_id, "pq_z_std"] = df["pq_z"].std()
    cons_info.loc[(cons_info.index == cons_id) & (cons_info["pq_z_mean"] > 0), "pq_z_std/mean"] = cons_info["pq_z_std"] / cons_info["pq_z_mean"]
    cons_info.loc[cons_id, "pq_z_entropy"] = entropy(df["pq_z"])
    
    cons_info.loc[cons_id, "pq_fg_distance"] = distance.euclidean(df["pq_f"], df["pq_g"])
    cons_info.loc[cons_id, "pq_gp_diatance"] = distance.euclidean(df["pq_g"], df["pq_p"])
    cons_info.loc[cons_id, "pq_fp_distance"] = distance.euclidean(df["pq_f"], df["pq_p"])
    
    cons_info.loc[cons_id, "pq_fg_cosine"] = cosine(df["pq_f"], df["pq_g"])
    cons_info.loc[cons_id, "pq_gp_cosine"] = cosine(df["pq_g"], df["pq_p"])
    cons_info.loc[cons_id, "pq_fp_cosine"] = cosine(df["pq_f"], df["pq_p"])
    cons_info.loc[cons_id, "pq_fz_cosine"] = cosine(df["pq_f"], df["pq_z"])
    cons_info.loc[cons_id, "pq_gz_cosine"] = cosine(df["pq_g"], df["pq_z"])
    cons_info.loc[cons_id, "pq_pz_cosine"] = cosine(df["pq_p"], df["pq_z"])
    
    cons_info.loc[cons_id, "variation_month"] = variation(df[["pq_f", "pq_g", "pq_p"]])
    cons_info.loc[cons_id, "pq_z_mean_4500"] = (cons_info.loc[cons_id, "pq_z_mean"] > 4500).astype(float)
    cons_info.loc[cons_id, "variation_month_0085"] = (cons_info.loc[cons_id, "variation_month"] < 0.085).astype(float)
    
    cons_info.loc[cons_id, "pq_z_max"] = df["pq_z"].max()
    cons_info.loc[cons_id, "pq_z_min"] = df["pq_z"].min()
    
    cons_info.loc[cons_id, "entropy_month"] = entropy_df(df[["pq_f", "pq_g", "pq_p"]])
    cons_info.loc[cons_id, "std_month"] = std_df(df[["pq_f", "pq_g", "pq_p"]])
    
    cons_info.loc[cons_id, "diff_month"] = diff_median(df[["pq_f", "pq_g", "pq_p"]])
    cons_info.loc[cons_id, "devide_month"] = devide_median(df[["pq_f", "pq_g", "pq_p"]])    

    
cons_info

100%|████████████████████████████████████████████████████████████████████████████████| 8840/8840 [02:52<00:00, 51.20it/s]


Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,IS_FLAG,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,pq_pz_cosine,variation_month,pq_z_mean_4500,variation_month_0085,pq_z_max,pq_z_min,entropy_month,std_month,diff_month,devide_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329465205,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,1.054778,0.985564,0.0,0.0,1440,218,1.098612,236.742067,915.0,5.338216
329465245,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,1.048902,1.036198,0.0,0.0,2263,613,1.098612,356.826146,1349.5,6.739272
329465357,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0.028826,1.653016,0.0,0.0,3401,1186,0.678523,941.880880,3151.0,0.000000
329465360,2.0,2.0,1.0,240.0,0,6040,6040,1,0,0,...,0.856495,1.078517,0.0,0.0,2240,553,1.098612,384.279868,1495.5,7.056769
329465439,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0.000000,1.732051,0.0,0.0,2211,678,0.636514,681.299561,2319.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862711473,39.0,39.0,0.0,120.0,0,759,759,1,0,0,...,1.074872,0.991196,0.0,0.0,3044,808,1.098612,532.213416,2065.5,5.876878
2864274650,13.0,13.0,0.0,120.0,0,758,758,1,0,0,...,1.128561,0.998110,0.0,0.0,1670,53,1.098612,148.889645,122.0,5.550620
2872313739,39.0,39.0,0.0,120.0,0,743,743,1,0,0,...,1.000000,1.055087,0.0,0.0,10,0,0.078869,0.329195,0.0,0.000000
2872314719,39.0,39.0,0.0,120.0,0,741,741,1,0,0,...,0.918036,1.021503,0.0,0.0,1120,2,1.077608,85.509035,228.5,6.121250


<h3>elec_day数据预处理</h3>

In [25]:
cons_info["kwh_mean"] = 0
cons_info["kwh_median"] = 0
cons_info["kwh_std"] = 0
cons_info["kwh_std/mean"] = 0
cons_info["kwh_entropy"] = 0

cons_info["kwh_pap_r23_distance"] = 0
cons_info["kwh_pap_r24_distance"] = 0
cons_info["kwh_pap_r34_distance"] = 0

cons_info["kwh_pap_r23_cosine"] = 0
cons_info["kwh_pap_r24_cosine"] = 0
cons_info["kwh_pap_r34_cosine"] = 0

cons_info["kwh_pap_r1_cosine"] = 0
cons_info["kwh_pap_r2_cosine"] = 0
cons_info["kwh_pap_r3_cosine"] = 0
cons_info["kwh_pap_r4_cosine"] = 0
cons_info["variation_day"] = 0

cons_info["kwh_max"] = 0
cons_info["kwh_min"] = 0

cons_info["entropy_day"] = 0
cons_info["std_day"] = 0

cons_info["diff_day"] = 0
cons_info["devide_day"] = 0

In [26]:
elec_day = pd.read_csv("../data/训练组_比特币挖矿_日用电明细（20211217）.csv")
elec_day

Unnamed: 0,id,rq,kwh,kwh_rap,kwh_pap_r1,kwh_pap_r2,kwh_pap_r3,kwh_pap_r4
0,329449398,2020-01-22 00:00:00,11.77,0.0,0.0,7.40,0.0,4.36
1,329449398,2020-01-23 00:00:00,16.54,0.0,0.0,8.68,0.0,7.86
2,329449398,2020-01-24 00:00:00,10.86,0.0,0.0,5.51,0.0,5.36
3,329449398,2020-01-25 00:00:00,10.91,0.0,0.0,6.48,0.0,4.43
4,329449398,2020-01-26 00:00:00,10.85,0.0,0.0,6.32,0.0,4.52
...,...,...,...,...,...,...,...,...
954564,2878334859,2021-10-05 00:00:00,7.22,,0.0,4.49,0.0,2.73
954565,2878334859,2021-10-06 00:00:00,7.23,,0.0,4.48,0.0,2.74
954566,2878334859,2021-10-07 00:00:00,7.20,,0.0,4.48,0.0,2.73
954567,2878334859,2021-10-08 00:00:00,7.23,,0.0,4.50,0.0,2.73


In [27]:
elec_day["rq"] = pd.to_datetime(elec_day["rq"])
elec_day

Unnamed: 0,id,rq,kwh,kwh_rap,kwh_pap_r1,kwh_pap_r2,kwh_pap_r3,kwh_pap_r4
0,329449398,2020-01-22,11.77,0.0,0.0,7.40,0.0,4.36
1,329449398,2020-01-23,16.54,0.0,0.0,8.68,0.0,7.86
2,329449398,2020-01-24,10.86,0.0,0.0,5.51,0.0,5.36
3,329449398,2020-01-25,10.91,0.0,0.0,6.48,0.0,4.43
4,329449398,2020-01-26,10.85,0.0,0.0,6.32,0.0,4.52
...,...,...,...,...,...,...,...,...
954564,2878334859,2021-10-05,7.22,,0.0,4.49,0.0,2.73
954565,2878334859,2021-10-06,7.23,,0.0,4.48,0.0,2.74
954566,2878334859,2021-10-07,7.20,,0.0,4.48,0.0,2.73
954567,2878334859,2021-10-08,7.23,,0.0,4.50,0.0,2.73


In [28]:
for cons_id in tqdm(cons_info.index):
    df = elec_day[elec_day["id"] == cons_id]
    df = df.drop(columns=["id"]).sort_values("rq").set_index("rq")
    df = df.fillna(method="ffill").fillna(method="bfill")
    cons_info.loc[cons_id, "kwh_mean"] = df["kwh"].mean()
    cons_info.loc[cons_id, "kwh_median"] = df["kwh"].median()
    cons_info.loc[cons_id, "kwh_std"] = df["kwh"].std()
    cons_info.loc[(cons_info.index == cons_id) & (cons_info["kwh_mean"] > 0), "kwh_std/mean"] = cons_info["kwh_std"] / cons_info["kwh_mean"]
    cons_info.loc[cons_id, "kwh_entropy"] = entropy(df["kwh"])

    cons_info.loc[cons_id, "kwh_pap_r23_distance"] = distance.euclidean(df["kwh_pap_r2"], df["kwh_pap_r3"])
    cons_info.loc[cons_id, "kwh_pap_r24_distance"] = distance.euclidean(df["kwh_pap_r2"], df["kwh_pap_r4"])
    cons_info.loc[cons_id, "kwh_pap_r34_distance"] = distance.euclidean(df["kwh_pap_r3"], df["kwh_pap_r4"])

    cons_info.loc[cons_id, "kwh_pap_r23_cosine"] = cosine(df["kwh_pap_r2"], df["kwh_pap_r3"])
    cons_info.loc[cons_id, "kwh_pap_r24_cosine"] = cosine(df["kwh_pap_r2"], df["kwh_pap_r4"])
    cons_info.loc[cons_id, "kwh_pap_r34_cosine"] = cosine(df["kwh_pap_r3"], df["kwh_pap_r4"])
    
    cons_info.loc[cons_id, "kwh_pap_r1_cosine"] = cosine(df["kwh_pap_r1"], df["kwh"])
    cons_info.loc[cons_id, "kwh_pap_r2_cosine"] = cosine(df["kwh_pap_r2"], df["kwh"])
    cons_info.loc[cons_id, "kwh_pap_r3_cosine"] = cosine(df["kwh_pap_r3"], df["kwh"])
    cons_info.loc[cons_id, "kwh_pap_r4_cosine"] = cosine(df["kwh_pap_r4"], df["kwh"])
    
    cons_info.loc[cons_id, "variation_day"] = variation(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    cons_info.loc[cons_id, "kwh_max"] = df["kwh"].max()
    cons_info.loc[cons_id, "kwh_min"] = df["kwh"].min()
    
    cons_info.loc[cons_id, "entropy_day"] = entropy_df(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    cons_info.loc[cons_id, "std_day"] = std_df(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    cons_info.loc[cons_id, "diff_day"] = diff_median(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    cons_info.loc[cons_id, "devide_day"] = devide_median(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    
cons_info

100%|████████████████████████████████████████████████████████████████████████████████| 8840/8840 [04:51<00:00, 30.35it/s]


Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,IS_FLAG,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,kwh_pap_r2_cosine,kwh_pap_r3_cosine,kwh_pap_r4_cosine,variation_day,kwh_max,kwh_min,entropy_day,std_day,diff_day,devide_day
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329465205,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,0.013884,1.0,0.048088,1.039291,45.71,0.66,1.060104,6.192106,20.875,3.454500e+08
329465245,4.0,4.0,1.0,240.0,0,6040,6040,1,0,0,...,0.004351,1.0,0.019658,1.045283,54.73,17.88,1.098612,12.109005,47.185,7.074000e+08
329465357,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0.001687,1.0,0.024876,1.236592,79.75,26.31,1.090055,22.801373,88.530,1.329000e+09
329465360,2.0,2.0,1.0,240.0,0,6040,6040,1,0,0,...,0.002899,1.0,0.017558,1.103814,74.44,14.16,1.098612,12.624684,49.825,7.327500e+08
329465439,8.0,8.0,1.0,240.0,0,6040,6040,1,0,0,...,0.003703,1.0,0.023050,1.094278,52.60,16.35,1.098612,12.514458,47.980,7.183500e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862711473,39.0,39.0,0.0,120.0,0,759,759,1,0,0,...,0.005853,1.0,0.019615,0.962473,74.60,13.68,1.085776,12.263838,42.910,6.667500e+08
2864274650,13.0,13.0,0.0,120.0,0,758,758,1,0,0,...,0.002854,1.0,0.013023,0.965715,42.91,1.50,1.085776,4.196080,4.035,5.565000e+07
2872313739,39.0,39.0,0.0,120.0,0,743,743,1,0,0,...,0.000000,0.0,0.000000,1.000000,0.00,0.00,0.000000,0.000000,0.000,0.000000e+00
2872314719,39.0,39.0,0.0,120.0,0,741,741,1,0,0,...,0.011304,1.0,0.135482,1.030999,60.99,4.68,1.034432,2.761213,7.980,1.218000e+08


<h2>数据集划分</h2>

In [29]:
x = cons_info.drop(columns=["IS_FLAG"])
y = cons_info["IS_FLAG"]
print(x.shape, y.shape)

(8840, 65) (8840,)


<h2>标准化</h2>

In [30]:
ss = StandardScaler()
x = ss.fit_transform(x)
x

array([[-0.31260749, -0.31917163,  1.05812375, ...,  0.05176978,
         0.04269137,  0.55764591],
       [-0.31260749, -0.31917163,  1.05812375, ...,  0.30898424,
         0.34491997,  1.84521864],
       [-0.2464131 , -0.25086739,  1.05812375, ...,  0.77379383,
         0.81985884,  4.05645004],
       ...,
       [ 0.2665934 ,  0.27849043, -0.86811471, ..., -0.21740823,
        -0.19710423, -0.671231  ],
       [ 0.2665934 ,  0.27849043, -0.86811471, ..., -0.09737511,
        -0.10543626, -0.23794917],
       [ 0.11765603,  0.1248059 , -0.86811471, ...,  0.14671492,
         0.17812562,  1.06883309]])

<h1>做出来一个模型</h1>

In [31]:
smo = ADASYN(random_state=42, n_jobs=-1)
x, y = smo.fit_sample(x, y)

In [32]:
model = LGBMClassifier(
    objective="binary", 
    n_jobs=-1
)

In [33]:
cv = GridSearchCV(
    estimator=model,
    param_grid={
        "n_estimators": list(range(50, 700, 50)),
        "num_leaves": list(range(1, 50, 5)),
        "max_depth": [-1] + list(range(2, 11, 2)),
        "subsample_for_bin": list(range(100000, 700000, 100000))
    },
    scoring="f1",
    n_jobs=-1,
    iid=False,
    cv=10,
    verbose=1
)

In [34]:
try:
    model = load("../models/best_lightBGM.pkl")
    print("n_estimators", model.n_estimators)
    print("num_leaves", model.num_leaves)
    print("max_depth", model.max_depth)
    print("subsample_for_bin", model.subsample_for_bin)
    f1 = f1_score(y, model.predict(x))
except FileNotFoundError as e:
    cv.fit(x, y)
    model = cv.best_estimator_
    print(cv.best_params_)
    f1 = cv.best_score_
    model.fit(x, y)
    dump(model, "../models/best_lightBGM.pkl")

n_estimators 200
num_leaves 16
max_depth 6
subsample_for_bin 100000


In [35]:
f1

1.0

In [36]:
accuracy_score(y, model.predict(x))

1.0

In [37]:
roc_auc_score(y, model.predict(x))

1.0

In [73]:
features = list(cons_info.columns)
features.pop(features.index("IS_FLAG"))
for i, importance in enumerate(model.feature_importances_):
    print(features[i] + "\t" + str(importance))

CONTRACT_CAP	77
RUN_CAP	4
SHIFT_NO	56
CHK_CYCLE	13
live_days	76
check_days	58
ELEC_TYPE_NAME_乡村居民生活用电	0
ELEC_TYPE_NAME_商业用电	0
ELEC_TYPE_NAME_城镇居民生活用电	0
ELEC_TYPE_NAME_学校教学和学生生活用电	0
ELEC_TYPE_NAME_居民生活用电	0
ELEC_TYPE_NAME_普通工业	0
ELEC_TYPE_NAME_非居民照明	0
ELEC_TYPE_NAME_非工业	0
VOLT_NAME_交流10kV	0
VOLT_NAME_交流220V	0
VOLT_NAME_交流380V	0
PRC_NAME_居民合表电价(1-10千伏）	0
PRC_NAME_居民合表电价(不满1千伏）	0
TMP_NAME_非临时用电	0
pq_z_mean	30
pq_z_median	26
pq_z_std	197
pq_z_std/mean	20
pq_z_entropy	90
pq_fg_distance	19
pq_gp_diatance	13
pq_fp_distance	15
pq_fg_cosine	16
pq_gp_cosine	15
pq_fp_cosine	11
pq_fz_cosine	18
pq_gz_cosine	14
pq_pz_cosine	46
variation_month	74
pq_z_mean_4500	71
variation_month_0085	52
pq_z_max	59
pq_z_min	94
entropy_month	109
std_month	17
diff_month	64
devide_month	98
kwh_mean	53
kwh_median	31
kwh_std	59
kwh_std/mean	16
kwh_entropy	20
kwh_pap_r23_distance	16
kwh_pap_r24_distance	20
kwh_pap_r34_distance	7
kwh_pap_r23_cosine	25
kwh_pap_r24_cosine	7
kwh_pap_r34_cosine	4
kwh_pap_r1_cosine	0
kwh_pap_r2_

<h1>处理测试数据</h1>

<h2>数据预处理</h2>

<h3>档案数据预处理</h3>

In [38]:
cons_info_test = pd.read_csv("../data/测试组_比特币挖矿_档案明细（20211220）.csv", index_col=0, quotechar='"', encoding="gbk")
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CANCEL_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,TMP_DATE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998/12/1,,24,1998/12/1,非临时用电,
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,1998/12/1,,24,1998/12/1,非临时用电,
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999/4/1,,24,1999/4/1,非临时用电,
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999/3/1,,36,1999/3/1,非临时用电,
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998/12/1,,24,1998/12/1,非临时用电,
...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019/11/8 12:42:00,,120,2019/11/8 12:42:00,非临时用电,
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,,2019/11/18 14:08:00,,120,2019/11/18 14:08:00,非临时用电,
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,,2019/11/18 14:08:00,,120,2019/11/18 14:08:00,非临时用电,
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019/11/11 15:29:00,,120,2019/11/11 15:29:00,非临时用电,


In [39]:
cons_info_test = cons_info_test.drop(columns=["CANCEL_DATE", "TMP_DATE"])
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998/12/1,24,1998/12/1,非临时用电
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,1998/12/1,24,1998/12/1,非临时用电
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999/4/1,24,1999/4/1,非临时用电
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999/3/1,36,1999/3/1,非临时用电
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998/12/1,24,1998/12/1,非临时用电
...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019/11/8 12:42:00,120,2019/11/8 12:42:00,非临时用电
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,,2019/11/18 14:08:00,120,2019/11/18 14:08:00,非临时用电
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,,2019/11/18 14:08:00,120,2019/11/18 14:08:00,非临时用电
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019/11/11 15:29:00,120,2019/11/11 15:29:00,非临时用电


In [40]:
cons_info_test["BUILD_DATE"] = pd.to_datetime(cons_info_test["BUILD_DATE"])
cons_info_test["LAST_CHK_DATE"] = pd.to_datetime(cons_info_test["LAST_CHK_DATE"])
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999-04-01 00:00:00,24,1999-04-01 00:00:00,非临时用电
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999-03-01 00:00:00,36,1999-03-01 00:00:00,非临时用电
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019-11-08 12:42:00,120,2019-11-08 12:42:00,非临时用电
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,,2019-11-11 15:29:00,120,2019-11-11 15:29:00,非临时用电


In [41]:
cons_info_test["SHIFT_NO"] = cons_info_test["SHIFT_NO"].fillna(0)
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999-04-01 00:00:00,24,1999-04-01 00:00:00,非临时用电
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999-03-01 00:00:00,36,1999-03-01 00:00:00,非临时用电
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电
...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-08 12:42:00,120,2019-11-08 12:42:00,非临时用电
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-11 15:29:00,120,2019-11-11 15:29:00,非临时用电


In [42]:
cons_info_test["now"] = "2021-12-31"
cons_info_test["now"] = pd.to_datetime(cons_info_test["now"])
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,now
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999-04-01 00:00:00,24,1999-04-01 00:00:00,非临时用电,2021-12-31
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999-03-01 00:00:00,36,1999-03-01 00:00:00,非临时用电,2021-12-31
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31
...,...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-08 12:42:00,120,2019-11-08 12:42:00,非临时用电,2021-12-31
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电,2021-12-31
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电,2021-12-31
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-11 15:29:00,120,2019-11-11 15:29:00,非临时用电,2021-12-31


In [43]:
cons_info_test["live_days"] = (cons_info_test["now"] - cons_info_test["BUILD_DATE"]).map(lambda x: x.days)
cons_info_test["check_days"] = (cons_info_test["now"] - cons_info_test["LAST_CHK_DATE"]).map(lambda x: x.days)
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,BUILD_DATE,CHK_CYCLE,LAST_CHK_DATE,TMP_NAME,now,live_days,check_days
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31,8431,8431
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31,8431,8431
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1999-04-01 00:00:00,24,1999-04-01 00:00:00,非临时用电,2021-12-31,8310,8310
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,1999-03-01 00:00:00,36,1999-03-01 00:00:00,非临时用电,2021-12-31,8341,8341
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,1998-12-01 00:00:00,24,1998-12-01 00:00:00,非临时用电,2021-12-31,8431,8431
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-08 12:42:00,120,2019-11-08 12:42:00,非临时用电,2021-12-31,783,783
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电,2021-12-31,773,773
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,0.0,2019-11-18 14:08:00,120,2019-11-18 14:08:00,非临时用电,2021-12-31,773,773
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,2019-11-11 15:29:00,120,2019-11-11 15:29:00,非临时用电,2021-12-31,780,780


In [44]:
cons_info_test = cons_info_test.drop(columns=["BUILD_DATE", "LAST_CHK_DATE", "now"])
cons_info_test

Unnamed: 0_level_0,ELEC_TYPE_NAME,VOLT_NAME,PRC_NAME,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,TMP_NAME,live_days,check_days
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
179406029,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,24,非临时用电,8431,8431
179406030,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,24,非临时用电,8431,8431
179406094,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,24,非临时用电,8310,8310
179406097,城镇居民生活用电,交流220V,居民生活<1kV(合表),4.0,4.0,1.0,36,非临时用电,8341,8341
179406099,城镇居民生活用电,交流220V,居民生活<1kV(合表),2.0,2.0,1.0,24,非临时用电,8431,8431
...,...,...,...,...,...,...,...,...,...,...
2849971818,乡村居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,120,非临时用电,783,783
2850017469,乡村居民生活用电,交流380V,居民生活<1kV(合表),120.0,120.0,0.0,120,非临时用电,773,773
2850017472,乡村居民生活用电,交流380V,居民生活<1kV(合表),720.0,720.0,0.0,120,非临时用电,773,773
2851323065,城镇居民生活用电,交流220V,居民生活<1kV(合表),12.0,12.0,0.0,120,非临时用电,780,780


In [45]:
cons_info_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15379 entries, 179406029 to 2852368013
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ELEC_TYPE_NAME  15379 non-null  object 
 1   VOLT_NAME       15379 non-null  object 
 2   PRC_NAME        15379 non-null  object 
 3   CONTRACT_CAP    15379 non-null  float64
 4   RUN_CAP         15379 non-null  float64
 5   SHIFT_NO        15379 non-null  float64
 6   CHK_CYCLE       15379 non-null  int64  
 7   TMP_NAME        15379 non-null  object 
 8   live_days       15379 non-null  int64  
 9   check_days      15379 non-null  int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 1.3+ MB


In [46]:
cons_info_test = pd.get_dummies(cons_info_test)
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_农业生产用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,VOLT_NAME_交流6kV,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民生活1_10kV(合表),PRC_NAME_居民生活<1kV(合表),TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
179406030,12.0,12.0,0.0,24,8431,8431,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
179406094,2.0,2.0,1.0,24,8310,8310,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
179406097,4.0,4.0,1.0,36,8341,8341,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
179406099,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0,0,0,1,0,0,0,0,1,1
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2851323065,12.0,12.0,0.0,120,780,780,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1


In [47]:
cons_info_test["CHK_CYCLE"] = cons_info_test["CHK_CYCLE"].fillna(cons_info_test["CHK_CYCLE"].median())
cons_info_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15379 entries, 179406029 to 2852368013
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CONTRACT_CAP                15379 non-null  float64
 1   RUN_CAP                     15379 non-null  float64
 2   SHIFT_NO                    15379 non-null  float64
 3   CHK_CYCLE                   15379 non-null  int64  
 4   live_days                   15379 non-null  int64  
 5   check_days                  15379 non-null  int64  
 6   ELEC_TYPE_NAME_乡村居民生活用电     15379 non-null  uint8  
 7   ELEC_TYPE_NAME_农业生产用电       15379 non-null  uint8  
 8   ELEC_TYPE_NAME_商业用电         15379 non-null  uint8  
 9   ELEC_TYPE_NAME_城镇居民生活用电     15379 non-null  uint8  
 10  ELEC_TYPE_NAME_大工业用电        15379 non-null  uint8  
 11  ELEC_TYPE_NAME_学校教学和学生生活用电  15379 non-null  uint8  
 12  ELEC_TYPE_NAME_居民生活用电       15379 non-null  uint8  
 13  ELEC_TYPE_NAME_普通工

In [48]:
cons_info_test["PRC_NAME_居民合表电价(1-10千伏）"] = cons_info_test["PRC_NAME_居民合表电价(1-10千伏）"] + cons_info_test["PRC_NAME_居民生活1_10kV(合表)"]
cons_info_test = cons_info_test.drop(columns=["PRC_NAME_居民生活1_10kV(合表)"])
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_农业生产用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,ELEC_TYPE_NAME_普通工业,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,VOLT_NAME_交流6kV,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民生活<1kV(合表),TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406030,12.0,12.0,0.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406094,2.0,2.0,1.0,24,8310,8310,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406097,4.0,4.0,1.0,36,8341,8341,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406099,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2851323065,12.0,12.0,0.0,120,780,780,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1


In [49]:
cons_info_test = cons_info_test.rename(columns={"PRC_NAME_居民生活<1kV(合表)": "PRC_NAME_居民合表电价(不满1千伏）"})
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_农业生产用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,ELEC_TYPE_NAME_普通工业,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,VOLT_NAME_交流6kV,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民合表电价(不满1千伏）,TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406030,12.0,12.0,0.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406094,2.0,2.0,1.0,24,8310,8310,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406097,4.0,4.0,1.0,36,8341,8341,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
179406099,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2851323065,12.0,12.0,0.0,120,780,780,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1


In [50]:
cons_info_test["VOLT_NAME_交流10kV"] = cons_info_test["VOLT_NAME_交流10kV"] + cons_info_test["VOLT_NAME_交流6kV"]
cons_info_test = cons_info_test.drop(columns=["VOLT_NAME_交流6kV"])
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_农业生产用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,...,ELEC_TYPE_NAME_居民生活用电,ELEC_TYPE_NAME_普通工业,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民合表电价(不满1千伏）,TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
179406030,12.0,12.0,0.0,24,8431,8431,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
179406094,2.0,2.0,1.0,24,8310,8310,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
179406097,4.0,4.0,1.0,36,8341,8341,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
179406099,2.0,2.0,1.0,24,8431,8431,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
2851323065,12.0,12.0,0.0,120,780,780,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1


In [51]:
cons_info_test["ELEC_TYPE_NAME_普通工业"] = cons_info_test["ELEC_TYPE_NAME_普通工业"] + cons_info_test["ELEC_TYPE_NAME_大工业用电"]
cons_info_test = cons_info_test.drop(columns=["ELEC_TYPE_NAME_大工业用电"])
cons_info_test["ELEC_TYPE_NAME_非工业"] = cons_info_test["ELEC_TYPE_NAME_非工业"] + cons_info_test["ELEC_TYPE_NAME_农业生产用电"]
cons_info_test = cons_info_test.drop(columns=["ELEC_TYPE_NAME_农业生产用电"])
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,ELEC_TYPE_NAME_学校教学和学生生活用电,ELEC_TYPE_NAME_居民生活用电,ELEC_TYPE_NAME_普通工业,ELEC_TYPE_NAME_非居民照明,ELEC_TYPE_NAME_非工业,VOLT_NAME_交流10kV,VOLT_NAME_交流220V,VOLT_NAME_交流380V,PRC_NAME_居民合表电价(1-10千伏）,PRC_NAME_居民合表电价(不满1千伏）,TMP_NAME_非临时用电
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,1,0,0,0,0,0,0,1,0,0,1,1
179406030,12.0,12.0,0.0,24,8431,8431,0,0,1,0,0,0,0,0,0,1,0,0,1,1
179406094,2.0,2.0,1.0,24,8310,8310,0,0,1,0,0,0,0,0,0,1,0,0,1,1
179406097,4.0,4.0,1.0,36,8341,8341,0,0,1,0,0,0,0,0,0,1,0,0,1,1
179406099,2.0,2.0,1.0,24,8431,8431,0,0,1,0,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,0,0,0,0,0,1,0,0,1,1
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,0,0,0,0,0,0,1,0,1,1
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,0,0,0,0,0,0,1,0,1,1
2851323065,12.0,12.0,0.0,120,780,780,0,0,1,0,0,0,0,0,0,1,0,0,1,1


<h3>月电量处理</h3>

In [52]:
cons_info_test["pq_z_mean"] = 0
cons_info_test["pq_z_median"] = 0
cons_info_test["pq_z_std"] = 0
cons_info_test["pq_z_std/mean"] = 0
cons_info_test["pq_z_entropy"] = 0
cons_info_test["pq_fg_distance"] = 0
cons_info_test["pq_gp_diatance"] = 0
cons_info_test["pq_fp_distance"] = 0
cons_info_test["pq_fg_cosine"] = 0
cons_info_test["pq_gp_cosine"] = 0
cons_info_test["pq_fp_cosine"] = 0
cons_info_test["pq_fz_cosine"] = 0
cons_info_test["pq_gz_cosine"] = 0
cons_info_test["pq_pz_cosine"] = 0
cons_info_test["variation_month"] = 0
cons_info_test["pq_z_mean_4500"] = 0
cons_info_test["variation_month_0085"] = 0

cons_info_test["pq_z_max"] = 0
cons_info_test["pq_z_min"] = 0

cons_info_test["entropy_month"] = 0
cons_info_test["std_month"] = 0

cons_info_test["diff_month"] = 0
cons_info_test["devide_month"] = 0

In [53]:
elec_month_test = pd.read_csv("../data/测试组_比特币挖矿_月用电明细（20211217）.csv")
elec_month_test

Unnamed: 0,id,ym,pq_f,pq_g,pq_p,pq_z
0,179438260,202001,244,111,1,356
1,179438260,202002,217,108,-1,324
2,179438260,202003,280,90,0,370
3,179438260,202004,338,118,1,457
4,179438260,202005,297,120,-1,416
...,...,...,...,...,...,...
338333,2880712108,202106,474,675,0,1149
338334,2880712108,202107,342,881,0,1223
338335,2880712108,202108,293,772,0,1065
338336,2880712108,202109,418,570,-1,987


In [54]:
elec_month_test["ym"] = elec_month_test["ym"].astype(str)
elec_month_test

Unnamed: 0,id,ym,pq_f,pq_g,pq_p,pq_z
0,179438260,202001,244,111,1,356
1,179438260,202002,217,108,-1,324
2,179438260,202003,280,90,0,370
3,179438260,202004,338,118,1,457
4,179438260,202005,297,120,-1,416
...,...,...,...,...,...,...
338333,2880712108,202106,474,675,0,1149
338334,2880712108,202107,342,881,0,1223
338335,2880712108,202108,293,772,0,1065
338336,2880712108,202109,418,570,-1,987


In [55]:
for cons_id in tqdm(cons_info_test.index):
    df = elec_month_test[elec_month_test["id"] == cons_id]
    df = df.drop(columns=["id"]).sort_values("ym").set_index("ym")
    df = df.fillna(method="ffill").fillna(method="bfill")
    cons_info_test.loc[cons_id, "pq_z_mean"] = df["pq_z"].mean()
    cons_info_test.loc[cons_id, "pq_z_median"] = df["pq_z"].median()
    cons_info_test.loc[cons_id, "pq_z_std"] = df["pq_z"].std()
    cons_info_test.loc[(cons_info_test.index == cons_id) & (cons_info_test["pq_z_mean"] > 0), "pq_z_std/mean"] = cons_info_test["pq_z_std"] / cons_info_test["pq_z_mean"]
    cons_info_test.loc[cons_id, "pq_z_entropy"] = entropy(df["pq_z"])
    
    cons_info_test.loc[cons_id, "pq_fg_distance"] = distance.euclidean(df["pq_f"], df["pq_g"])
    cons_info_test.loc[cons_id, "pq_gp_diatance"] = distance.euclidean(df["pq_g"], df["pq_p"])
    cons_info_test.loc[cons_id, "pq_fp_distance"] = distance.euclidean(df["pq_f"], df["pq_p"])
    
    cons_info_test.loc[cons_id, "pq_fg_cosine"] = cosine(df["pq_f"], df["pq_g"])
    cons_info_test.loc[cons_id, "pq_gp_cosine"] = cosine(df["pq_g"], df["pq_p"])
    cons_info_test.loc[cons_id, "pq_fp_cosine"] = cosine(df["pq_f"], df["pq_p"])
    cons_info_test.loc[cons_id, "pq_fz_cosine"] = cosine(df["pq_f"], df["pq_z"])
    cons_info_test.loc[cons_id, "pq_gz_cosine"] = cosine(df["pq_g"], df["pq_z"])
    cons_info_test.loc[cons_id, "pq_pz_cosine"] = cosine(df["pq_p"], df["pq_z"])
    
    cons_info_test.loc[cons_id, "variation_month"] = variation(df[["pq_f", "pq_g", "pq_p"]])
    cons_info_test.loc[cons_id, "pq_z_mean_4500"] = (cons_info_test.loc[cons_id, "pq_z_mean"] > 4500).astype(float)
    cons_info_test.loc[cons_id, "variation_month_0085"] = (cons_info_test.loc[cons_id, "variation_month"] < 0.085).astype(float)

    cons_info_test.loc[cons_id, "pq_z_max"] = df["pq_z"].max()
    cons_info_test.loc[cons_id, "pq_z_min"] = df["pq_z"].min()
    
    cons_info_test.loc[cons_id, "entropy_month"] = entropy_df(df[["pq_f", "pq_g", "pq_p"]])
    cons_info_test.loc[cons_id, "std_month"] = std_df(df[["pq_f", "pq_g", "pq_p"]])
    
    cons_info_test.loc[cons_id, "diff_month"] = diff_median(df[["pq_f", "pq_g", "pq_p"]])
    cons_info_test.loc[cons_id, "devide_month"] = devide_median(df[["pq_f", "pq_g", "pq_p"]])  
    
    
cons_info_test

100%|██████████████████████████████████████████████████████████████████████████████| 15379/15379 [05:08<00:00, 49.82it/s]


Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,ELEC_TYPE_NAME_学校教学和学生生活用电,...,pq_pz_cosine,variation_month,pq_z_mean_4500,variation_month_0085,pq_z_max,pq_z_min,entropy_month,std_month,diff_month,devide_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,0.964811,0.980133,0.0,0.0,1366,587,1.035599,262.464781,934.5,5.250582
179406030,12.0,12.0,0.0,24,8431,8431,0,0,1,0,...,0.000000,1.732051,0.0,0.0,865,131,0.636514,203.463484,611.0,0.000000
179406094,2.0,2.0,1.0,24,8310,8310,0,0,1,0,...,1.105591,1.496756,0.0,0.0,984,483,0.867563,294.951725,1006.5,29.204981
179406097,4.0,4.0,1.0,36,8341,8341,0,0,1,0,...,1.140851,1.014585,0.0,0.0,1299,360,1.077608,221.420533,811.5,5.556705
179406099,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,1.072008,0.969216,0.0,0.0,804,80,1.077608,128.638531,496.0,5.175580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0.853454,1.253312,0.0,0.0,554,125,1.056603,94.071420,313.5,0.684623
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0.000000,1.000000,0.0,0.0,0,0,0.000000,0.000000,0.0,0.000000
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0.001275,0.280995,1.0,0.0,40812,0,0.880640,1957.498271,7118.0,4.984970
2851323065,12.0,12.0,0.0,120,780,780,0,0,1,0,...,0.765847,1.201164,0.0,0.0,739,0,0.749054,91.631427,259.5,11.510417


<h3>日电量处理</h3>

In [56]:
cons_info_test["kwh_mean"] = 0
cons_info_test["kwh_median"] = 0
cons_info_test["kwh_std"] = 0
cons_info_test["kwh_std/mean"] = 0
cons_info_test["kwh_entropy"] = 0

cons_info_test["kwh_pap_r23_distance"] = 0
cons_info_test["kwh_pap_r24_distance"] = 0
cons_info_test["kwh_pap_r34_distance"] = 0

cons_info_test["kwh_pap_r23_cosine"] = 0
cons_info_test["kwh_pap_r24_cosine"] = 0
cons_info_test["kwh_pap_r34_cosine"] = 0

cons_info_test["kwh_pap_r1_cosine"] = 0
cons_info_test["kwh_pap_r2_cosine"] = 0
cons_info_test["kwh_pap_r3_cosine"] = 0
cons_info_test["kwh_pap_r4_cosine"] = 0
cons_info_test["variation_day"] = 0

cons_info_test["kwh_max"] = 0
cons_info_test["kwh_min"] = 0

cons_info_test["entropy_day"] = 0
cons_info_test["std_day"] = 0

cons_info_test["diff_day"] = 0
cons_info_test["devide_day"] = 0

In [57]:
elec_day_test = pd.read_csv("../data/测试组_比特币挖矿_日用电明细（20211217）.csv")
elec_day_test

Unnamed: 0,id,rq,kwh,kwh_rap,kwh_pap_r1,kwh_pap_r2,kwh_pap_r3,kwh_pap_r4
0,179404030,2020-01-22 00:00:00,51.77,,0.0,33.47,0.0,18.31
1,179404030,2020-01-23 00:00:00,73.27,,0.0,51.83,0.0,21.44
2,179404030,2020-01-24 00:00:00,74.90,,0.0,52.20,0.0,22.70
3,179404030,2020-01-25 00:00:00,62.73,,0.0,37.27,0.0,25.45
4,179404030,2020-01-26 00:00:00,64.27,,0.0,41.72,0.0,22.56
...,...,...,...,...,...,...,...,...
1660582,2880712108,2021-10-05 00:00:00,44.85,,0.0,22.14,0.0,22.71
1660583,2880712108,2021-10-06 00:00:00,28.89,,0.0,14.94,0.0,13.95
1660584,2880712108,2021-10-07 00:00:00,51.10,,0.0,27.27,0.0,23.84
1660585,2880712108,2021-10-08 00:00:00,17.96,,0.0,7.19,0.0,10.76


In [58]:
elec_day_test["rq"] = pd.to_datetime(elec_day_test["rq"])
elec_day_test

Unnamed: 0,id,rq,kwh,kwh_rap,kwh_pap_r1,kwh_pap_r2,kwh_pap_r3,kwh_pap_r4
0,179404030,2020-01-22,51.77,,0.0,33.47,0.0,18.31
1,179404030,2020-01-23,73.27,,0.0,51.83,0.0,21.44
2,179404030,2020-01-24,74.90,,0.0,52.20,0.0,22.70
3,179404030,2020-01-25,62.73,,0.0,37.27,0.0,25.45
4,179404030,2020-01-26,64.27,,0.0,41.72,0.0,22.56
...,...,...,...,...,...,...,...,...
1660582,2880712108,2021-10-05,44.85,,0.0,22.14,0.0,22.71
1660583,2880712108,2021-10-06,28.89,,0.0,14.94,0.0,13.95
1660584,2880712108,2021-10-07,51.10,,0.0,27.27,0.0,23.84
1660585,2880712108,2021-10-08,17.96,,0.0,7.19,0.0,10.76


In [59]:
for cons_id in tqdm(cons_info_test.index):
    df = elec_day_test[elec_day_test["id"] == cons_id]
    if df.shape[0] == 0:
        continue
    df = df.drop(columns=["id"]).sort_values("rq").set_index("rq")
    df = df.fillna(method="ffill").fillna(method="bfill").fillna(0)
    cons_info_test.loc[cons_id, "kwh_mean"] = df["kwh"].mean()
    cons_info_test.loc[cons_id, "kwh_median"] = df["kwh"].median()
    cons_info_test.loc[cons_id, "kwh_std"] = df["kwh"].std()
    cons_info_test.loc[(cons_info_test.index == cons_id) & (cons_info_test["kwh_mean"] > 0), "kwh_std/mean"] = cons_info_test["kwh_std"] / cons_info_test["kwh_mean"]
    cons_info_test.loc[cons_id, "kwh_entropy"] = entropy(df["kwh"])

    cons_info_test.loc[cons_id, "kwh_pap_r23_distance"] = distance.euclidean(df["kwh_pap_r2"], df["kwh_pap_r3"])
    cons_info_test.loc[cons_id, "kwh_pap_r24_distance"] = distance.euclidean(df["kwh_pap_r2"], df["kwh_pap_r4"])
    cons_info_test.loc[cons_id, "kwh_pap_r34_distance"] = distance.euclidean(df["kwh_pap_r3"], df["kwh_pap_r4"])

    cons_info_test.loc[cons_id, "kwh_pap_r23_cosine"] = cosine(df["kwh_pap_r2"], df["kwh_pap_r3"])
    cons_info_test.loc[cons_id, "kwh_pap_r24_cosine"] = cosine(df["kwh_pap_r2"], df["kwh_pap_r4"])
    cons_info_test.loc[cons_id, "kwh_pap_r34_cosine"] = cosine(df["kwh_pap_r3"], df["kwh_pap_r4"])
    
    cons_info_test.loc[cons_id, "kwh_pap_r1_cosine"] = cosine(df["kwh_pap_r1"], df["kwh"])
    cons_info_test.loc[cons_id, "kwh_pap_r2_cosine"] = cosine(df["kwh_pap_r2"], df["kwh"])
    cons_info_test.loc[cons_id, "kwh_pap_r3_cosine"] = cosine(df["kwh_pap_r3"], df["kwh"])
    cons_info_test.loc[cons_id, "kwh_pap_r4_cosine"] = cosine(df["kwh_pap_r4"], df["kwh"])
    
    cons_info_test.loc[cons_id, "variation_day"] = variation(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    cons_info_test.loc[cons_id, "kwh_max"] = df["kwh"].max()
    cons_info_test.loc[cons_id, "kwh_min"] = df["kwh"].min()
    
    cons_info_test.loc[cons_id, "entropy_day"] = entropy_df(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    cons_info_test.loc[cons_id, "std_day"] = std_df(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    cons_info_test.loc[cons_id, "diff_day"] = diff_median(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    cons_info_test.loc[cons_id, "devide_day"] = devide_median(df[["kwh_pap_r2", "kwh_pap_r3", "kwh_pap_r4"]])
    
    
cons_info_test

100%|██████████████████████████████████████████████████████████████████████████████| 15379/15379 [08:57<00:00, 28.63it/s]


Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,ELEC_TYPE_NAME_学校教学和学生生活用电,...,kwh_pap_r2_cosine,kwh_pap_r3_cosine,kwh_pap_r4_cosine,variation_day,kwh_max,kwh_min,entropy_day,std_day,diff_day,devide_day
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,0.015134,1.000000,0.040248,1.018344,44.74,8.14,1.051547,8.175570,30.470,4.576500e+08
179406030,12.0,12.0,0.0,24,8431,8431,0,0,1,0,...,1.000000,0.000000,1.000000,1.732051,27.12,1.26,0.636514,5.150820,13.980,0.000000e+00
179406094,2.0,2.0,1.0,24,8310,8310,0,0,1,0,...,0.000859,1.000000,0.080452,1.504231,26.22,13.38,0.807662,9.365858,33.140,5.034000e+08
179406097,4.0,4.0,1.0,36,8341,8341,0,0,1,0,...,0.016689,1.000000,0.044281,1.044709,40.02,8.38,1.047268,7.110605,26.410,4.102500e+08
179406099,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,0.009680,1.000000,0.035987,1.016197,39.62,0.10,1.064383,3.716477,11.880,1.726500e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,0.402164,1.000000,0.064085,1.193530,39.04,0.00,0.245352,3.042272,0.000,0.000000e+00
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0.000000,0.000000,0.000000,1.000000,0.00,0.00,0.000000,0.000000,0.000,0.000000e+00
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0.000969,0.001243,0.004624,0.240959,1148.00,472.00,0.991645,60.598379,211.000,3.282523e+00
2851323065,12.0,12.0,0.0,120,780,780,0,0,1,0,...,0.004505,1.000000,0.133719,1.218578,24.94,0.00,0.649498,2.426968,4.065,6.540000e+07


In [60]:
cons_info_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15379 entries, 179406029 to 2852368013
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CONTRACT_CAP                15379 non-null  float64
 1   RUN_CAP                     15379 non-null  float64
 2   SHIFT_NO                    15379 non-null  float64
 3   CHK_CYCLE                   15379 non-null  int64  
 4   live_days                   15379 non-null  int64  
 5   check_days                  15379 non-null  int64  
 6   ELEC_TYPE_NAME_乡村居民生活用电     15379 non-null  uint8  
 7   ELEC_TYPE_NAME_商业用电         15379 non-null  uint8  
 8   ELEC_TYPE_NAME_城镇居民生活用电     15379 non-null  uint8  
 9   ELEC_TYPE_NAME_学校教学和学生生活用电  15379 non-null  uint8  
 10  ELEC_TYPE_NAME_居民生活用电       15379 non-null  uint8  
 11  ELEC_TYPE_NAME_普通工业         15379 non-null  uint8  
 12  ELEC_TYPE_NAME_非居民照明        15379 non-null  uint8  
 13  ELEC_TYPE_NAME_非工业

<h2>重新排列测试集</h2>

In [61]:
cons_info_test = cons_info_test.reindex(columns=cons_info.columns).drop(columns=["IS_FLAG"])
cons_info_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15379 entries, 179406029 to 2852368013
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CONTRACT_CAP                15379 non-null  float64
 1   RUN_CAP                     15379 non-null  float64
 2   SHIFT_NO                    15379 non-null  float64
 3   CHK_CYCLE                   15379 non-null  int64  
 4   live_days                   15379 non-null  int64  
 5   check_days                  15379 non-null  int64  
 6   ELEC_TYPE_NAME_乡村居民生活用电     15379 non-null  uint8  
 7   ELEC_TYPE_NAME_商业用电         15379 non-null  uint8  
 8   ELEC_TYPE_NAME_城镇居民生活用电     15379 non-null  uint8  
 9   ELEC_TYPE_NAME_学校教学和学生生活用电  15379 non-null  uint8  
 10  ELEC_TYPE_NAME_居民生活用电       15379 non-null  uint8  
 11  ELEC_TYPE_NAME_普通工业         15379 non-null  uint8  
 12  ELEC_TYPE_NAME_非居民照明        15379 non-null  uint8  
 13  ELEC_TYPE_NAME_非工业

In [62]:
x_test = ss.transform(cons_info_test.values)
x_test.shape

(15379, 65)

In [63]:
x_test

array([[-3.45704680e-01, -3.53323746e-01,  1.05812375e+00, ...,
         1.37993263e-01,  1.52911187e-01,  9.56777447e-01],
       [-1.80218712e-01, -1.82563158e-01, -8.68114707e-01, ...,
         6.50386425e-03, -3.65129743e-02, -6.71231005e-01],
       [-3.45704680e-01, -3.53323746e-01,  1.05812375e+00, ...,
         1.89736461e-01,  1.83582049e-01,  1.11952493e+00],
       ...,
       [ 1.15361878e+01,  1.19072865e+01, -8.68114707e-01, ...,
         2.41687345e+00,  2.22669796e+00, -6.71230993e-01],
       [-1.80218712e-01, -1.82563158e-01, -8.68114707e-01, ...,
        -1.11905119e-01, -1.50408703e-01, -4.38582140e-01],
       [ 4.89999459e-01,  5.09017223e-01, -8.68114707e-01, ...,
         3.45689846e-03, -2.97030900e-03, -6.71230994e-01]])

<h2>预测数据</h2>

In [64]:
y_test = model.predict(x_test)
cons_info_test["label"] = y_test
cons_info_test

Unnamed: 0_level_0,CONTRACT_CAP,RUN_CAP,SHIFT_NO,CHK_CYCLE,live_days,check_days,ELEC_TYPE_NAME_乡村居民生活用电,ELEC_TYPE_NAME_商业用电,ELEC_TYPE_NAME_城镇居民生活用电,ELEC_TYPE_NAME_学校教学和学生生活用电,...,kwh_pap_r3_cosine,kwh_pap_r4_cosine,variation_day,kwh_max,kwh_min,entropy_day,std_day,diff_day,devide_day,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
179406029,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,1.000000,0.040248,1.018344,44.74,8.14,1.051547,8.175570,30.470,4.576500e+08,0
179406030,12.0,12.0,0.0,24,8431,8431,0,0,1,0,...,0.000000,1.000000,1.732051,27.12,1.26,0.636514,5.150820,13.980,0.000000e+00,0
179406094,2.0,2.0,1.0,24,8310,8310,0,0,1,0,...,1.000000,0.080452,1.504231,26.22,13.38,0.807662,9.365858,33.140,5.034000e+08,0
179406097,4.0,4.0,1.0,36,8341,8341,0,0,1,0,...,1.000000,0.044281,1.044709,40.02,8.38,1.047268,7.110605,26.410,4.102500e+08,0
179406099,2.0,2.0,1.0,24,8431,8431,0,0,1,0,...,1.000000,0.035987,1.016197,39.62,0.10,1.064383,3.716477,11.880,1.726500e+08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849971818,12.0,12.0,0.0,120,783,783,1,0,0,0,...,1.000000,0.064085,1.193530,39.04,0.00,0.245352,3.042272,0.000,0.000000e+00,0
2850017469,120.0,120.0,0.0,120,773,773,1,0,0,0,...,0.000000,0.000000,1.000000,0.00,0.00,0.000000,0.000000,0.000,0.000000e+00,0
2850017472,720.0,720.0,0.0,120,773,773,1,0,0,0,...,0.001243,0.004624,0.240959,1148.00,472.00,0.991645,60.598379,211.000,3.282523e+00,0
2851323065,12.0,12.0,0.0,120,780,780,0,0,1,0,...,1.000000,0.133719,1.218578,24.94,0.00,0.649498,2.426968,4.065,6.540000e+07,0


In [65]:
cons_info_test.index.name = "id"
cons_info_test[["label"]].to_csv("../data/result20220223-temp.csv", index=True)