In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Let a single cell display several values: every top-level expression is echoed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) for all plots
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Silence every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Chinese font support
import matplotlib
# NOTE(review): this backend switch runs after matplotlib.pyplot has been imported and after
# `%matplotlib inline` earlier in this cell — confirm that selecting 'qt4agg' here is intended.
matplotlib.use('qt4agg')
# Set the default font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being rendered as an empty box
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
def merge_new_fea(full, new_fea_path, usecols, merge_on_col, basic_cols=None):
    """Left-join the feature columns of a CSV file onto ``full``.

    Args:
        full: DataFrame to extend. It is not modified; a merged copy is returned.
        new_fea_path: path of the CSV file holding the new features.
        usecols: columns to consider from the file (must contain the merge
            key(s) and no duplicates). A falsy value (``None``/``[]``) means
            "every column in the file".
        merge_on_col: key column name, or a list/tuple of key column names.
        basic_cols: optional whitelist. When non-empty, only feature columns
            that also appear in it are read and merged.

    Returns:
        ``(merged, new_fea_cols)`` where ``merged`` has the same number of rows
        as ``full`` and ``new_fea_cols`` lists the feature columns that were
        added, in the order they appear in the file.

    Raises:
        AssertionError: if ``usecols`` has duplicates or lacks a merge key, if a
            feature column already exists in ``full``, or if the merge changes
            the row count (i.e. the keys are not unique in the feature file).
    """
    # Normalise the key spec FIRST: validating a bare string would otherwise
    # compare its individual characters against the column names.
    if isinstance(merge_on_col, (list, tuple)):
        merge_on_col = list(merge_on_col)
    else:
        merge_on_col = [merge_on_col]

    if usecols:
        assert len(usecols) == len(set(usecols)), 'usecols contains duplicate column names'
        missing_keys = set(merge_on_col) - set(usecols)
        assert len(missing_keys) == 0, 'merge key(s) missing from usecols: %s' % sorted(missing_keys)

    # Peek at a single row just to learn which columns are available.
    header = pd.read_csv(new_fea_path, usecols=usecols if usecols else None, nrows=1)

    # Keep file order so the returned list is reproducible across runs
    # (set arithmetic would make it depend on string hashing).
    new_fea_cols = [col for col in header.columns if col not in merge_on_col]
    clashes = set(new_fea_cols) & set(full.columns)
    assert len(clashes) == 0, 'feature columns already present in full: %s' % sorted(clashes)

    if basic_cols is not None and len(basic_cols) > 0:
        wanted = set(basic_cols)
        new_fea_cols = [col for col in new_fea_cols if col in wanted]
    print(len(new_fea_cols + merge_on_col))
    new_fea = pd.read_csv(new_fea_path, usecols=new_fea_cols + merge_on_col)

    before_len = len(full)
    full = full.merge(new_fea, on=merge_on_col, how='left')
    # A left join only grows when the right side has duplicate keys.
    assert len(full) == before_len, 'merge changed the row count: %d -> %d' % (before_len, len(full))
    print(full.shape)
    return full, new_fea_cols


def get_full(need_cols):
    """Assemble the train+test order frame and left-join every feature file onto it.

    Steps, in order:
      1. read ``./data/train/ord_train.csv`` (dropping commission, confirmdate,
         orderstatus, ordroomnum, price) and the GB2312-encoded test files A and B;
      2. replace ``/`` with ``-`` in the test date columns and parse
         orderdate / arrival / etd as ``%Y-%m-%d %H:%M``;
      3. add one-hot columns for the categorical columns, then label-encode them;
      4. drop every row whose ``orderdate`` month is <= 8 and reset the index;
      5. merge hotel/room info and the feature CSVs (via ``merge_new_fea``);
      6. downcast float64 -> float32 and int64 -> int32 (``orderid`` is left alone).

    Args:
        need_cols: forwarded as ``basic_cols`` to every ``merge_new_fea`` call, i.e.
            the feature names to keep from each feature file. An empty list keeps
            all of them.

    Returns:
        ``(full, test_slice, test_a_slice, test_b_slice)``. The slices address the
        test rows (all / file A / file B) at the tail of the returned frame.
        ``test_slice`` uses non-negative positions; the A and B slices use negative
        positions. They are computed after the month filter, and rely on the
        later left merges not changing the row count (``merge_new_fea`` asserts
        this; the hotelinfo/mroominfo merges are not checked).
    """
    from sklearn.preprocessing import LabelEncoder

    date_cols = ['orderdate', 'arrival', 'etd']

    ord_train = pd.read_csv('./data/train/ord_train.csv')
    ord_train.drop(['commission', 'confirmdate', 'orderstatus', 'ordroomnum', 'price'], axis=1, inplace=True)

    ord_test_a = pd.read_csv('./data/test/ord_testA.csv', encoding="GB2312")
    ord_test_b = pd.read_csv('./data/test/ord_testB.csv', encoding="GB2312")
    ord_test = pd.concat([ord_test_a, ord_test_b])

    # The test files use '/' as the date separator; bring them in line with train.
    for col in date_cols:
        ord_test[col] = ord_test[col].str.replace('/', '-')

    n_train_raw = ord_train.shape[0]
    n_test_a_raw = ord_test_a.shape[0]

    full = pd.concat([ord_train, ord_test])
    del ord_train, ord_test, ord_test_a, ord_test_b

    for col in date_cols:
        full[col] = pd.to_datetime(full[col], format='%Y-%m-%d %H:%M')

    object_cols = ['hotelbelongto', 'isebookinghtl', 'isholdroom', 'supplierchannel']
    # One-hot. Let get_dummies name the columns itself ("<col>_<value>"): it emits
    # them in sorted value order, so labelling them from unique() (appearance
    # order) would attach the wrong value to each indicator column.
    for col in object_cols:
        full = pd.concat([full, pd.get_dummies(full[col], prefix=col)], axis=1)

    # Label-encode the same columns in place.
    for col in object_cols:
        full[col] = LabelEncoder().fit(full[col]).transform(full[col])

    # Drop rows ordered in month <= 8 (rows with a missing orderdate are kept).
    keep_mask = ~(full['orderdate'].dt.month <= 8).values
    full = full[keep_mask]
    full.reset_index(drop=True, inplace=True)

    # Locate the test rows in the FILTERED frame. Train, test A and test B were
    # concatenated in that order and filtering preserves order, so the surviving
    # test rows are the tail of the frame.
    n_test_a = int(keep_mask[n_train_raw:n_train_raw + n_test_a_raw].sum())
    n_test_b = int(keep_mask[n_train_raw + n_test_a_raw:].sum())
    n_test = n_test_a + n_test_b
    n_rows = full.shape[0]

    def _tail_slice(first_from_end, last_from_end):
        # Negative-index slice over trailing rows; "-0" would wrap around to the
        # whole frame, so an empty range is returned explicitly.
        if first_from_end == last_from_end:
            return slice(0, 0)
        return slice(-first_from_end, -last_from_end if last_from_end else None)

    test_slice = slice(n_rows - n_test, n_rows)
    test_a_slice = _tail_slice(n_test, n_test_b)
    test_b_slice = _tail_slice(n_test_b, 0)

    hotel_info = pd.read_csv('./data/train/hotelinfo.csv', usecols=['hotel', 'totalrooms'])
    hotel_info = hotel_info.rename(columns={'totalrooms': 'hotel_totalrooms'})
    full = full.merge(hotel_info, on='hotel', how='left')

    mroom_info = pd.read_csv('./data/train/mroominfo.csv')
    mroom_info = mroom_info.rename(columns={'totalrooms': 'mroom_totalrooms'})
    full = full.merge(mroom_info, on=['masterbasicroomid', 'masterhotelid'], how='left')
    full['masterbasicroomid'] = full['masterbasicroomid'].astype(float)
    print(full.shape)

    full, _ = merge_new_fea(full, './data/train/mhotelinfo.csv', usecols=['masterhotelid', 'glon', 'glat', 'star'],
                            merge_on_col=['masterhotelid'], basic_cols=need_cols)

    zero_good_feas = [
        'hotel_hot','pos_hotel_isholdroom_mean_sub','pos_hotel_orderdate_mon_max','pos_hotel_ordadvanceday_median','pos_hotel_arrival_mon_mean',
        'pos_hotel_isvendor_mean_sub','pos_hotel_isholdroom_mean','pos_hotel_ordadvanceday_max','pos_hotel_supplierchannel_mean_sub',
        'pos_hotel_isebookinghtl_median_sub','pos_hotel_supplierchannel_median_sub','pos_hotel_hotelstar_median_sub','pos_hotel_hotelbelongto_mean_sub',
        'pos_hotel_hotelbelongto_median_sub','pos_hotel_isebookinghtl_mean_sub','pos_hotel_hotelstar_mean_sub','nag_hotel_etd_mon_var',
        'nag_masterhotelid_etd_mon_max','nag_hotel_orderdate_mon_mean','pos_hotel_etd_mon_max','pos_hotel_etd_mon_min','pos_hotel_orderdate_mon_median',
        'nag_masterhotelid_arrival_mon_max','pos_hotel_orderdate_mon_mean_sub','pos_hotel_arrival_mon_max','pos_hotel_etd_mon_mean_sub',
        'pos_hotel_orderdate_mon_median_sub','pos_hotel_orderdate_mon_var','pos_hotel_isholdroom_median','pos_hotel_ordadvanceday_mean',
        'supplierid_his_daysum','hotel_num','pos_hotel_hotelbelongto_median','pos_hotel_hotelbelongto_min','pos_hotel_hotelbelongto_mean',
        'nag_hotel_ordadvanceday_max','nag_hotel_isholdroom_min','pos_hotel_isholdroom_median_sub','pos_hotel_ordadvanceday_var',
        'nag_hotel_arrival_mon_max','nag_hotel_isvendor_var','supplierid_num','nag_hotel_orderdate_mon_var','pos_hotel_isvendor_min',
        'nag_hotel_ordadvanceday_var','nag_masterhotelid_orderdate_mon_median_sub','nag_masterhotelid_etd_mon_var','pos_hotel_isholdroom_var',
        'nag_hotel_isebookinghtl_mean_sub','nag_hotel_supplierchannel_median_sub','nag_hotel_hotelstar_median_sub','nag_hotel_hotelbelongto_median_sub',
        'nag_hotel_isebookinghtl_median_sub','nag_hotel_supplierchannel_mean_sub','nag_hotel_hotelbelongto_mean_sub','nag_hotel_hotelstar_mean_sub',
        'nag_hotel_supplierchannel_max','nag_hotel_supplierchannel_mean','nag_hotel_supplierchannel_median','nag_hotel_supplierchannel_min',
        'nag_hotel_isholdroom_var','nag_masterhotelid_isvendor_mean_sub','nag_hotel_orderdate_mon_mean_sub','pos_hotel_etd_mon_median',
        'nag_hotel_etd_mon_max','pos_hotel_isholdroom_max','supplierid_hot','nag_hotel_etd_mon_median','hotel_his_daysum',
        'nag_masterhotelid_supplierchannel_mean','nag_hotel_arrival_mon_median','pos_supplierid_ordadvanceday_mean','nag_supplierid_ordadvanceday_median',
        'pos_hotel_etd_mon_mean','pos_hotel_arrival_mon_min','nag_hotel_orderdate_mon_median','pos_hotel_arrival_mon_median_sub',
        'pos_hotel_arrival_mon_var','pos_hotel_isholdroom_min','nag_hotel_arrival_mon_mean','pos_hotel_etd_mon_median_sub',
        'pos_masterhotelid_hotelbelongto_var','nag_hotel_isholdroom_max','pos_hotel_supplierchannel_median','pos_hotel_supplierchannel_mean',
        'pos_hotel_supplierchannel_min','pos_hotel_supplierchannel_max','pos_hotel_isvendor_mean','pos_hotel_ordadvanceday_min',
        'pos_hotel_orderdate_mon_min','pos_hotel_isvendor_median_sub','nag_hotel_orderdate_mon_min','pos_hotel_isvendor_median',
        'nag_hotel_arrival_mon_var','nag_hotel_isvendor_max','pos_room_isholdroom_max','pos_masterhotelid_etd_mon_max',
        'nag_masterhotelid_etd_mon_median','nag_masterhotelid_ordadvanceday_max','nag_hotel_orderdate_mon_max','pos_masterhotelid_isvendor_var',
        'pos_masterhotelid_supplierchannel_var','nag_masterhotelid_supplierchannel_mean_sub','nag_hotel_ordadvanceday_median_sub','room_hot',
        'pos_masterhotelid_hotelbelongto_mean','nag_hotel_isholdroom_mean','pos_hotel_arrival_mon_median','nag_hotel_arrival_mon_median_sub',
        'pos_masterhotelid_supplierchannel_mean','nag_masterhotelid_isebookinghtl_mean','nag_masterhotelid_arrival_mon_var','pos_hotel_hotelstar_median',
        'pos_hotel_hotelstar_max','pos_hotel_hotelstar_mean','pos_hotel_hotelstar_min','pos_hotel_etd_mon_var','hotelorder_maxdate_sub',
        'nag_hotel_arrival_mon_min','pos_supplierid_arrival_mon_max','pos_hotel_isvendor_max','nag_masterhotelid_orderdate_mon_mean',
        'nag_masterhotelid_hotelbelongto_var','nag_masterhotelid_ordadvanceday_var','pos_hotel_isvendor_var','pos_hotel_isebookinghtl_median',
        'pos_hotel_isebookinghtl_min','pos_hotel_isebookinghtl_mean','pos_hotel_isebookinghtl_max','nag_masterhotelid_isholdroom_min',
        'pos_supplierid_orderdate_mon_mean','nag_masterhotelid_isebookinghtl_median','pos_hotel_orderdate_mon_mean','nag_masterhotelid_arrival_mon_median',
        'nag_masterhotelid_isvendor_mean','pos_room_isholdroom_median_sub','pos_hotel_ordadvanceday_mean_sub','pos_masterhotelid_ordadvanceday_max',
        'nag_hotel_isvendor_mean','pos_hotel_ordadvanceday_median_sub','pos_supplierid_orderdate_mon_mean_sub','pos_masterhotelid_supplierchannel_median_sub',
        'pos_masterhotelid_isvendor_mean_sub','pos_masterhotelid_orderdate_mon_mean_sub','nag_hotel_isholdroom_mean_sub','pos_masterhotelid_isebookinghtl_mean',
        'nag_supplierid_etd_mon_mean','pos_supplierid_arrival_mon_mean','nag_hotel_isebookinghtl_var','nag_hotel_hotelbelongto_var',
        'nag_hotel_hotelstar_var','nag_hotel_supplierchannel_var','nag_hotel_isvendor_median_sub','nag_supplierid_orderdate_mon_var',
        'nag_masterhotelid_orderdate_mon_mean_sub','pos_masterhotelid_isvendor_max','nag_masterhotelid_hotelbelongto_mean','nag_masterhotelid_arrival_mon_min',
        'pos_supplierid_hotelstar_var','nag_hotel_isebookinghtl_median','nag_hotel_isebookinghtl_min','nag_hotel_isebookinghtl_max',
        'nag_hotel_isebookinghtl_mean','pos_masterhotelid_isebookinghtl_mean_sub','nag_hotel_ordadvanceday_median','masterhotelid_hot',
        'pos_room_isholdroom_mean','nag_masterhotelid_arrival_mon_median_sub','nag_masterhotelid_orderdate_mon_var','nag_masterhotelid_hotelbelongto_mean_sub',
        'pos_city_hotelbelongto_median','pos_masterhotelid_arrival_mon_min','supplieridorder_mindate_sub','pos_city_hotelbelongto_mean',
        'pos_masterhotelid_orderdate_mon_mean','nag_masterhotelid_isholdroom_median_sub','nag_masterbasicroomid_hotelbelongto_mean',
        'nag_supplierid_orderdate_mon_median_sub','pos_room_supplierchannel_mean','pos_room_supplierchannel_median','pos_room_supplierchannel_max',
        'pos_room_supplierchannel_min','nag_hotel_isvendor_median','nag_masterhotelid_hotelstar_max','nag_hotel_arrival_mon_mean_sub',
        'nag_city_isholdroom_var','pos_masterhotelid_etd_mon_min','nag_supplierid_hotelstar_var','pos_masterhotelid_isvendor_median_sub',
        'nag_masterhotelid_hotelstar_mean_sub','pos_countryid_hotelstar_median_sub','nag_supplierid_isvendor_median','pos_masterhotelid_ordadvanceday_mean',
        'pos_supplierid_hotelbelongto_min','nag_supplierid_isholdroom_mean_sub','pos_countryid_hotelbelongto_mean_sub','nag_masterhotelid_etd_mon_min',
        'nag_city_arrival_mon_median_sub','pos_countryid_isvendor_mean_sub','pos_hotel_isebookinghtl_var','pos_hotel_supplierchannel_var',
        'pos_hotel_hotelstar_var','pos_hotel_hotelbelongto_var','pos_masterbasicroomid_isholdroom_median','nag_masterhotelid_ordadvanceday_median_sub',
        'pos_supplierid_ordadvanceday_median','nag_masterbasicroomid_arrival_mon_var','pos_countryid_hotelstar_mean_sub','nag_hotel_etd_mon_median_sub',
        'pos_masterhotelid_isvendor_mean','pos_supplierid_etd_mon_mean','pos_masterhotelid_isvendor_median','pos_masterhotelid_hotelbelongto_mean_sub',
        'nag_masterhotelid_supplierchannel_min','pos_room_orderdate_mon_max','pos_supplierid_hotelstar_mean','nag_supplierid_hotelstar_median',
        'pos_city_hotelstar_mean','pos_room_isholdroom_median','supplierchannel_nag_hot','supplierchannel_pos_hot','nag_masterhotelid_orderdate_mon_median',
        'pos_supplierid_orderdate_mon_median_sub','pos_masterhotelid_etd_mon_var','nag_masterhotelid_isholdroom_median','pos_countryid_isebookinghtl_median_sub',
        'nag_masterhotelid_orderdate_mon_min','nag_city_orderdate_mon_median_sub','pos_masterbasicroomid_isebookinghtl_var',
        'nag_masterbasicroomid_hotelbelongto_median_sub','nag_hotel_hotelbelongto_median','nag_hotel_hotelbelongto_max','nag_hotel_hotelbelongto_min',
        'nag_hotel_hotelbelongto_mean','pos_masterhotelid_ordadvanceday_median','pos_supplierid_isholdroom_mean','pos_masterhotelid_supplierchannel_median',
        'pos_masterhotelid_hotelbelongto_max','nag_masterhotelid_isebookinghtl_mean_sub','pos_room_isebookinghtl_median_sub','pos_room_supplierchannel_mean_sub',
        'pos_room_hotelstar_median_sub','pos_room_isvendor_mean_sub','pos_room_hotelbelongto_mean_sub','pos_room_hotelbelongto_median_sub',
        'pos_room_hotelstar_mean_sub','pos_room_isvendor_median_sub','pos_room_isebookinghtl_mean_sub','pos_room_supplierchannel_median_sub',
        'nag_masterhotelid_hotelbelongto_max','nag_supplierid_isebookinghtl_mean','nag_supplierid_arrival_mon_var','hotelorder_mindate_sub',
        'nag_supplierid_arrival_mon_median','pos_masterhotelid_isebookinghtl_median_sub','nag_city_arrival_mon_max','nag_supplierid_etd_mon_var',
        'pos_room_etd_mon_max','nag_supplierid_orderdate_mon_mean_sub','nag_masterhotelid_ordadvanceday_mean_sub','nag_room_isebookinghtl_median_sub',
        'nag_room_supplierchannel_median_sub','nag_room_hotelstar_mean_sub','nag_room_hotelbelongto_mean_sub','nag_room_hotelstar_median_sub',
        'nag_room_supplierchannel_mean_sub','nag_room_isvendor_median_sub','nag_room_isvendor_mean_sub','nag_room_isebookinghtl_mean_sub',
        'nag_room_hotelbelongto_median_sub','pos_countryid_hotelstar_mean','nag_city_arrival_mon_var','nag_masterhotelid_hotelbelongto_min',
        'pos_countryid_ordadvanceday_median_sub','pos_supplierid_hotelbelongto_var','nag_room_etd_mon_median','pos_masterhotelid_hotelbelongto_median',
        'cityorder_mindate_sub','nag_masterhotelid_isholdroom_mean','pos_masterbasicroomid_etd_mon_var','pos_room_arrival_mon_median_sub',
        'masterhotelid_num','nag_masterhotelid_isholdroom_mean_sub','nag_supplierid_ordadvanceday_var','nag_masterhotelid_hotelbelongto_median_sub',
        'nag_countryid_isholdroom_var','nag_supplierid_ordadvanceday_mean','pos_masterhotelid_isvendor_min','nag_city_orderdate_mon_median',
        'nag_countryid_supplierchannel_mean_sub','pos_masterbasicroomid_ordadvanceday_mean','nag_masterbasicroomid_supplierchannel_median',
        'pos_masterhotelid_supplierchannel_mean_sub','pos_supplierid_ordadvanceday_var','nag_masterhotelid_hotelstar_median_sub',
        'pos_masterhotelid_orderdate_mon_max','pos_supplierid_ordadvanceday_max','pos_masterhotelid_arrival_mon_max','nag_countryid_isebookinghtl_mean_sub',
        'nag_countryid_isholdroom_mean','pos_supplierid_etd_mon_var','pos_masterhotelid_isholdroom_median_sub','pos_city_ordadvanceday_var',
        'pos_masterbasicroomid_isebookinghtl_mean_sub','pos_supplierid_arrival_mon_var','pos_supplierid_isholdroom_var','pos_countryid_hotelbelongto_median_sub',
        'nag_masterbasicroomid_hotelbelongto_max','pos_room_etd_mon_median_sub','nag_masterhotelid_isebookinghtl_median_sub','nag_city_isvendor_median_sub',
        'nag_hotel_ordadvanceday_mean','pos_room_hotelbelongto_median','pos_room_hotelbelongto_min','pos_room_hotelbelongto_max',
        'pos_room_hotelbelongto_mean','pos_supplierid_etd_mon_min','pos_countryid_isebookinghtl_median','nag_supplierid_orderdate_mon_median',
        'pos_masterbasicroomid_arrival_mon_median','pos_city_arrival_mon_mean_sub','pos_countryid_supplierchannel_min','nag_masterhotelid_supplierchannel_max',
        'pos_room_etd_mon_median','nag_masterhotelid_isebookinghtl_max','nag_masterhotelid_hotelstar_median','pos_masterhotelid_hotelbelongto_median_sub',
        'nag_masterhotelid_etd_mon_mean','pos_countryid_isvendor_min','room_num','pos_masterhotelid_isebookinghtl_median',
        'pos_masterhotelid_hotelstar_max','nag_supplierid_supplierchannel_mean','nag_hotel_ordadvanceday_mean_sub','nag_masterhotelid_supplierchannel_median_sub',
        'pos_city_ordadvanceday_median_sub','pos_masterbasicroomid_hotelbelongto_mean_sub','pos_masterbasicroomid_isholdroom_median_sub',
        'nag_city_ordadvanceday_max','nag_masterhotelid_isholdroom_var','pos_city_etd_mon_mean','pos_room_arrival_mon_median',
        'nag_supplierid_isvendor_min','pos_supplierid_etd_mon_median','nag_masterhotelid_ordadvanceday_mean','pos_masterhotelid_hotelbelongto_min',
        'pos_masterhotelid_orderdate_mon_var','pos_masterhotelid_orderdate_mon_median','nag_masterbasicroomid_isebookinghtl_max','nag_city_hotelstar_mean',
        'hotelstar_nag_hot','nag_room_isvendor_median','nag_room_isvendor_min','nag_room_isvendor_mean','nag_room_isvendor_max','zq_num',
        'pos_room_isebookinghtl_min','pos_room_isebookinghtl_max','pos_room_isebookinghtl_mean','pos_room_isebookinghtl_median',
        'pos_supplierid_isvendor_min','pos_masterbasicroomid_etd_mon_median','pos_city_etd_mon_min','pos_masterbasicroomid_ordadvanceday_median_sub',
        'nag_masterbasicroomid_arrival_mon_min','nag_city_isholdroom_max','pos_room_etd_mon_min','nag_countryid_hotelbelongto_mean_sub',
        'pos_supplierid_orderdate_mon_max','pos_room_hotelstar_mean','pos_room_hotelstar_median','pos_room_hotelstar_max','pos_room_hotelstar_min',
        'nag_hotel_orderdate_mon_median_sub','nag_masterhotelid_supplierchannel_var','pos_supplierid_isebookinghtl_mean','nag_masterbasicroomid_hotelstar_max',
        'nag_masterbasicroomid_hotelstar_median_sub','nag_masterhotelid_orderdate_mon_max','pos_masterbasicroomid_isebookinghtl_median_sub',
        'pos_countryid_hotelbelongto_var','nag_countryid_hotelstar_median','pos_supplierid_isholdroom_median','nag_supplierid_hotelbelongto_mean',
    ]
    full, _ = merge_new_fea(full, new_fea_path='./features/feature_zero.csv', usecols=zero_good_feas+['orderid'],
                            merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjy_hotel_feas.csv', usecols=None,
                            merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjy_config_feas.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjy_capacity_feas2.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjy_time_feature.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)
    # Added at this exact point: merge_new_fea asserts that incoming feature
    # names do not collide with columns already in `full`, so moving these two
    # lines would change which files that check applies to.
    full['orderdate_mon'] = full['orderdate'].dt.month
    full['orderdate_day'] = full['orderdate'].dt.day

    full, _ = merge_new_fea(full, new_fea_path='./features/bayes_rate.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    zero2_feas = ['actual_day', 'arr_time_flag', 'cityarrival_maxdate_sub', 'cityarrival_mindate_sub', 'cityetd_maxdate_sub', 'cityetd_mindate_sub',
                  'countryidarrival_maxdate_sub', 'countryidarrival_mindate_sub', 'countryidetd_maxdate_sub', 'countryidetd_mindate_sub',
                  'hotelarrival_maxdate_sub', 'hotelarrival_mindate_sub', 'hoteletd_maxdate_sub', 'hoteletd_mindate_sub', 'is_equal',
                  'masterbasicroomidarrival_maxdate_sub', 'masterbasicroomidarrival_mindate_sub', 'masterbasicroomidetd_maxdate_sub',
                  'masterbasicroomidetd_mindate_sub', 'masterhotelidarrival_maxdate_sub', 'masterhotelidarrival_mindate_sub',
                  'masterhotelidetd_maxdate_sub', 'masterhotelidetd_mindate_sub', 'order_day', 'order_time_flag', 'roomarrival_maxdate_sub',
                  'roomarrival_mindate_sub', 'roometd_maxdate_sub', 'roometd_mindate_sub', 'supplieridarrival_maxdate_sub',
                  'supplieridarrival_mindate_sub', 'supplieridetd_maxdate_sub', 'supplieridetd_mindate_sub', 'totalrooms_x', 'totalrooms_y']
    # NOTE(review): every other feature file lives under './features/'; this one is
    # read from './feature/' — confirm the directory name is intentional.
    full, _ = merge_new_fea(full, new_fea_path='./feature/feature_zero2.csv',
                            usecols=zero2_feas+['orderid'], merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjy_hot_feas.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    zero3_feas = ['his_hotel_confirm-order_day_mean', 'his_hotel_confirm-arrival_second_mean', 'his_hotel_confirm-order_second_mean', 'his_hotel_confirm-arrival_day_mean', 'his_hotel_confirm-arrival_day_median', 'his_hotel_orderstatus_mean', 'his_hotel_confirm-order_second_median', 'his_hotel_confirm-arrival_day_min', 'his_hotel_confirm-arrival_second_median', 'his_hotel_confirm-order_day_median', 'his_hotel_confirm-order_second_var', 'pos_hotel_confirm-arrival_day_mean', 'his_hotel_confirm-order_second_min', 'pos_hotel_confirm-order_second_min', 'pos_hotel_orderstatus_mean', 'pos_hotel_confirm-order_second_mean', 'pos_hotel_confirm-arrival_day_median', 'pos_hotel_confirm-order_second_median', 'pos_hotel_price_median', 'pos_hotel_confirm-arrival_second_mean', 'his_hotel_confirm-arrival_second_var', 'pos_hotel_confirm-order_day_mean', 'pos_hotel_confirm-arrival_second_min', 'pos_hotel_confirm-arrival_second_median', 'pos_hotel_commission/ordroomnum_median', 'pos_hotel_orderstatus_median', 'his_hotel_price_mean', 'his_hotel_orderstatus_median', 'pos_hotel_price_mean', 'his_supplierid_ordroomnum_mean', 'his_supplierid_confirm-order_second_median', 'pos_supplierid_confirm-arrival_second_median', 'pos_hotel_commission_min', 'pos_hotel_price/ordroomnum_median', 'his_hotel_confirm-arrival_second_min', 'his_hotel_orderstatus_max', 'his_supplierid_(price-commission)/ordroomnum_min', 'his_room_orderstatus_mean', 'pos_masterhotelid_(price-commission)/ordroomnum_median', 'pos_hotel_price/ordroomnum_mean', 'his_hotel_confirm-arrival_day_var', 'nag_supplierid_confirm-order_second_var', 'nag_hotel_orderstatus_var', 'nag_supplierid_confirm-arrival_second_var', 'his_supplierid_price/ordroomnum_min', 'pos_hotel_confirm-order_second_var', 'pos_hotel_price_min', 'pos_supplierid_confirm-arrival_day_median', 'his_room_confirm-order_second_min', 'pos_hotel_price_max', 'pos_masterhotelid_price_mean', 'his_supplierid_price/ordroomnum_var', 'nag_supplierid_price_mean',
                  'nag_supplierid_commission/ordroomnum_median', 'pos_hotel_confirm-order_day_min', 'pos_hotel_orderstatus_max', 'his_hotel_confirm-order_day_min', 'his_supplierid_confirm-arrival_second_median', 'his_hotel_commission/ordroomnum_min', 'his_hotel_price_median', 'pos_hotel_commission/ordroomnum_min', 'nag_supplierid_price_median', 'pos_hotel_ordroomnum_max', 'his_room_orderstatus_median', 'nag_supplierid_confirm-order_second_mean', 'pos_hotel_(price-commission)/ordroomnum_min', 'pos_hotel_confirm-order_day_median', 'pos_hotel_confirm-arrival_second_var', 'his_supplierid_orderstatus_mean', 'pos_hotel_(price-commission)/ordroomnum_max', 'pos_hotel_confirm-arrival_day_min', 'his_supplierid_commission_mean', 'his_masterhotelid_price_median', 'nag_hotel_commission_var', 'pos_supplierid_price_var', 'his_supplierid_commission/ordroomnum_var', 'nag_supplierid_commission_var', 'pos_masterhotelid_price/ordroomnum_median', 'his_supplierid_confirm-arrival_day_median', 'his_supplierid_(price-commission)/ordroomnum_var', 'pos_supplierid_confirm-arrival_second_min', 'pos_hotel_price_var', 'pos_hotel_confirm-order_second_max', 'pos_supplierid_orderstatus_mean', 'pos_hotel_commission_var', 'nag_hotel_ordroomnum_min', 'pos_hotel_orderstatus_min', 'pos_supplierid_price_min', 'pos_hotel_confirm-order_day_max', 'nag_hotel_price_var', 'pos_hotel_price/ordroomnum_min', 'his_masterhotelid_(price-commission)/ordroomnum_mean', 'his_room_confirm-order_day_median', 'his_supplierid_confirm-arrival_second_min', 'his_room_confirm-arrival_day_median', 'pos_hotel_price/ordroomnum_var', 'pos_hotel_commission/ordroomnum_max', 'his_supplierid_commission_median', 'nag_supplierid_confirm-order_day_mean', 'pos_supplierid_(price-commission)/ordroomnum_max', 'his_masterhotelid_commission_min', 'his_masterhotelid_price/ordroomnum_median', 'pos_supplierid_confirm-order_second_var', 'nag_hotel_confirm-order_day_var', 'pos_hotel_commission/ordroomnum_mean', 'pos_supplierid_commission/ordroomnum_var',
                  'pos_supplierid_ordroomnum_mean', 'nag_hotel_(price-commission)/ordroomnum_mean', 'nag_supplierid_confirm-arrival_second_min', 'pos_hotel_orderstatus_var', 'pos_supplierid_(price-commission)/ordroomnum_min', 'nag_supplierid_price/ordroomnum_min', 'nag_hotel_price/ordroomnum_min', 'his_room_confirm-arrival_day_mean', 'nag_hotel_commission/ordroomnum_max', 'his_hotel_price/ordroomnum_max', 'his_hotel_(price-commission)/ordroomnum_min', 'his_masterhotelid_commission_mean', 'his_supplierid_price/ordroomnum_max', 'nag_hotel_ordroomnum_mean', 'his_supplierid_commission_var', 'nag_hotel_commission/ordroomnum_var', 'pos_supplierid_commission/ordroomnum_median', 'his_hotel_confirm-order_day_var', 'pos_supplierid_orderstatus_var', 'nag_supplierid_commission_max', 'nag_supplierid_confirm-arrival_day_var', 'pos_hotel_commission/ordroomnum_var', 'nag_hotel_(price-commission)/ordroomnum_median', 'pos_hotel_commission_median', 'his_masterhotelid_confirm-arrival_day_min', 'his_supplierid_price_min', 'nag_hotel_ordroomnum_max', 'pos_hotel_(price-commission)/ordroomnum_median', 'pos_supplierid_commission_median', 'his_supplierid_commission_max', 'his_room_confirm-order_second_median', 'nag_supplierid_ordroomnum_var', 'his_supplierid_confirm-order_day_mean', 'pos_supplierid_price/ordroomnum_min', 'his_supplierid_price_median', 'pos_supplierid_price_mean', 'his_hotel_price_min', 'pos_masterhotelid_price_median', 'pos_supplierid_commission_var', 'his_hotel_price/ordroomnum_median', 'pos_supplierid_ordroomnum_var', 'his_supplierid_confirm-arrival_second_mean', 'nag_hotel_(price-commission)/ordroomnum_max', 'his_hotel_commission_min', 'pos_hotel_ordroomnum_median', 'pos_hotel_commission_max', 'pos_city_price/ordroomnum_max', 'nag_hotel_price_median', 'pos_supplierid_commission/ordroomnum_mean', 'nag_hotel_price_min', 'his_masterhotelid_(price-commission)/ordroomnum_var', 'pos_hotel_confirm-arrival_second_max', 'nag_supplierid_confirm-order_day_var',
                  'nag_supplierid_confirm-arrival_day_mean', 'nag_hotel_(price-commission)/ordroomnum_var', 'pos_hotel_price/ordroomnum_max', 'his_supplierid_confirm-order_second_var', 'his_hotel_(price-commission)/ordroomnum_median', 'pos_supplierid_confirm-order_second_median', 'pos_hotel_ordroomnum_var', 'his_supplierid_commission_min', 'pos_supplierid_price/ordroomnum_max', 'nag_supplierid_confirm-arrival_second_mean', 'pos_supplierid_commission/ordroomnum_max', 'his_supplierid_price_var', 'pos_supplierid_confirm-arrival_day_mean', 'his_hotel_(price-commission)/ordroomnum_max', 'his_masterhotelid_price_mean', 'nag_hotel_confirm-arrival_second_var', 'pos_hotel_ordroomnum_min', 'nag_hotel_price_max', 'pos_masterhotelid_price/ordroomnum_var', 'nag_hotel_orderstatus_median', 'nag_masterhotelid_price/ordroomnum_median', 'his_supplierid_(price-commission)/ordroomnum_mean', 'his_hotel_price_max', 'nag_hotel_(price-commission)/ordroomnum_min', 'his_supplierid_confirm-order_day_median', 'nag_supplierid_(price-commission)/ordroomnum_max', 'pos_hotel_commission_mean', 'his_supplierid_price/ordroomnum_mean', 'pos_hotel_(price-commission)/ordroomnum_mean', 'pos_masterhotelid_commission_var', 'nag_hotel_confirm-arrival_day_var', 'his_supplierid_ordroomnum_var', 'nag_hotel_price/ordroomnum_max', 'his_hotel_orderstatus_var', 'pos_supplierid_confirm-order_day_mean', 'nag_hotel_ordroomnum_median', 'his_supplierid_confirm-arrival_day_mean', 'his_hotel_price/ordroomnum_mean', 'his_supplierid_ordroomnum_max', 'nag_city_(price-commission)/ordroomnum_median', 'his_hotel_commission_max', 'pos_masterhotelid_(price-commission)/ordroomnum_var', 'nag_city_confirm-arrival_day_min', 'his_hotel_confirm-arrival_day_max', 'his_supplierid_orderstatus_max', 'pos_hotel_ordroomnum_mean', 'nag_supplierid_ordroomnum_mean', 'nag_supplierid_commission_median', 'his_masterhotelid_commission/ordroomnum_var', 'pos_supplierid_commission_max', 'his_supplierid_commission/ordroomnum_median',
                  'pos_masterhotelid_commission/ordroomnum_var', 'his_hotel_confirm-order_second_max', 'nag_masterhotelid_commission_mean', 'nag_supplierid_price_var', 'nag_masterhotelid_price/ordroomnum_max', 'nag_supplierid_price/ordroomnum_max', 'pos_masterhotelid_price_max', 'pos_masterhotelid_confirm-arrival_day_min', 'pos_hotel_confirm-arrival_day_max', 'pos_hotel_(price-commission)/ordroomnum_var', 'his_room_confirm-order_day_mean', 'nag_hotel_confirm-order_day_mean', 'pos_supplierid_confirm-order_second_mean', 'nag_hotel_commission_max', 'nag_masterhotelid_(price-commission)/ordroomnum_median', 'pos_masterhotelid_commission/ordroomnum_min', 'pos_supplierid_confirm-arrival_day_min', 'his_masterhotelid_(price-commission)/ordroomnum_median', 'pos_room_orderstatus_max', 'nag_hotel_price/ordroomnum_mean', 'pos_supplierid_price/ordroomnum_var', 'pos_countryid_confirm-arrival_second_mean', 'nag_hotel_price/ordroomnum_var', 'his_masterhotelid_commission/ordroomnum_median', 'nag_supplierid_price_max', 'pos_supplierid_confirm-arrival_second_mean', 'his_masterhotelid_price/ordroomnum_mean', 'his_masterhotelid_price/ordroomnum_var', 'pos_supplierid_(price-commission)/ordroomnum_median', 'nag_city_commission/ordroomnum_median', 'his_room_confirm-order_day_min', 'his_hotel_commission_var', 'pos_masterbasicroomid_confirm-arrival_day_mean', 'pos_supplierid_confirm-arrival_second_var', 'nag_hotel_commission/ordroomnum_mean', 'his_city_commission/ordroomnum_max', 'his_supplierid_commission/ordroomnum_max', 'his_supplierid_price_max', 'nag_hotel_confirm-arrival_second_max', 'pos_countryid_confirm-order_second_min', 'pos_room_confirm-order_second_min', 'nag_masterhotelid_price/ordroomnum_mean', 'his_hotel_confirm-order_day_max', 'his_masterhotelid_commission/ordroomnum_min', 'pos_room_ordroomnum_min', 'his_supplierid_confirm-order_second_mean', 'pos_masterhotelid_commission/ordroomnum_mean', 'pos_masterhotelid_(price-commission)/ordroomnum_min', 'pos_hotel_confirm-order_day_var',
                  'pos_masterhotelid_price/ordroomnum_min', 'pos_masterhotelid_price_var', 'pos_masterhotelid_confirm-arrival_day_var', 'pos_supplierid_price_median', 'nag_masterhotelid_commission/ordroomnum_max', 'his_supplierid_price_mean', 'pos_masterhotelid_price/ordroomnum_mean', 'nag_hotel_confirm-order_day_min', 'nag_supplierid_orderstatus_var', 'nag_city_commission_var', 'nag_hotel_confirm-order_second_mean', 'pos_supplierid_confirm-order_second_min', 'his_supplierid_confirm-order_day_var', 'nag_masterhotelid_confirm-arrival_day_median', 'nag_supplierid_commission/ordroomnum_min', 'his_masterbasicroomid_price_var', 'pos_masterhotelid_commission_mean', 'his_hotel_(price-commission)/ordroomnum_var', 'pos_supplierid_confirm-order_day_var', 'pos_hotel_confirm-arrival_day_var', 'nag_hotel_confirm-order_day_max', 'his_masterhotelid_price/ordroomnum_max', 'his_supplierid_confirm-order_second_min', 'pos_supplierid_confirm-arrival_day_var', 'his_city_(price-commission)/ordroomnum_mean', 'nag_masterhotelid_price_mean', 'nag_countryid_commission_min', 'pos_city_ordroomnum_mean', 'nag_hotel_price/ordroomnum_median', 'his_masterhotelid_commission/ordroomnum_mean', 'his_supplierid_confirm-arrival_second_var', 'his_masterhotelid_(price-commission)/ordroomnum_min', 'his_masterhotelid_confirm-arrival_second_var', 'pos_supplierid_ordroomnum_max', 'pos_supplierid_(price-commission)/ordroomnum_var', 'nag_supplierid_commission/ordroomnum_mean', 'nag_hotel_ordroomnum_var', 'pos_supplierid_price/ordroomnum_median', 'his_masterhotelid_price/ordroomnum_min', 'his_hotel_commission_median', 'nag_supplierid_commission/ordroomnum_max', 'nag_masterhotelid_(price-commission)/ordroomnum_max', 'nag_hotel_commission_median', 'nag_masterhotelid_commission/ordroomnum_var', 'pos_supplierid_commission_min', 'his_room_ordroomnum_min', 'nag_hotel_confirm-order_second_median', 'pos_countryid_commission_min', 'nag_hotel_confirm-arrival_day_max', 'his_hotel_(price-commission)/ordroomnum_mean',
                  'pos_supplierid_orderstatus_max', 'pos_masterhotelid_price_min', 'nag_masterhotelid_commission_min', 'nag_supplierid_(price-commission)/ordroomnum_min', 'pos_masterhotelid_commission_median', 'nag_hotel_confirm-order_day_median', 'nag_hotel_price_mean', 'his_supplierid_confirm-arrival_day_max', 'his_room_confirm-arrival_second_median', 'his_masterbasicroomid_confirm-order_day_mean', 'his_city_commission/ordroomnum_var', 'nag_masterhotelid_commission_median', 'nag_masterhotelid_price_min', 'nag_city_confirm-order_second_min', 'nag_hotel_commission_mean', 'pos_masterhotelid_commission/ordroomnum_median', 'pos_city_commission/ordroomnum_max', 'his_supplierid_commission/ordroomnum_mean', 'nag_hotel_confirm-arrival_day_min', 'his_countryid_commission_var', 'nag_supplierid_ordroomnum_max', 'his_hotel_price/ordroomnum_min', 'his_hotel_commission/ordroomnum_var', 'pos_countryid_price/ordroomnum_var', 'his_room_confirm-order_second_mean', 'pos_room_orderstatus_median', 'nag_supplierid_confirm-order_day_min', 'his_masterhotelid_commission_median', 'pos_city_price_median', 'his_supplierid_confirm-arrival_day_min', 'nag_hotel_commission/ordroomnum_min', 'his_supplierid_orderstatus_var', 'pos_city_(price-commission)/ordroomnum_max', 'his_countryid_(price-commission)/ordroomnum_var']
    full, _ = merge_new_fea(full, new_fea_path='./features/feature_zero3.csv',
                            usecols=['orderid'] + zero3_feas, merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/feature_zero4.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    full, _ = merge_new_fea(full, new_fea_path='./features/cjf_features.csv',
                            usecols=None, merge_on_col=['orderid'], basic_cols=need_cols)

    # Downcast one column at a time so only a single column is duplicated in
    # memory at any moment.
    for col in full.dtypes[(full.dtypes == np.float64)].index:
        full[col] = full[col].astype(np.float32)
    gc.collect()

    for col in full.dtypes[(full.dtypes == np.int64)].index:
        if col != 'orderid':
            full[col] = full[col].astype(np.int32)
    gc.collect()

    return full, test_slice, test_a_slice, test_b_slice


# full = get_full([])

In [13]:
# Selected via cross-validation (CV)
basic_cols = ['orderid', 'nag_masterhotelid_ordadvanceday_mean_sub', 'orderdate_min', 'mroom_totalrooms', 'nag_masterhotelid_ordadvanceday_median_sub', 'pos_hotel_ordadvanceday_mean_sub', 'ordadvanceday', 'arrival_day', 'hotel_totalrooms', 'pos_masterbasicroomid_ordadvanceday_median_sub', 'hotelorder_maxdate_sub', 'masterbasicroomid', 'supplierid_crt_capacity', 'city_crt_capacity', 'room', 'pos_masterhotelid_supplierchannel_var', 'hotelorder_mindate_sub', 'orderdate_hour', 'pos_countryid_ordadvanceday_median_sub', 'pos_city_arrival_mon_mean_sub', 'masterhotelid', 'zq_num', 'pos_hotel_ordadvanceday_var', 'pos_masterhotelid_etd_mon_var', 'pos_hotel_ordadvanceday_median_sub', 'ord_cnt_in_30day_orderdate', 'room_hot', 'pos_hotel_ordadvanceday_max', 'pos_masterhotelid_ordadvanceday_mean', 'mhotel_include_masterbasicroomid_cnt', 'hotel_hot', 'pos_city_ordadvanceday_median_sub', 'pos_masterhotelid_orderdate_mon_mean_sub', 'glon', 'in_city_condi_supplierid', 'nag_city_hotelstar_mean', 'pos_hotel_orderdate_mon_var', 'etd_day', 'nag_masterhotelid_orderdate_mon_var', 'pos_masterhotelid_orderdate_mon_var', 'ord_cnt_in_60day_orderdate', 'pos_hotel_etd_mon_var', 'nag_hotel_ordadvanceday_median_sub', 'countryid_crt_capacity', 'masterhotelid_crt_capacity', 'in_city_condi_star', 'city_capacity_dist', 'pos_masterhotelid_supplierchannel_mean', 'nag_masterhotelid_ordadvanceday_mean', 'pos_masterhotelid_isebookinghtl_mean', 'nag_city_arrival_mon_var', 'hotel', 'arrival_dow', 'supplierid_capacity_dist', 'pos_masterhotelid_hotelbelongto_var', 'ord_cnt_in_1day_orderdate', 'pos_masterhotelid_hotelbelongto_mean_sub', 'pos_hotel_arrival_mon_var', 'in_city_condi_masterhotelid', 'nag_hotel_ordadvanceday_mean_sub', 'pos_masterhotelid_hotelbelongto_mean', 'glat', 'room_num', 'nag_masterhotelid_etd_mon_var', 'city', 'pos_hotel_etd_mon_mean_sub', 'pos_masterhotelid_ordadvanceday_max', 'in_countryid_condi_star', 'pos_city_hotelbelongto_mean', 'etd_dow']
# Load the pickled watch scores. The context manager closes the file handle,
# which the previous bare open() call left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./features/watch_scores.pk', 'rb') as watch_score_file:
    watch_score = pickle.load(watch_score_file)
len(basic_cols)
len(watch_score)

# Selected using the last 4 days as the validation set
# basic_cols = ['orderdate_min', 'zone_ordadvanceday_bayes_rate_month', 'orderdate_hour', 'ordadvanceday_masterhotelid_bayes_rate_full', 'hotel_isholdroom_bayes_rate_month', 'ordadvanceday_masterbasicroomid_bayes_rate_full', 'orderid', 'city_ordadvanceday_bayes_rate_month', 'ordadvanceday_hotelstar_bayes_rate_month', 'ordadvanceday_supplierid_bayes_rate_full', 'nag_masterhotelid_ordadvanceday_mean_sub', 'zone_ordadvanceday_bayes_rate_full', 'ordadvanceday_supplierid_bayes_rate_month', 'countryid_supplierid_bayes_rate_month', 'city_ordadvanceday_bayes_rate_full', 'city_supplierid_bayes_rate_month', 'ordadvanceday_masterhotelid_bayes_rate_month', 'countryid_ordadvanceday_bayes_rate_month', 'ordadvanceday_isvendor_bayes_rate_month', 'isholdroom_supplierid_bayes_rate_month', 'zone_masterhotelid_bayes_rate_month', 'countryid_ordadvanceday_bayes_rate_full', 'masterbasicroomid_isvendor_bayes_rate_full', 'hotel_masterbasicroomid_bayes_rate_month', 'nag_masterhotelid_ordadvanceday_median_sub', 'city_masterbasicroomid_bayes_rate_full', 'masterbasicroomid_isvendor_bayes_rate_month', 'hotel_totalrooms', 'pos_hotel_ordadvanceday_mean_sub', 'ordadvanceday_isvendor_bayes_rate_full', 'ordadvanceday_hotelbelongto_bayes_rate_full', 'hotel_masterbasicroomid_bayes_rate_full', 'city_supplierid_bayes_rate_full', 'masterbasicroomid_supplierchannel_bayes_rate_month', 'masterbasicroomid_hotelbelongto_bayes_rate_full', 'city_masterbasicroomid_bayes_rate_month', 'ord_cnt_in_180day_etd', 'ord_cnt_in_1day_etd', 'isholdroom_masterhotelid_bayes_rate_month', 'room_masterhotelid_bayes_rate_full', 'masterbasicroomid_isebookinghtl_bayes_rate_month', 'zq_num', 'room', 'isholdroom_ordadvanceday_bayes_rate_month', 'nag_masterhotelid_orderdate_mon_var', 'city_crt_capacity', 'in_city_condi_masterhotelid', 'masterbasicroomid_isebookinghtl_bayes_rate_full', 'pos_countryid_ordadvanceday_median_sub', 'hotel_isholdroom_bayes_rate_full', 'hotel_room_bayes_rate_month', 'pos_hotel_ordadvanceday_median_sub', 
'mhotel_include_masterbasicroomid_cnt', 'ordadvanceday_supplierchannel_bayes_rate_month', 'room_isholdroom_bayes_rate_full', 'ordadvanceday_isebookinghtl_bayes_rate_month', 'supplierid_crt_capacity', 'nag_masterhotelid_ordadvanceday_var', 'masterhotelid_isebookinghtl_bayes_rate_full', 'ordadvanceday_hotelstar_bayes_rate_full']
# watch_score = pickle.load(open('./data/watch_scores2.pk', 'rb'))

# need_cols = basic_cols + [item[0] for item in watch_score[:100]]
# print(len(need_cols))
# full, test_slice, test_a_slice, test_b_slice = get_full(need_cols)
# print(full.shape)

70

1998

In [5]:
# Load the list of selected feature names. The context manager closes the
# file handle, which the previous bare open() call left open.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./features/good_feas_top_1k_col_names.pk', 'rb') as feas_rank_file:
    fnl_feas_rank = pickle.load(feas_rank_file)
len(fnl_feas_rank)
# Label column and the key columns that are read alongside the features.
target=['noroom']
by=['orderid', 'room', 'arrival']

# Feature table (selected features + key columns), label table and order dates.
df = pd.read_csv('./features/good_feas_top_1k.csv', usecols=list(set(fnl_feas_rank+by)))
df_target = pd.read_csv('./features/good_feas_top_1k_noroom.csv')
df_date = pd.read_csv('./features/feature_zero.csv', usecols=['orderid', 'orderdate'])
df.shape

1001

(331873, 1001)

In [6]:
# Left-join the label table and then the order dates onto the feature table,
# both keyed on orderid.
df = (
    df
    .merge(df_target, on=['orderid'], how='left')
    .merge(df_date, on=['orderid'], how='left')
)

In [6]:
import xgboost as xgb
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score

In [7]:
# The last rows of df are split off positionally: 11035 rows for `testa`
# followed by 11036 rows for `testb`; everything before them is `train`.
n_testa_rows, n_testb_rows = 11035, 11036
n_test_rows = n_testa_rows + n_testb_rows

train = df.iloc[:-n_test_rows, :]
testa = df.iloc[-n_test_rows:-n_testb_rows, :]
testb = df.iloc[-n_testb_rows:, :]


train.shape
testa.shape

(309802, 1003)

(11035, 1003)

In [8]:
def evalerror(preds, dtrain):
    """Custom xgboost evaluation function: negated partial PR-curve area.

    Accumulates a step-wise area under the precision-recall curve for recall
    between 0.05 and 0.5 and returns it negated, so that a smaller value is
    better for the caller.

    Parameters
    ----------
    preds : array-like of predicted scores (compared against 0.05 and passed
        to ``precision_recall_curve``).
    dtrain : object exposing ``get_label()`` that returns the true labels.

    Returns
    -------
    tuple
        ``('prc', -area)``.
    """
    y_true = dtrain.get_label()
    # Precision used for the first segment of the window.
    # NOTE(review): this thresholds the *predicted score* at 0.05, not the
    # recall - confirm that this is the intended starting precision.
    start_precision = precision_score(y_true, (preds >= 0.05).astype(int))

    pr_precision, pr_recall, _ = precision_recall_curve(y_true, preds)
    # Walk the curve in the opposite order to the one returned by sklearn.
    pr_recall = pr_recall[::-1]
    pr_precision = pr_precision[::-1]

    total = 0
    entered_window = False
    for pos, rec in enumerate(pr_recall):
        if rec >= 0.5:
            # Close the window at recall 0.5 using the previous curve point.
            total += pr_precision[pos - 1] * (0.5 - pr_recall[pos - 1])
            break
        if not (rec > 0.05):
            # Still below the start of the window.
            continue
        if entered_window:
            total += pr_precision[pos - 1] * (rec - pr_recall[pos - 1])
        else:
            # First point past recall 0.05: segment starts at 0.05 itself.
            entered_window = True
            total += start_precision * (rec - 0.05)
    return 'prc', -total

In [25]:
%%time
# 40 90
# Settings for the 3-fold cross-validation run below.
# NOTE(review): 'verbose_eval' and 'missing' are normally arguments of
# xgb.train/xgb.cv and xgb.DMatrix rather than booster parameters - confirm
# they have the intended effect when passed through this dict.
params = dict(
    objective='binary:logistic',
    eta=0.1,
    colsample_bytree=0.7,
    min_child_weight=5,
    max_depth=7,
    max_delta_step=3,
    subsample=0.7,
    reg_alpha=0.05,
    reg_lambda=0.05,
    verbose_eval=True,
    seed=2018,
    missing=-1,
)
# Training matrix restricted to the first 80 entries of fnl_feas_rank.
xgbtrain = xgb.DMatrix(train[fnl_feas_rank[:80]], train[target])

cv = xgb.cv(
    params,
    dtrain=xgbtrain,
    num_boost_round=20000,
    nfold=3,
    feval=evalerror,
    verbose_eval=100,
    early_stopping_rounds=10,
    seed=2018,
)
# Row count of the CV result and the last row's mean test metric.
print(':'.join([str(cv.shape[0]), str(cv.iloc[-1, :]['test-prc-mean'])]))

[0]	train-error:0.0408957+0.000324781	train-prc:-0.251864+0.00548726	test-error:0.0430143+0.000612819	test-prc:-0.219414+0.00520646
[100]	train-error:0.0359147+0.000340844	train-prc:-0.359677+0.00573437	test-error:0.0406037+0.000850319	test-prc:-0.269409+0.00395651


KeyboardInterrupt: 

In [10]:
# train
# Parse the order timestamps and derive day-of-month, then flag rows for the
# fit split (day <= 10) and the validation split (10 < day <= 14).
df['orderdate'] = pd.to_datetime(df['orderdate'], format='%Y-%m-%d %H:%M')
df['orderdate_day'] = df['orderdate'].map(lambda stamp: stamp.day)
train_flag = (df['orderdate_day'] <= 10).values
val_flag = ((df['orderdate_day'] > 10) & (df['orderdate_day'] <= 14)).values
print(train_flag.sum(), val_flag.sum())

216575 93227


In [18]:
%%time
# 40 90
# Settings for the hold-out run (fit on train_flag rows, early-stop on
# val_flag rows).
# NOTE(review): 'verbose_eval' and 'missing' are normally arguments of
# xgb.train and xgb.DMatrix rather than booster parameters - confirm they have
# the intended effect when passed through this dict.
params = {
    'objective': 'binary:logistic',
    'eta': 0.07,
    'subsample': 0.886,
    'colsample_bytree': 0.5,
    'min_child_weight': 5,
    'max_depth': 5,
    'reg_alpha': 0.05,
    'reg_lambda': 0.05,
    'verbose_eval': True,
    'seed': 2018,
    'missing': -1
}
# Build both matrices here so the cell runs on a fresh kernel. With these two
# lines commented out, the cell silently reused whatever xgbtrain/xgbval
# another cell had left behind.
xgbtrain = xgb.DMatrix(df[train_flag][fnl_feas_rank[:80]], df[train_flag][target])
xgbval = xgb.DMatrix(df[val_flag][fnl_feas_rank[:80]], df[val_flag][target])

# 'val' is listed last in evals, so it is the set used for early stopping.
model = xgb.train(params, xgbtrain, num_boost_round=20000, early_stopping_rounds=100, evals=[(xgbtrain, 'train'),(xgbval, 'val')],feval=evalerror, verbose_eval=100)
_ = gc.collect()

[0]	train-error:0.04199	val-error:0.044837	train-prc:-0.185527	val-prc:-0.193815
Multiple eval metrics have been passed: 'val-prc' will be used for early stopping.

Will train until val-prc hasn't improved in 100 rounds.
[100]	train-error:0.039266	val-error:0.042981	train-prc:-0.284392	val-prc:-0.254405
[200]	train-error:0.037807	val-error:0.042885	train-prc:-0.316254	val-prc:-0.256891
Stopping. Best iteration:
[175]	train-error:0.038222	val-error:0.042981	train-prc:-0.308039	val-prc:-0.257438

Wall time: 1min 25s


In [None]:
[0]	train-error:0.0312973+0.000527394	train-prc:-0.327072+0.0158372	test-error:0.0438217+0.000637141	test-prc:-0.214824+0.00257921
[100]	train-error:0.010723+0.00103828	train-prc:-0.449883+6.13967e-05	test-error:0.0398997+0.000625867	test-prc:-0.281186+0.002374
65:-0.284415666667

In [13]:
# Fit on all training rows with the first 300 entries of fnl_feas_rank and
# score test set A.
# NOTE(review): `params` and `cv` come from earlier cells - confirm they are
# the ones intended for this 300-feature model.
xgbtrain = xgb.DMatrix(train[fnl_feas_rank[:300]], train[target])
# The test matrix is built without a label: train[target] was passed here
# before, which has a different number of rows than testa and is not its label.
xgbtest = xgb.DMatrix(testa[fnl_feas_rank[:300]])
model = xgb.train(params, xgbtrain, num_boost_round=int(cv.shape[0]*1.2))
pred = model.predict(xgbtest)

In [19]:
# Assemble the test-A submission: key columns plus the predicted score, with
# `arrival` round-tripped through datetime so it is written as YYYY/MM/DD.
submit = testa[['orderid', 'room', 'arrival']].copy()
submit['noroom'] = pred
submit['arrival'] = (
    pd.to_datetime(
        submit['arrival'].apply(lambda day: day.replace('/', '-')),
        format='%Y-%m-%d',
    )
    .dt.strftime('%Y/%m/%d')
    .values
)
# Keep only the first 11035 rows before writing.
submit = submit.iloc[:11035, :]
submit.to_csv('./submit/xgb_zero_300.csv', index=None)

In [20]:
submit.shape
submit.head()

(11035, 4)

Unnamed: 0,orderid,room,arrival,noroom
309802,4505896718,71340604,2017/10/11,0.029826
309803,4504466510,70578904,2017/09/19,0.007547
309804,4505791849,91096440,2017/09/16,0.012802
309805,4505284465,20487676,2017/09/17,0.011303
309806,4506807614,88163053,2017/09/17,0.001012


## 50+50 bagging (included)

In [26]:
import random
# Settings shared by every bag trained in this cell.
# NOTE(review): 'verbose_eval' and 'missing' are normally arguments of
# xgb.train and xgb.DMatrix rather than booster parameters - confirm they have
# the intended effect when passed through this dict.
params = dict(
    objective='binary:logistic',
    eta=0.07,
    subsample=0.886,
    colsample_bytree=0.5,
    min_child_weight=5,
    max_depth=5,
    reg_alpha=0.05,
    reg_lambda=0.05,
    verbose_eval=True,
    seed=2018,
    missing=-1,
)

for i in tqdm_notebook(range(9, 10)):
    # First 80 ranked features plus the i-th group of 20 taken from rank 100 on.
    predictors = fnl_feas_rank[:80] + fnl_feas_rank[100+i*20:100+(i+1)*20]
    assert(len(predictors) == 100)

    # Hold-out run: early stopping on the validation rows picks the round count.
    xgbtrain = xgb.DMatrix(df[train_flag][predictors], df[train_flag][target])
    xgbval = xgb.DMatrix(df[val_flag][predictors], df[val_flag][target])
    model = xgb.train(
        params,
        xgbtrain,
        num_boost_round=20000,
        early_stopping_rounds=100,
        evals=[(xgbtrain, 'train'), (xgbval, 'val')],
        feval=evalerror,
        verbose_eval=None,
    )
    print('%s->%s:%s' % (i, model.best_iteration, model.best_score))

    # Refit on every training row for 1.1x the best iteration found above.
    xgbtrain = xgb.DMatrix(train[predictors], train[target])
    model = xgb.train(params, xgbtrain, num_boost_round=int(model.best_iteration*1.1))

    # Score test A and test B in turn and write one submission file for each.
    file_tag = str((i+1)*50)
    for part, out_path in (
        (testa, './submit3/xgb_zero_' + file_tag + '.csv'),
        (testb, './submit3/b/xgb_zero_b_' + file_tag + '.csv'),
    ):
        xgbtest_part = xgb.DMatrix(part[predictors])
        pred = model.predict(xgbtest_part)
        submit = part[['orderid', 'room', 'arrival']].copy()
        submit['noroom'] = pred
        submit['arrival'] = (
            pd.to_datetime(
                submit['arrival'].apply(lambda day: day.replace('/', '-')),
                format='%Y-%m-%d',
            )
            .dt.strftime('%Y/%m/%d')
            .values
        )
        submit.to_csv(out_path, index=None)

    # Drop the matrices before the next bag to release their memory.
    del xgbtrain, xgbval, xgbtest_part
    _ = gc.collect()

A Jupyter Widget




Exception in thread Thread-8:
Traceback (most recent call last):
  File "C:\Users\ZERO\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\ZERO\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\ZERO\Anaconda3\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



0->248:-0.257497
1->227:-0.257364
2->247:-0.246615
3->284:-0.25621
4->236:-0.255557
5->243:-0.257063
6->265:-0.255441
7->179:-0.255998
8->138:-0.255262
9->240:-0.252842
10->223:-0.254393
11->312:-0.257455
12->260:-0.257065
13->252:-0.255035



KeyboardInterrupt: 

In [None]:
# 4 9 6 7 10
0->69:-0.278026666667
1->71:-0.282779333333
2->62:-0.283352333333
3->66:-0.284033333333
# 4->61:-0.285838333333
5->65:-0.284415666667
6->58:-0.285665333333
7->59:-0.285458333333
8->65:-0.284797333333
# 9->71:-0.285315
10->53:-0.285293333333
11->69:-0.284094333333
12->67:-0.284152333333
13->61:-0.283492333333
14->54:-0.284521333333
15->54:-0.284003
16->62:-0.284301333333

In [44]:
# Submission skeleton for test set A: key columns only, with `arrival`
# round-tripped through datetime so it is formatted as YYYY/MM/DD.
submit = testa[['orderid', 'room', 'arrival']].copy()
submit['arrival'] = (
    pd.to_datetime(
        submit['arrival'].apply(lambda day: day.replace('/', '-')),
        format='%Y-%m-%d',
    )
    .dt.strftime('%Y/%m/%d')
    .values
)

In [45]:
import os
# Average the predictions of the selected bags into submit['noroom'].
submit['noroom'] = 0
# Bag indices to blend, converted to the numeric suffix expected in the file
# name: (index + 1) * 50.
# NOTE(review): the bagging loop in this notebook writes its files under
# ./submit3/, while this cell reads ./submit/ - confirm the directory.
best5 = np.array([4, 9, 6, 7, 10])
best5 = list((best5 + 1) * 50)
ep = 0
# sorted(): os.listdir returns names in arbitrary order, which would make the
# noroom<k> column numbering differ between runs.
for file in sorted(os.listdir('./submit/')):
    # The bag number is the text between the last '_' and the last '.'.
    try:
        num = int(file[file.rfind('_')+1:file.rfind('.')])
    except ValueError:
        # Not a numbered bag file (e.g. the blended output itself).
        num = -99
    if num not in best5:
        continue
    # Read only the files that are actually blended.
    tmp = pd.read_csv('./submit/'+file)
    tmp = tmp.rename(index=str, columns={'noroom':'noroom'+str(ep)})
    submit = submit.merge(tmp, on=['orderid', 'room', 'arrival'], how='left')
    submit['noroom'] = submit['noroom'] + submit['noroom'+str(ep)]
    ep += 1
if ep == 0:
    # Dividing by zero would silently write an all-NaN submission.
    raise ValueError('no file in ./submit/ matches the selected bags: %s' % best5)
submit['noroom'] = submit['noroom']/ep
submit.to_csv('./submit/xgb_zero_bagging_mixbest5.csv', index=None)

In [46]:
submit.head()

Unnamed: 0,orderid,room,arrival,noroom,noroom0,noroom1,noroom2,noroom3,noroom4
0,4505896718,71340604,2017/10/11,0.020495,0.022783,0.022371,0.023358,0.012838,0.021123
1,4504466510,70578904,2017/09/19,0.009778,0.008421,0.0081,0.010658,0.008826,0.012885
2,4505791849,91096440,2017/09/16,0.01608,0.013259,0.020739,0.017229,0.008727,0.020445
3,4505284465,20487676,2017/09/17,0.01062,0.00982,0.012803,0.012748,0.00771,0.01002
4,4506807614,88163053,2017/09/17,0.001065,0.00105,0.00114,0.001047,0.000599,0.001491


## 20+20 bagging (not included)

In [7]:
import random
# Settings shared by every bag trained in this cell.
# NOTE(review): 'verbose_eval' and 'missing' are normally arguments of
# xgb.train/xgb.cv and xgb.DMatrix rather than booster parameters - confirm
# they have the intended effect when passed through this dict.
params = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'colsample_bytree': 0.886,
    'min_child_weight': 0.1,
    'max_depth': 15,
    'subsample': 0.886,
    'gamma': 0.1,
    'lambda': 10,
    'verbose_eval': True,
    'seed': 2018,
    'missing': -1
}

for i in tqdm_notebook(range(27, 45)):
    # 80 features sampled from the top 100, plus the i-th group of 20 taken
    # from rank 100 on. The generator is seeded per bag so that each bag's
    # sample is reproducible, whatever sub-range of bags the loop is run for.
    bag_rng = random.Random(params['seed'] + i)
    predictors = bag_rng.sample(fnl_feas_rank[:100], 80)
    predictors += fnl_feas_rank[100+i*20:100+(i+1)*20]
    assert(len(predictors) == 100)
    xgbtrain = xgb.DMatrix(train[predictors], train[target])

    # 3-fold CV with early stopping; the result's row count sets the final
    # number of boosting rounds.
    cv = xgb.cv(
        params,
        dtrain=xgbtrain,
        num_boost_round=20000,
        nfold=3,
        feval=evalerror,
        verbose_eval=None,
        early_stopping_rounds=50,
        seed=2018)
    print(str(i) + '->' + str(cv.shape[0]) + ':' + str(cv.iloc[-1, :]['test-prc-mean']))

    model = xgb.train(params, xgbtrain, num_boost_round=int(cv.shape[0]*1.2))

    # Score test A and test B in turn and write one submission file for each.
    for part, out_path in (
        (testa, './submit2/xgb_zero_' + str(i) +'.csv'),
        (testb, './submit2/b/xgb_zero_' + str(i) +'.csv'),
    ):
        # Test matrices are built without a label: train[target] was passed
        # here before, which has a different number of rows than the test
        # frames and is not their label.
        xgbtest_part = xgb.DMatrix(part[predictors])
        pred = model.predict(xgbtest_part)
        submit = part[['orderid', 'room', 'arrival']].copy()
        submit['noroom'] = pred
        # Round-trip `arrival` through datetime so it is written as YYYY/MM/DD.
        submit['arrival'] = submit['arrival'].apply(lambda x:x.replace('/', '-'))
        submit['arrival'] = pd.to_datetime(submit['arrival'], format='%Y-%m-%d')
        submit['arrival'] = submit['arrival'].dt.strftime('%Y/%m/%d').values
        submit.to_csv(out_path, index=None)

    # Drop the matrices before the next bag to release their memory.
    del xgbtrain
    del xgbtest_part
    _ = gc.collect()

A Jupyter Widget

27->69:-0.282221333333
28->70:-0.283793333333
29->78:-0.281714333333
30->69:-0.282586333333
31->76:-0.283981
32->62:-0.283175666667
33->73:-0.283037
34->65:-0.284102666667
35->74:-0.283601666667
36->71:-0.282691
37->81:-0.281868
38->66:-0.283131666667
39->75:-0.283749
40->79:-0.283370333333
41->67:-0.281284333333
42->80:-0.283337
43->81:-0.282945
44->70:-0.283956



In [25]:
submit.head()

Unnamed: 0,orderid,room,arrival,noroom
320837,4506801369,78296910,2017/09/18,0.026815
320838,4505927189,59788102,2017/10/25,0.002086
320839,4504658752,8536026,2017/09/29,0.008889
320840,4507622862,30835114,2017/10/17,0.001104
320841,4505949633,6133625,2017/09/15,0.152522


In [None]:
0->72:-0.282768333333
1->71:-0.282342333333
2->72:-0.282818666667
# 3->74:-0.284074333333
4->71:-0.283793
5->70:-0.280703333333
6->75:-0.282697666667
7->79:-0.284014333333
8->72:-0.280882666667
9->72:-0.283534
10->77:-0.283573
11->66:-0.281700666667
12->75:-0.282912666667
13->65:-0.283534666667
14->76:-0.283037333333
15->81:-0.282668333333
16->80:-0.283283
17->65:-0.281809666667
18->69:-0.281431
19->70:-0.283387333333
20->68:-0.283017666667
21->67:-0.282831
22->78:-0.283064666667
# 23->71:-0.284327333333
24->68:-0.283595333333
25->72:-0.281869
26->72:-0.282669333333
27->69:-0.282221333333
28->70:-0.283793333333
29->78:-0.281714333333
30->69:-0.282586333333
31->76:-0.283981
32->62:-0.283175666667
33->73:-0.283037
# 34->65:-0.284102666667
35->74:-0.283601666667
36->71:-0.282691
37->81:-0.281868
38->66:-0.283131666667
39->75:-0.283749
40->79:-0.283370333333
41->67:-0.281284333333
42->80:-0.283337
43->81:-0.282945
44->70:-0.283956

In [38]:
# Start a fresh test-A frame holding only the key columns; `arrival` is parsed
# and re-formatted so that it reads YYYY/MM/DD.
submit = testa[['orderid', 'room', 'arrival']].copy()
arrival_as_dates = pd.to_datetime(
    submit['arrival'].apply(lambda text: text.replace('/', '-')),
    format='%Y-%m-%d',
)
submit['arrival'] = arrival_as_dates.dt.strftime('%Y/%m/%d').values

In [35]:
tmp.head()

Unnamed: 0,orderid,room,arrival,noroom8
0,4505896718,71340604,2017/10/11,0.06065
1,4504466510,70578904,2017/09/19,0.007462
2,4505791849,91096440,2017/09/16,0.021648
3,4505284465,20487676,2017/09/17,0.012541
4,4506807614,88163053,2017/09/17,0.000443


In [39]:
import os
# Average every 'xgb_zero' prediction file found in ./good/ into
# submit['noroom'] and write the blend back to the same directory.
submit['noroom'] = 0
# `best` and `total` are not used in this cell; kept in case other cells
# read them.
best = np.array([3, 23, 34])
total = list(range(45))
ep = 0
blend_name = 'xgb_zero_bagging.csv'
# sorted(): os.listdir returns names in arbitrary order, which would make the
# noroom<k> column numbering differ between runs.
for file in sorted(os.listdir('./good/')):
    # Skip this cell's own output: its name also contains 'xgb_zero', so a
    # re-run would otherwise average the previous blend into the new one.
    if 'xgb_zero' not in file or file == blend_name:
        continue
    tmp = pd.read_csv('./good/'+file)
    tmp = tmp.rename(index=str, columns={'noroom':'noroom'+str(ep)})
    submit = submit.merge(tmp, on=['orderid', 'room', 'arrival'], how='left')
    submit['noroom'] = submit['noroom'] + submit['noroom'+str(ep)]
    ep += 1
if ep == 0:
    # Dividing by zero would silently write an all-NaN submission.
    raise ValueError("no 'xgb_zero' prediction files found in ./good/")
submit['noroom'] = submit['noroom']/ep
print(ep)
submit[['orderid', 'room', 'arrival', 'noroom']].to_csv('./good/'+blend_name, index=None)

9


In [40]:
submit.head()

Unnamed: 0,orderid,room,arrival,noroom,noroom0,noroom1,noroom2,noroom3,noroom4,noroom5,noroom6,noroom7,noroom8
0,4505896718,71340604,2017/10/11,0.05695,0.041865,0.077954,0.049571,0.083317,0.046762,0.05771,0.043655,0.051064,0.06065
1,4504466510,70578904,2017/09/19,0.009885,0.010073,0.013876,0.009158,0.00767,0.009829,0.010224,0.010355,0.010322,0.007462
2,4505791849,91096440,2017/09/16,0.02877,0.041624,0.021385,0.019237,0.020565,0.021333,0.048699,0.018249,0.046191,0.021648
3,4505284465,20487676,2017/09/17,0.016694,0.01504,0.024888,0.01972,0.023388,0.013021,0.013533,0.012072,0.016048,0.012541
4,4506807614,88163053,2017/09/17,0.000857,0.001456,0.000957,0.000642,0.000455,0.001175,0.000973,0.000433,0.001177,0.000443
