In [3]:
import numpy as np
import pandas as pd
# import seaborn as sns
import os  
import sys 
import gc
import pickle
import matplotlib.pyplot as plt
# import lightgbm as lgb
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score

%matplotlib inline

In [4]:
def read_df_float32(file_path, read_cols=[], sep=',', nrows=None, exclude_cols=[], skip_nrows=None,
                    header='infer', verbose=False):
    """Read a delimited text file, loading 64-bit numeric columns as 32-bit.

    The file is probed with a one-row read to discover which columns pandas
    infers as ``float64`` / ``int64``; those columns are then requested as
    ``float32`` / ``int32`` in the real read.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``.
    read_cols : list-like, optional
        Columns to load. Mutually exclusive with ``exclude_cols``.
    sep : str
        Field delimiter.
    nrows : int or None
        Maximum number of rows to load in the real read.
    exclude_cols : list-like, optional
        Columns to leave out. Mutually exclusive with ``read_cols``.
    skip_nrows : passed through to ``pd.read_csv(skiprows=...)`` for the real
        read only (the dtype probe does not skip rows).
    header : passed through to ``pd.read_csv(header=...)``.
    verbose : bool
        Print the path before reading.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    AssertionError
        If both ``read_cols`` and ``exclude_cols`` are non-empty.

    Notes
    -----
    Dtypes are decided from the first data row only. NOTE(review): an integer
    column whose later values are missing or do not fit in int32 will not load
    cleanly with the forced ``int32`` dtype -- verify on id-like columns.
    """
    # Work on private, concrete lists: the defaults are shared mutable objects
    # and callers may hand in one-shot iterables.
    read_cols = list(read_cols) if read_cols is not None else []
    exclude_cols = list(exclude_cols) if exclude_cols is not None else []
    assert not (read_cols and exclude_cols), 'pass read_cols or exclude_cols, not both'

    if verbose:
        print('reading', file_path)

    # One-row probe. `header` is forwarded so the probed column labels are the
    # same labels the real read will produce (otherwise the dtype map would be
    # keyed by names that do not exist when header=None).
    sample = pd.read_csv(file_path, nrows=1, sep=sep, usecols=read_cols or None, header=header)

    read_type_dict = {}
    for col, dtype in sample.dtypes.items():
        if dtype == np.float64:
            read_type_dict[col] = np.float32
        elif dtype == np.int64:
            read_type_dict[col] = np.int32

    if read_cols:
        usecols = read_cols
    elif exclude_cols:
        # Must be a real list: a lazy `filter` object is exhausted while pandas
        # validates `usecols`, which loses the column selection.
        usecols = [col for col in sample.columns if col not in exclude_cols]
        # Only request dtypes for columns that are actually loaded.
        read_type_dict = {col: dtype for col, dtype in read_type_dict.items() if col in usecols}
    else:
        usecols = None

    return pd.read_csv(file_path, sep=sep, usecols=usecols, nrows=nrows, skiprows=skip_nrows,
                       dtype=read_type_dict, header=header)

def save_float32(data, save_path):
    """Downcast 64-bit numeric columns of ``data`` in place, then write it as CSV.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
    only when every value fits in the int32 range, so wide identifiers are
    left untouched instead of being wrapped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to save. It is modified in place by the downcast.
    save_path : path or buffer accepted by ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    for col in data.select_dtypes(include=[np.float64]).columns:
        data[col] = data[col].astype(np.float32)

    # The original loop selected int32 columns and cast them to int32 (a no-op);
    # the 64-bit columns are the ones that need narrowing.
    int32_range = np.iinfo(np.int32)
    for col in data.select_dtypes(include=[np.int64]).columns:
        if data[col].between(int32_range.min, int32_range.max).all():
            data[col] = data[col].astype(np.int32)

    data.to_csv(save_path, index=False)
    
def add_data_index_col(data):
    """Return a copy of ``data`` whose former index is a ``data_index`` column.

    The index is reset to a fresh 0..n-1 range and the old index values are
    kept as the first column. Only a column literally named ``'index'`` (what
    ``reset_index`` produces for an unnamed index) is renamed to
    ``'data_index'``; a named index keeps its own name.

    The input frame is left untouched. The previous implementation reset the
    caller's index in place and then returned a renamed copy, leaving the
    caller's frame with a stray ``index`` column.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    return data.reset_index(drop=False).rename(columns={'index': 'data_index'})

def get_fea_imp(bst):
    """List ``(feature_name, importance)`` pairs, most important first.

    ``bst`` must expose ``feature_name()`` and ``feature_importance()``
    returning equally ordered sequences. Features with equal importance keep
    their original relative order.
    """
    names = bst.feature_name()
    scores = bst.feature_importance()
    ranked = [(name, score) for name, score in zip(names, scores)]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def evalerror(preds, dtrain):
    """Custom evaluation metric: stepwise area under the precision-recall curve
    for recall between 0.05 and 0.5.

    Written to be passed as ``feval`` to ``lgb.train``.

    Parameters
    ----------
    preds : array of predicted scores (compared against 0.05 and passed to
        ``precision_recall_curve``).
    dtrain : object exposing ``get_label()`` that returns the true labels.

    Returns
    -------
    tuple ``('prc', area, True)`` -- metric name, metric value, and a flag
    that LightGBM's ``feval`` convention reads as "higher is better".

    NOTE(review): ``p_start`` is the precision obtained by thresholding the
    *predicted score* at 0.05, whereas the 0.05 used inside the loop is a
    *recall* bound -- confirm this mix is what the target metric specifies.
    NOTE(review): if recall jumps from <= 0.05 straight to >= 0.5 between two
    consecutive curve points, the closing rectangle starts at
    ``recall[idx-1]`` rather than at 0.05 -- confirm this cannot matter.
    """
    labels = dtrain.get_label()
    # Precision when every prediction with score >= 0.05 is called positive.
    p_start = precision_score(labels, (preds>=0.05).astype(int))

    precision, recall, thresholds = precision_recall_curve(labels, preds)
    # Reverse both arrays so the loop below walks the curve in the opposite
    # order to the one sklearn returns (the branches assume recall grows).
    recall, precision = recall[::-1], precision[::-1]
    area = 0
    had_find_r_start = False
    #     P of the previous point
    for idx, item in enumerate(zip(recall, precision)):
        r, p = item[0], item[1]
        if r >= 0.5:
            # Close the area at recall 0.5 using the previous point's precision.
            area += precision[idx-1] * (0.5 - recall[idx-1])
            return 'prc', area, True
        if r > 0.05:
            if not had_find_r_start:
                # First point past recall 0.05: rectangle from 0.05 to r at height p_start.
                had_find_r_start = True
                area += p_start * (r - 0.05)
            else:
                # Rectangle from the previous recall to r at the previous point's precision.
                area += precision[idx-1] * (r - recall[idx-1])
    # Reached only when no curve point has recall >= 0.5.
    return 'prc', area, True

In [5]:
# Class counts recorded by the author (presumably the `noroom` label -- confirm):
# 0    3817353
# 1     205733
# +/all = 0.051138106418803876
# Load the training orders and discard five columns in the same expression;
# the test orders loaded later in this notebook do not list them.
ord_train = pd.read_csv('./data/train/ord_train.csv').drop(
    columns=['commission', 'confirmdate', 'orderstatus', 'ordroomnum', 'price'])
print(ord_train.shape)
print(ord_train.isnull().sum())

(4023086, 20)
orderid                   0
orderdate                 0
city                      0
countryid                 0
hotel                     0
zone                 779471
room                      0
isholdroom                0
arrival                   0
etd                       0
ordadvanceday             0
noroom                    0
masterbasicroomid      3037
masterhotelid             0
supplierid               16
isvendor                  0
hotelbelongto             0
isebookinghtl             0
hotelstar                 0
supplierchannel           0
dtype: int64


In [6]:
# Preview the first rows of the training orders (after the column drop above).
ord_train.head()

Unnamed: 0,orderid,orderdate,city,countryid,hotel,zone,room,isholdroom,arrival,etd,ordadvanceday,noroom,masterbasicroomid,masterhotelid,supplierid,isvendor,hotelbelongto,isebookinghtl,hotelstar,supplierchannel
0,3678052984,2017-05-15 10:03:34,1777,111,6459206,3302.0,62786796,F,2017-06-23,2017-06-25,40,0,21087625.0,713478,1897.0,0,SHT,T,5,合作
1,3623712471,2017-05-02 21:25:37,622,4,4515821,,51183791,F,2017-05-05,2017-05-06,3,0,30110898.0,1210465,2322.0,1,SHT,T,4,合作
2,3935430746,2017-06-20 15:03:39,366,30,3403530,,18594989,F,2017-07-01,2017-07-04,12,0,33727477.0,741340,3686.0,1,SHT,F,5,合作
3,4095204178,2017-07-19 23:45:12,274,42,4984070,705.0,72423477,T,2017-08-22,2017-08-24,34,0,52394065.0,5033399,2299.0,0,HTL,T,5,直签
4,3753007353,2017-05-26 16:21:36,723,108,3796626,340.0,55656893,F,2017-05-27,2017-05-28,1,0,6132999.0,2144683,2391.0,0,SHT,T,5,合作


In [7]:
# The test orders are read with an explicit GB2312 encoding (the training CSV
# is read with the default encoding).
ord_test = pd.read_csv('./data/test/ord_testA.csv', encoding = "GB2312")
print(ord_test.shape)
print(ord_test.isnull().sum())

# Switch '/' date separators to '-' in the three date columns so they use the
# same separator as the training rows. The vectorised string accessor replaces
# three copy-pasted per-row lambdas and leaves missing values as NaN instead
# of raising.
for date_col in ['arrival', 'orderdate', 'etd']:
    ord_test[date_col] = ord_test[date_col].str.replace('/', '-')

(11035, 19)
orderid                 0
orderdate               0
city                    0
countryid               0
hotel                   0
zone                 2504
room                    0
isholdroom              0
arrival                 0
etd                     0
ordadvanceday           0
masterbasicroomid       0
masterhotelid           0
supplierid              0
isvendor                0
hotelbelongto           0
isebookinghtl           0
hotelstar               0
supplierchannel         0
dtype: int64


In [8]:
# Stack training and test orders into one frame so feature engineering is
# applied to both at once. The test frame has no `noroom` column, so that
# column is NaN for the test rows.
full = pd.concat([ord_train, ord_test])
# Free the two source frames; after this the cell cannot be re-run without
# re-running the two loading cells first.
del (ord_train, ord_test)
print(full.shape)

date_cols = ['orderdate', 'arrival', 'etd']

# NOTE(review): the training rows previewed earlier carry seconds in
# `orderdate` and are date-only in `arrival`/`etd`, neither of which literally
# matches '%Y-%m-%d %H:%M'. This presumably relied on lenient ISO-8601 parsing
# in the pandas version originally used -- confirm before upgrading pandas.
for date_col in date_cols:
    full[date_col] = pd.to_datetime(full[date_col], format='%Y-%m-%d %H:%M')

# Calendar features via the vectorised `.dt` accessor (no per-row Python call).
# Months are added before days to keep the original column order.
for date_col in date_cols:
    full[date_col + '_mon'] = full[date_col].dt.month
for date_col in date_cols:
    full[date_col + '_day'] = full[date_col].dt.day

(4034121, 20)


In [7]:
# # masterbasicroomid has missing values; fill them with the most frequent (mode) masterbasicroomid of the same masterhotelid
# # nan_key_masterhotelid = full[full['masterbasicroomid'].isnull()]['masterhotelid'].unique()
# mroom_info = pd.read_csv('../data/train/mroominfo.csv')
# tmp = mroom_info[mroom_info['masterhotelid'].isin(nan_key_masterhotelid)].groupby('masterhotelid').apply(
#     lambda x:x['masterbasicroomid'].value_counts().index[0])
# full['masterbasicroomid'] = full[['masterhotelid', 'masterbasicroomid']].apply(
#     lambda x: tmp[x['masterhotelid']] if np.isnan(x['masterbasicroomid']) else x, axis=1)

In [6]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that are replaced by integer codes.
object_cols = ['hotelbelongto', 'isebookinghtl', 'isholdroom', 'supplierchannel']

for col in object_cols:
    encoder = LabelEncoder().fit(full[col])
    # Print the class -> code mapping so the encoding can be read back later.
    print(encoder.classes_, encoder.transform(encoder.classes_))
    full[col] = encoder.transform(full[col])

['HPP' 'HTL' 'PKG' 'SHT'] [0 1 2 3]
['F' 'T'] [0 1]
['F' 'T'] [0 1]
['Agoda' 'BOOKING' 'Expedia' '合作' '直签' '集团直连'] [0 1 2 3 4 5]


In [None]:
#  'orderid',
    
# 'arrival',
#  'etd',
 
#  'city',
#  'countryid',
#  'zone'
    
#  'hotel',
#  'hotelbelongto',
#  'hotelstar',
#  'isebookinghtl',
#  'isholdroom',
#  'isvendor',
#  'masterbasicroomid',
#  'masterhotelid',
#   'room',
#  'supplierchannel',
#  'supplierid',
# + 'totalrooms'(hotelinfo), 'glon', 'glat', 'star', 'totalrooms'(mroominfo),

    
#  'ordadvanceday',
#  'orderdate',

In [7]:
# Orders placed before August go to `hist`; `full` keeps August onwards.
order_month = full['orderdate_mon']
hist, full = full[order_month < 8], full[order_month >= 8]
print(hist.shape, full.shape)

(3055389, 26) (978732, 26)


### Feature

In [8]:
# Row masks over `full`, as plain boolean arrays:
#   train - orders placed in August
#   val   - orders placed on 1-14 September
#   test  - orders placed after 14 September
# (Earlier variant kept for reference: train on September days < 14 only.)
order_month = full['orderdate_mon']
order_day = full['orderdate_day']
in_september = order_month == 9

train_flag = (order_month == 8).values
val_flag = (in_september & (order_day <= 14)).values
test_flag = (in_september & (order_day > 14)).values
print(train_flag.sum(), val_flag.sum(), test_flag.sum())

657895 309802 11035


In [9]:
# Attach each hotel's room count, renamed to `hotel_totalrooms` so the source
# table is visible in the column name.
new_fea = (
    pd.read_csv('../data/train/hotelinfo.csv', usecols=['hotel', 'totalrooms'])
    .rename(columns={'totalrooms': 'hotel_totalrooms'})
)
full = pd.merge(full, new_fea, how='left', on='hotel')
print(full.shape)

(978732, 27)


In [10]:
# Attach the master-hotel columns `glon`, `glat` and `star`, keyed by `masterhotelid`.
mhotel_cols = ['masterhotelid', 'glon', 'glat', 'star']
new_fea = pd.read_csv('../data/train/mhotelinfo.csv', usecols=mhotel_cols)
full = pd.merge(full, new_fea, how='left', on='masterhotelid')
print(full.shape)

(978732, 30)


In [12]:
# # masterbasicroomid has missing values, so this merge is skipped for now
# new_fea = pd.read_csv('../data/train/mroominfo.csv', usecols=['masterbasicroomid', 'totalrooms'])
# new_fea.rename({'totalrooms':'mroom_totalrooms'}, axis=1, inplace=True)
# full = full.merge(new_fea, on='masterbasicroomid', how='left')
# print(full.shape)

(321229, 36)


### Train

In [None]:
def train_lgb(train_data, val_data=None, num_boost_round=1000):
    """Train a binary LightGBM booster scored with the custom ``prc`` metric.

    Requires ``lightgbm`` to be importable as ``lgb`` and ``evalerror`` to be
    defined in the notebook namespace.

    Parameters
    ----------
    train_data : lgb.Dataset
        Training set. It is also registered as an evaluation set named
        ``'train'``.
    val_data : lgb.Dataset or None
        Optional validation set, registered as ``'val'``. Must have the same
        number of columns as ``train_data``.
    num_boost_round : int
        Upper bound on boosting rounds; training stops earlier after 25 rounds
        without improvement.

    Returns
    -------
    The booster returned by ``lgb.train``.

    Raises
    ------
    AssertionError
        If ``val_data`` has a different number of columns than ``train_data``.
    """
    print('train shape:', train_data.data.shape)
    has_val = val_data is not None
    if has_val:
        assert train_data.data.shape[1] == val_data.data.shape[1]
        print('val shape:', val_data.data.shape)

    cv_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        # The string 'None' disables built-in metrics so that only `feval` is
        # reported. The previous value, a set containing the function object,
        # was stringified into the parameter string and is not a valid metric.
        'metric': 'None',
        'is_training_metric': False,
        'min_data_in_leaf': 12,
        'num_leaves': 32,
        'learning_rate': 0.07,
        'feature_fraction': 0.9,
        # NOTE(review): per the LightGBM docs bagging only takes effect when
        # `bagging_freq` is also set to a non-zero value -- confirm intent.
        'bagging_fraction': 0.9,
        }

    # A single call covers both cases; only the evaluation sets differ.
    valid_sets = [train_data]
    valid_names = ['train']
    if has_val:
        valid_sets.append(val_data)
        valid_names.append('val')

    bst = lgb.train(cv_params, train_data, early_stopping_rounds=25, verbose_eval=200,
                    valid_sets=valid_sets, valid_names=valid_names,
                    num_boost_round=num_boost_round, feval=evalerror)
    return bst

In [11]:
# Model input columns. The order is kept exactly as originally written.
use_cols = [
    # columns taken from the order table
    'city', 'countryid', 'hotel', 'hotelbelongto', 'hotelstar',
    'isebookinghtl', 'isholdroom', 'isvendor',
    'masterbasicroomid', 'masterhotelid',
    'ordadvanceday', 'orderid', 'room',
    'supplierchannel', 'supplierid', 'zone',
    # month / day parts of the three date columns
    'orderdate_mon', 'arrival_mon', 'etd_mon',
    'orderdate_day', 'arrival_day', 'etd_day',
    # columns merged in from hotelinfo / mhotelinfo
    'hotel_totalrooms', 'glon', 'glat', 'star',
]
len(use_cols)

26

In [12]:
# Build the LightGBM datasets with a single `.loc` selection each, which
# avoids materialising an intermediate row-filtered copy of every column.
train_data = lgb.Dataset(full.loc[train_flag, use_cols], full.loc[train_flag, 'noroom'])
val_data = lgb.Dataset(full.loc[val_flag, use_cols], full.loc[val_flag, 'noroom'])
print('train shape:', train_data.data.shape)
print('val shape:', val_data.data.shape)
cv_params =  {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        # The string 'None' disables built-in metrics so that only `feval`
        # is reported. The previous value, a set containing the function
        # object, is not a valid metric specification.
        'metric': 'None',
        'is_training_metric': False,
        'min_data_in_leaf': 30,
        'num_leaves': 150,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        # NOTE(review): per the LightGBM docs bagging only takes effect when
        # `bagging_freq` is also set to a non-zero value -- confirm intent.
        'bagging_fraction': 0.6,
        }
bst = lgb.train(cv_params, train_data, early_stopping_rounds=25, verbose_eval=50,
                        valid_sets=[train_data, val_data], valid_names=['train', 'val'],
                        num_boost_round=1000, feval=evalerror)

train shape: (657895, 26)
val shape: (309802, 26)
Training until validation scores don't improve for 25 rounds.
[50]	train's prc: 0.255529	val's prc: 0.204403
[100]	train's prc: 0.278415	val's prc: 0.20939
[150]	train's prc: 0.291795	val's prc: 0.210059
Early stopping, best iteration is:
[139]	train's prc: 0.288854	val's prc: 0.210351


In [None]:
# 150:  [509]	train's prc: 0.34196	val's prc: 0.250873
# [139]	train's prc: 0.288854	val's prc: 0.210351

### Predict

In [13]:
# Score the test rows with the early-stopped (best) iteration. Pinning
# `num_iteration` makes this explicit; older LightGBM releases otherwise
# predict with every trained round, including those after the best one.
pred = bst.predict(full.loc[test_flag, use_cols], num_iteration=bst.best_iteration)

In [14]:
# Take an explicit copy of the test rows so the assignments below write to an
# independent frame instead of a slice of `full` (avoids SettingWithCopyWarning).
submit = full.loc[test_flag, ['orderid', 'room', 'arrival']].copy()
submit['noroom'] = pred
# Write arrival dates back out with '/' separators.
submit['arrival'] = submit['arrival'].dt.strftime('%Y/%m/%d')
submit.head()

Unnamed: 0,orderid,room,arrival,noroom
967697,4505896718,71340604,2017/10/11,0.04026
967698,4504466510,70578904,2017/09/19,0.023085
967699,4505791849,91096440,2017/09/16,0.056268
967700,4505284465,20487676,2017/09/17,0.021613
967701,4506807614,88163053,2017/09/17,0.005694


In [15]:
# Write the submission file without the DataFrame index.
submit.to_csv('../submit/lgb_1m_train_bl_9v.csv', index=False)

In [16]:
# Sanity check: the row count should equal the number of test orders.
submit.shape

(11035, 4)

In [None]:
lgb_1m_train_bl: online: 0.25348411, local:200 [92] train's prc: 0.372813	val's prc: 0.254069
lgb_7m_train_bl: online: 0.25786676, local:150 [509] train's prc: 0.34196	val's prc: 0.250873
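With the 14th as the validation split, the local and online score trends do not agree.

Try using the September orders as the validation set (val):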
lgb_7m_train_bl_9v: online:0.23570004, local:[615]	train's prc: 0.308787	val's prc: 0.216412, 
lgb_1m_train_bl_9v: online:0.23682006, local: [139]	train's prc: 0.288854	val's prc: 0.210351

