In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.decomposition import PCA
import datetime,time

In [2]:
def preprocess_x(x_data,is_train=True):
    """Add per-booking revenue columns and total the revenue per arrival date.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Booking rows. The code reads ``arrival_date_year``,
        ``arrival_date_month``, ``arrival_date_day_of_month``,
        ``stays_in_week_nights``, ``stays_in_weekend_nights`` and ``adr``;
        with ``is_train=True`` it also reads ``is_canceled``,
        ``deposit_type`` and ``reservation_status_date``.
    is_train : bool, default True
        When True, rows that are cancelled with deposit type ``'No Deposit'``
        are removed, ``date_difference`` is added, and the remaining cancelled
        bookings are valued at a single ``adr``.

    Returns
    -------
    x : numpy.ndarray
        Shape ``(number of distinct arrival dates, 1)``: the summed
        ``booking_total_revenue`` per ``arrival_date``, in the sorted key
        order produced by ``groupby``.
    x_data : pandas.DataFrame
        The booking frame with ``arrival_date``, ``stays_total_nights``,
        ``booking_total_revenue`` (and ``date_difference`` for training data)
        added.

    Notes
    -----
    The frame passed in is modified in place (rows dropped, columns added and
    rewritten); the returned ``x_data`` is that same object.
    """
    if is_train:
        # Drop bookings that were cancelled and had no deposit.
        cancelled_without_deposit = (x_data['is_canceled']==1)&(x_data['deposit_type']=='No Deposit')
        x_data.drop(x_data[cancelled_without_deposit].index,inplace=True)

    x_data = combine_arrival_date(x_data)

    if is_train:
        # Days between arrival and the final reservation status date, plus one.
        x_data['date_difference'] = (
            pd.to_datetime(x_data['reservation_status_date'])
            - pd.to_datetime(x_data['arrival_date'])
            + datetime.timedelta(days=1)
        )

    x_data['stays_total_nights'] = x_data['stays_in_week_nights'] + x_data['stays_in_weekend_nights']

    full_stay_revenue = (x_data['stays_total_nights']+1)*x_data['adr']
    if is_train:
        # Bookings that are still cancelled at this point count for one adr only.
        x_data['booking_total_revenue'] = np.where(x_data['is_canceled']==0,full_stay_revenue,x_data['adr'])
    else:
        x_data['booking_total_revenue'] = full_stay_revenue

    # Aggregate only the column that is needed: summing the whole frame is
    # wasted work and, with numeric_only=False (the pandas >= 2.0 default),
    # also tries to add up the text columns.
    daily_revenue = x_data.groupby('arrival_date')['booking_total_revenue'].sum()
    x = daily_revenue.to_frame().values
    return x,x_data
def preprocess_y(y_data):
    """Flatten the label table into a 1-D array.

    Parameters
    ----------
    y_data : pandas.DataFrame
        Must contain an ``arrival_date`` column, which becomes the index; all
        remaining values are flattened.

    Returns
    -------
    numpy.ndarray
        1-D array of the remaining values, in row order.
    """
    y = y_data.set_index('arrival_date').values
    # -1 lets NumPy infer the length, so the function is no longer tied to a
    # label file of exactly 640 rows (the result for 640 rows is unchanged).
    return np.reshape(y,(-1,))

In [3]:
def combine_arrival_date(x_data):
    """Build a ``YYYY-MM-DD`` string column named ``arrival_date``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Must contain ``arrival_date_year``, ``arrival_date_month`` (English
        month names) and ``arrival_date_day_of_month``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The three source columns are
        overwritten with their string forms (year as text, month and day as
        two-digit codes) and ``arrival_date`` is added.

    Notes
    -----
    Month or day values that are not recognised are mapped to NaN.
    Already-converted two-digit codes are accepted, so calling the function
    again on the same frame (for example when a notebook cell is re-run)
    leaves it unchanged instead of turning every date into NaN.
    """
    month_to_str = {'January':'01','February':'02','March':'03','April':'04','May':'05','June':'06','July':'07','August':'08','September':'09','October':'10','November':'11','December':'12'}
    # Two-digit codes map to themselves so the conversion is idempotent.
    month_to_str.update({code: code for code in list(month_to_str.values())})

    day_to_str = {}
    for day in range(1,32):
        padded = '{:02d}'.format(day)
        day_to_str[str(day)] = padded
        day_to_str[padded] = padded  # already zero-padded input passes through

    x_data['arrival_date_year'] = x_data['arrival_date_year'].astype(str)
    x_data['arrival_date_month'] = x_data['arrival_date_month'].map(month_to_str)
    x_data['arrival_date_day_of_month'] = x_data['arrival_date_day_of_month'].astype(str).map(day_to_str)
    x_data['arrival_date'] = x_data['arrival_date_year'] + '-' + x_data['arrival_date_month'] + '-' + x_data['arrival_date_day_of_month']
    return x_data

In [4]:
# function
def pre_adr_model(train,test):
    """Prepare one-hot encoded feature matrices for a model that predicts ``adr``.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training bookings; passed through ``preprocess_x``.
    test : pandas.DataFrame
        Raw test bookings; passed through ``combine_arrival_date``.

    Returns
    -------
    X : numpy.ndarray
        Training features (everything except ``arrival_date``, ``ID``,
        ``booking_total_revenue`` and ``adr``).
    Y : pandas.Series
        The training ``adr`` column.
    test : numpy.ndarray
        Test features (everything except ``arrival_date`` and ``ID``), with
        the feature columns in the same order as ``X``.
    X_booking_ret : pandas.DataFrame
        Encoded training frame, still containing ``arrival_date``, ``ID``,
        ``adr`` and ``booking_total_revenue``.
    test_ret : pandas.DataFrame
        Encoded test frame, still containing ``arrival_date`` and ``ID``.

    Notes
    -----
    Both input frames are modified in place up to the one-hot encoding step
    (rows and columns dropped, columns rewritten).
    """
    # Categorical columns expanded into one indicator column per value.
    dummy_columns = ["hotel","arrival_date_week_number","arrival_date_year","arrival_date_day_of_month","arrival_date_month","meal","market_segment",
    "distribution_channel","reserved_room_type","assigned_room_type","deposit_type","customer_type"]
    # Columns that are targets rather than features; never fabricated on the test side.
    target_columns = ['booking_total_revenue','adr']

    _,booking_total = preprocess_x(train)
    test = combine_arrival_date(test)

    # Drop features that are not used.
    booking_total.drop(['country','company','is_canceled','reservation_status','reservation_status_date','date_difference','stays_total_nights'],axis=1,inplace=True)
    test.drop(['country','company'],axis=1,inplace=True)

    # Fill missing children with the mode (0). Assigning the result back is
    # required: `df[col].fillna(..., inplace=True)` acts on a temporary and is
    # a no-op under pandas Copy-on-Write.
    booking_total['children'] = booking_total['children'].fillna(0)

    # agent is not used as a feature.
    booking_total.drop(['agent'],axis=1,inplace=True)
    test.drop(['agent'],axis=1,inplace=True)

    # Give both frames the same dtypes.
    booking_total['hotel'] = booking_total['hotel'].astype('category')
    test['hotel'] = test['hotel'].astype('category')
    booking_total['children'] = booking_total['children'].astype(int)
    test['children'] = test['children'].astype(int)

    # meal 'Undefined' means the same as 'SC' (assigned back for the same
    # Copy-on-Write reason as above).
    booking_total['meal'] = booking_total['meal'].replace('Undefined','SC')
    test['meal'] = test['meal'].replace('Undefined','SC')

    X_booking = pd.get_dummies(data=booking_total,columns=dummy_columns)
    test = pd.get_dummies(data=test,columns=dummy_columns)

    # A category that occurs in only one of the two frames gets an all-zero
    # indicator in the other, and the test columns are put in the training
    # order. Without this, X and test can differ in width and column order, so
    # position i would not mean the same feature in both arrays.
    missing_in_train = [column for column in test.columns if column not in X_booking.columns]
    X_booking = X_booking.reindex(columns=list(X_booking.columns)+missing_in_train,fill_value=0)
    test_columns = [column for column in X_booking.columns
                    if column in test.columns or column not in target_columns]
    test = test.reindex(columns=test_columns,fill_value=0)

    # Independent snapshots that keep arrival_date / ID for the caller.
    # DataFrame.copy() is a deep copy by default, so the `copy` module
    # (which this notebook never imports) is not needed.
    X_booking_ret = X_booking.copy()
    test_ret = test.copy()

    X_booking = X_booking.drop(['arrival_date','ID'],axis=1)
    test = test.drop(['arrival_date','ID'],axis=1)
    X = X_booking.drop(target_columns,axis=1).values
    test = test.values
    Y = X_booking['adr']
    return X,Y,test,X_booking_ret,test_ret

In [5]:
# Load the raw training and test booking tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Build the ADR model inputs. `test` is rebound here from the raw DataFrame to
# the array returned by pre_adr_model.
(X, Y, test, X_booking_ret, test_ret) = pre_adr_model(train, test)

In [None]:
# Sum the encoded training columns per arrival date and save them, leaving out
# the ID and the two target columns.
(
    X_booking_ret
    .groupby('arrival_date')
    .sum()
    .drop(columns=['ID', 'adr', 'booking_total_revenue'])
    .to_csv('train_new.csv')
)

In [None]:
# Sum the encoded test columns per arrival date and save them, leaving out the ID.
(
    test_ret
    .groupby('arrival_date')
    .sum()
    .drop(columns=['ID'])
    .to_csv('test_new.csv')
)