In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [19]:
# Dates in both CSVs are parsed as day/month/two-digit-year strings.
DATE_FORMAT = "%d/%m/%y"
DATE_COLUMNS = ['checkin_date', 'checkout_date']


def load_bookings(path):
    """Read a bookings CSV and convert its check-in/check-out columns to datetimes.

    The dates are converted with ``pd.to_datetime`` after reading instead of
    through ``read_csv(date_parser=...)``, which pandas deprecated in 2.0.
    Converting explicitly works on old and new pandas alike and raises on a
    malformed date instead of silently leaving the column as strings.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the literal string ``nan`` read as missing
        and every column in ``DATE_COLUMNS`` stored as datetime64.
    """
    frame = pd.read_csv(path, na_values=['nan'])
    for column in DATE_COLUMNS:
        frame[column] = pd.to_datetime(frame[column], format=DATE_FORMAT)
    return frame


# Training rows with any missing value are discarded; test rows are all kept.
raw = load_bookings('train.csv').dropna()
testData = load_bookings('test.csv')

In [20]:
# booking_date is never referenced again in this notebook, so remove it from
# both frames. `raw` itself keeps the column; trainData is a new frame.
UNUSED_COLUMNS = ['booking_date']

trainData = raw.drop(columns=UNUSED_COLUMNS)
testData = testData.drop(columns=UNUSED_COLUMNS)

In [21]:
import datetime

# Columns removed once the derived features have been added.
DROP_AFTER_FEATURES = ['checkin_date', 'checkout_date', 'channel_code', 'memberid']


def add_stay_features(frame):
    """Return a new frame with stay-derived features replacing the date columns.

    Added columns:
      * ``days_stayed`` - whole days between ``checkin_date`` and ``checkout_date``
      * ``month``       - calendar month (1-12) of ``checkin_date``

    The columns listed in ``DROP_AFTER_FEATURES`` are then dropped. The input
    frame is not modified, so the same logic serves train and test without
    copy-paste or ``inplace=True``.
    """
    stay_length = frame['checkout_date'] - frame['checkin_date']
    return (
        frame
        .assign(days_stayed=stay_length.dt.days,
                month=frame['checkin_date'].dt.month)
        .drop(columns=DROP_AFTER_FEATURES)
    )


# trainData still carries the same date columns and index as `raw` at this
# point, so deriving the features from trainData gives the same values.
trainData = add_stay_features(trainData)
testData = add_stay_features(testData)

In [22]:
# Columns that are replaced by integer codes.
CATEGORICAL_COLUMNS = [
    'resort_id',
    'member_age_buckets',
    'cluster_code',
    'reservationstatusid_code',
]

# One fitted encoder per column. Re-fitting a single shared encoder (as a
# copy-pasted stanza per column does) throws away every mapping but the last,
# so codes could not be translated back with inverse_transform afterwards.
encoders = {}
for column in CATEGORICAL_COLUMNS:
    # Fit on the training data only; transform() raises ValueError if the
    # test set contains a label that was never seen during fitting.
    enc = LabelEncoder().fit(raw[column])
    encoders[column] = enc
    trainData[column] = enc.transform(raw[column])
    testData[column] = enc.transform(testData[column])

In [23]:
# Weight applied to each child when computing the combined head-count feature.
CHILD_WEIGHT = .65

# persons = adults + CHILD_WEIGHT * children, added to both frames.
for frame in (trainData, testData):
    frame['persons'] = frame['numberofadults'] + CHILD_WEIGHT * frame['numberofchildren']

# Summary statistics of the training frame (includes the new column).
stats = trainData.describe()

In [24]:
# Columns screened for outliers, and how many standard deviations from the
# mean a value may lie before its row is discarded.
OUTLIER_COLUMNS = [
    'days_stayed',
    'roomnights',
    'total_pax',
    'persons',
    'amount_spent_per_room_night_scaled',
]
OUTLIER_N_STD = 3.5

# Build one boolean mask instead of overwriting whole rows with NaN: blanking
# rows upcasts every integer column to float and hides the intent. A row is
# kept only if every screened column lies within
# [mean - k*std, mean + k*std], using the statistics in `stats`.
within_bounds = pd.Series(True, index=trainData.index)
for column in OUTLIER_COLUMNS:
    center = stats.loc['mean', column]
    spread = OUTLIER_N_STD * stats.loc['std', column]
    # between() is inclusive at both ends and False for NaN, which matches
    # dropping rows that are strictly above or below the bounds.
    within_bounds &= trainData[column].between(center - spread, center + spread)

# dropna() still removes any row with a missing value in any column.
trainData = trainData.loc[within_bounds].dropna()

In [25]:
# Name of the regression target and the columns excluded from the feature matrix.
TARGET = 'amount_spent_per_room_night_scaled'
NON_FEATURE_COLUMNS = [
    'reservation_id',
    'season_holidayed_code',
    'state_code_residence',
    'reservationstatusid_code',
]

# Training features and target.
y_train = trainData[TARGET]
X_train = trainData.drop(columns=[TARGET] + NON_FEATURE_COLUMNS)

# Test features; reservation ids are kept aside for the submission file.
test_reservation_id = testData['reservation_id']
X_test = testData.drop(columns=NON_FEATURE_COLUMNS)

In [26]:
def saveFile(file_name, predictions=None, reservation_ids=None):
    """Write a two-column prediction CSV.

    Parameters
    ----------
    file_name : str
        Destination path of the CSV.
    predictions : array-like, optional
        Predicted ``amount_spent_per_room_night_scaled`` values. Any shape is
        accepted and flattened to 1-D (e.g. an ``(n, 1)`` network output).
        Defaults to the notebook-level ``y_pred`` so existing
        ``saveFile(name)`` calls keep working.
    reservation_ids : array-like, optional
        Reservation ids, one per prediction. Defaults to the notebook-level
        ``test_reservation_id``.

    The file has the columns ``reservation_id`` and
    ``amount_spent_per_room_night_scaled`` (in that order) and no index.
    """
    # Fall back to the globals only when the caller did not pass data, so the
    # function no longer depends on hidden notebook state by necessity.
    if predictions is None:
        predictions = y_pred
    if reservation_ids is None:
        reservation_ids = test_reservation_id

    result = pd.DataFrame({
        'reservation_id': reservation_ids,
        # ravel() makes the values positional and 1-D regardless of input shape.
        'amount_spent_per_room_night_scaled': np.ravel(predictions),
    })

    result = result[['reservation_id', 'amount_spent_per_room_night_scaled']]
    result.to_csv(file_name, index=False)
    print('Saved file: ' + file_name)

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline: 25 trees, depth capped at 12, fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(
    n_estimators=25,
    max_depth=12,
    random_state=2,
).fit(X_train, y_train)

# saveFile() reads the predictions from the notebook-level `y_pred`.
y_pred = model.predict(X_test)
saveFile('RandomForestRegressor.csv')

Saved file: RandomForestRegressor.csv


In [28]:
from xgboost import  XGBRegressor

# Gradient-boosted trees.
# * 'reg:squarederror' is the current name of the squared-error objective;
#   the old alias 'reg:linear' is deprecated in XGBoost.
# * L1 regularisation is passed as `reg_alpha`, the scikit-learn wrapper's own
#   parameter, rather than as `alpha` through **kwargs, so it cannot conflict
#   with the wrapper's `reg_alpha` default.
model = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.2,
    max_depth=20,
    reg_alpha=100,
    n_estimators=24,
)
model.fit(X_train, y_train)

# saveFile() reads the predictions from the notebook-level `y_pred`.
y_pred = model.predict(X_test)
saveFile('XGBRegressor.csv')

Saved file: XGBRegressor.csv


In [29]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

In [30]:
# Fully connected regression network, assembled as a list of layers.

# First layer: 50 ReLU units, sized to the number of feature columns.
nn_layers = [
    Dense(50, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'),
]

# Three SELU layers of 100 units each.
nn_layers += [
    Dense(width, kernel_initializer='normal', activation='selu')
    for width in (100, 100, 100)
]

# Single linear unit producing the predicted value.
nn_layers.append(Dense(1, kernel_initializer='normal', activation='linear'))

NN_model = Sequential(nn_layers)

# Train on mean squared error with Adam; MSE is also reported as a metric.
NN_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
NN_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 50)                900       
_________________________________________________________________
dense_7 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 26,301
Trainable params: 26,301
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Checkpoint filename template; Keras fills in the epoch number and val_loss.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'

# Save the model only when val_loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [None]:
# Train for 10 epochs in batches of 50, holding out 30% of the rows for
# validation; the checkpoint callback saves weights whenever val_loss improves.
NN_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=50,
    validation_split=0.3,
    callbacks=callbacks_list,
)

Train on 228246 samples, validate on 97821 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.00992, saving model to Weights-001--1.00992.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 1.00992 to 0.97782, saving model to Weights-002--0.97782.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.97782
Epoch 4/10

In [None]:
import glob
import re

# Matches the checkpoint names written by the ModelCheckpoint callback
# ('Weights-{epoch:03d}--{val_loss:.5f}.hdf5') and captures epoch and val_loss.
CHECKPOINT_NAME_RE = re.compile(r'Weights-(\d+)--(\d+\.\d+)\.hdf5$')


def find_best_checkpoint(pattern='Weights-*.hdf5'):
    """Return the path of the saved checkpoint with the lowest validation loss.

    The loss is read from the filename, so nothing has to be hard-coded: a
    fixed name such as 'Weights-010--0.87062.hdf5' only exists for the one
    run that produced it and breaks the notebook on any re-run.

    Ties on val_loss are resolved in favour of the later epoch.
    Note that every file matching ``pattern`` is considered, including
    checkpoints left over from earlier runs in the same directory.

    Raises
    ------
    FileNotFoundError
        If no file matching ``pattern`` has a parsable checkpoint name.
    """
    candidates = []
    for path in glob.glob(pattern):
        match = CHECKPOINT_NAME_RE.search(path)
        if match:
            epoch, val_loss = int(match.group(1)), float(match.group(2))
            candidates.append((val_loss, -epoch, path))
    if not candidates:
        raise FileNotFoundError(
            'No checkpoint matching %r found - train the network first.' % pattern)
    return min(candidates)[2]


weights_file = find_best_checkpoint()  # best checkpoint on disk
NN_model.load_weights(weights_file)
# Re-compiling is not needed for predict(); kept so the model is ready for
# evaluate()/fit() with the same loss and optimizer settings.
NN_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

# The network returns shape (n, 1); flatten to 1-D before writing the file.
y_pred = NN_model.predict(X_test)
y_pred = y_pred.flatten()
saveFile('NueralNetwork.csv')