In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [138]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

In [130]:
train = pd.read_csv('final/train_final.csv')
test = pd.read_csv('final/test_final.csv')

In [131]:
reservation_ID = test['reservation_id']
train.drop(['reservation_id'], axis = 1, inplace = True)
test.drop(['reservation_id'], axis = 1, inplace = True)

In [132]:
X = train.drop(['amount_spent_per_room_night_scaled'], axis = 1)
y = train['amount_spent_per_room_night_scaled']

## 1. Gradient Boosting Regressor

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [193]:
clf = GradientBoostingRegressor(n_estimators = 300, min_samples_split = 75, max_depth = 5, learning_rate = 0.20,
                                verbose = 1, random_state = 42)

In [194]:
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.1411            7.18m
         2           1.1091            7.06m
         3           1.0871            7.03m
         4           1.0704            7.17m
         5           1.0577            7.44m
         6           1.0478            7.51m
         7           1.0404            7.33m
         8           1.0336            7.23m
         9           1.0269            7.17m
        10           1.0226            7.11m
        20           0.9979            6.55m
        30           0.9869            6.85m
        40           0.9815            6.80m
        50           0.9776            6.43m
        60           0.9745            6.17m
        70           0.9725            5.76m
        80           0.9703            5.31m
        90           0.9686            5.01m
       100           0.9668            4.78m
       200           0.9560            2.10m
       300           0.9488            0.00s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=75, min_weight_fraction_leaf=0.0,
             n_estimators=300, n_iter_no_change=None, presort='auto',
             random_state=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=1, warm_start=False)

In [195]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [196]:
rmse

0.9895906595704808

In [55]:
prediction = clf.predict(test)

In [56]:
output = pd.DataFrame({'reservation_id':reservation_ID, 'amount_spent_per_room_night_scaled':prediction})
output.to_csv('submissions/submission-gradient-boosting-regressor.csv', index=False)

## 2. Random Forest Regressor

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [79]:
clf = RandomForestRegressor(n_estimators = 200, min_samples_split = 50, verbose = 1, random_state = 42)

In [80]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  6.2min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=50,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=42, verbose=1, warm_start=False)

In [81]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.8s finished


In [82]:
rmse

0.9984893130557083

In [83]:
prediction = clf.predict(test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    8.4s finished


In [84]:
output = pd.DataFrame({'reservation_id':reservation_ID, 'amount_spent_per_room_night_scaled':prediction})
output.to_csv('submissions/submission-random-forest-regressor.csv', index=False)

## 3. AdaBoost Regressor

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [104]:
clf = AdaBoostRegressor(n_estimators = 100, random_state = 42)

In [105]:
clf.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=100, random_state=42)

In [106]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [107]:
rmse

1.2321309185049383

## 4. MLP Regressor

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [120]:
clf = MLPRegressor(early_stopping = True, learning_rate='invscaling', 
                   max_iter=200, random_state=42, shuffle=True, tol = 0.0001, verbose=1)

In [121]:
clf.fit(X_train, y_train)

Iteration 1, loss = 1.04294906
Validation score: 0.025911
Iteration 2, loss = 0.54944828
Validation score: 0.109290
Iteration 3, loss = 0.52564282
Validation score: 0.105842
Iteration 4, loss = 0.51818179
Validation score: 0.124950
Iteration 5, loss = 0.51393778
Validation score: 0.146119
Iteration 6, loss = 0.51130553
Validation score: 0.139821
Iteration 7, loss = 0.50859418
Validation score: 0.149708
Iteration 8, loss = 0.50792961
Validation score: 0.108017
Iteration 9, loss = 0.50781143
Validation score: 0.058967
Iteration 10, loss = 0.50634918
Validation score: 0.154031
Iteration 11, loss = 0.50654962
Validation score: 0.152028
Iteration 12, loss = 0.50479624
Validation score: 0.051739
Iteration 13, loss = 0.50500847
Validation score: 0.142517
Iteration 14, loss = 0.50390217
Validation score: 0.135421
Iteration 15, loss = 0.50410837
Validation score: 0.151552
Iteration 16, loss = 0.50345619
Validation score: 0.133768
Iteration 17, loss = 0.50321182
Validation score: 0.145121
Iterat

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=1, warm_start=False)

In [122]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [119]:
rmse

0.9991924202403251

In [123]:
prediction = clf.predict(test)

In [124]:
output = pd.DataFrame({'reservation_id':reservation_ID, 'amount_spent_per_room_night_scaled':prediction})
output.to_csv('submissions/submission-mlp-regressor.csv', index=False)

## SGD Regressor

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [172]:
clf = SGDRegressor(random_state=42, verbose=1, max_iter = 20)

In [173]:
clf.fit(X_train, y_train)



-- Epoch 1
Norm: 56.84, NNZs: 18, Bias: 430.383147, T: 290210, Avg. loss: 869580153.677629
Total training time: 0.10 seconds.
-- Epoch 2
Norm: 14.25, NNZs: 18, Bias: 109.821612, T: 580420, Avg. loss: 407.747694
Total training time: 0.20 seconds.
-- Epoch 3
Norm: 4.16, NNZs: 18, Bias: 37.331679, T: 870630, Avg. loss: 26.849759
Total training time: 0.30 seconds.
-- Epoch 4
Norm: 1.39, NNZs: 18, Bias: 17.435725, T: 1160840, Avg. loss: 2.880550
Total training time: 0.40 seconds.
-- Epoch 5
Norm: 0.64, NNZs: 18, Bias: 11.306596, T: 1451050, Avg. loss: 0.818021
Total training time: 0.50 seconds.
-- Epoch 6
Norm: 0.44, NNZs: 18, Bias: 9.277767, T: 1741260, Avg. loss: 0.600652
Total training time: 0.60 seconds.
-- Epoch 7
Norm: 0.39, NNZs: 18, Bias: 8.570408, T: 2031470, Avg. loss: 0.573803
Total training time: 0.70 seconds.
-- Epoch 8
Norm: 0.38, NNZs: 18, Bias: 8.302666, T: 2321680, Avg. loss: 0.568844
Total training time: 0.80 seconds.
-- Epoch 9
Norm: 0.36, NNZs: 18, Bias: 8.207255, T: 261

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=20,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=1, warm_start=False)

In [174]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [175]:
rmse

1.0560609536427747

In [123]:
prediction = clf.predict(test)

In [124]:
output = pd.DataFrame({'reservation_id':reservation_ID, 'amount_spent_per_room_night_scaled':prediction})
output.to_csv('submissions/submission-mlp-regressor.csv', index=False)