In [None]:
# Load the online-booking training (2020) and testing (2021) CSV files
import pandas as pd

online_booking_train, online_booking_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train-online_booking_2020.csv", "test-online_booking_2021.csv")
)

In [None]:
# Count the missing values per column in each dataset
df1 = online_booking_train.isna().sum()  # computed from the training frame
df2 = online_booking_test.isna().sum()   # computed from the testing frame
# Label each report after the frame it was computed from (train first, then test).
print("Data training:\n", df1, "\n")
print("Data testing:\n", df2)

In [None]:
# Derive the number of rooms in use: total rooms minus rooms still available
for booking_frame in (online_booking_train, online_booking_test):
    booking_frame['room_used'] = booking_frame['room_total'] - booking_frame['all_available_room']

In [None]:
# Parse the 'tanggal' (date) column of both frames into datetime values
for booking_frame in (online_booking_train, online_booking_test):
    booking_frame['tanggal'] = pd.to_datetime(booking_frame['tanggal'])

In [None]:
# Preview the first rows of each frame (train first, then test)
for booking_frame in (online_booking_train, online_booking_test):
    print(booking_frame.head())

In [None]:
# Plot the total number of rooms used per date in the training data
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 8))
rooms_used_per_date = online_booking_train.groupby('tanggal')['room_used'].sum()
rooms_used_per_date.plot(ax=ax)
ax.set_title("Jumlah Ruang yang Digunakan Berdasarkan Waktu", fontsize=15)
ax.set_xlabel("Waktu")
ax.set_ylabel("Jumlah")
plt.show()

In [None]:
# Index both frames by 'tanggal' and average every column within each 'M' (monthly) bin
train_indexed_by_date = online_booking_train.set_index('tanggal')
test_indexed_by_date = online_booking_test.set_index('tanggal')

agg_train_online_booking = train_indexed_by_date.resample('M').mean()
agg_test_online_booking = test_indexed_by_date.resample('M').mean()

In [None]:
# Express rooms used as a percentage of total rooms ('tpk_online')
agg_train_online_booking['tpk_online'] = (
    agg_train_online_booking['room_used']
    .div(agg_train_online_booking['room_total'])
    .mul(100)
)
agg_test_online_booking['tpk_online'] = (
    agg_test_online_booking['room_used']
    .div(agg_test_online_booking['room_total'])
    .mul(100)
)

In [None]:
# Move the index of both aggregated frames back into an ordinary column
agg_train_online_booking, agg_test_online_booking = (
    agg_train_online_booking.reset_index(),
    agg_test_online_booking.reset_index(),
)

In [None]:
# Use the month number of 'tanggal' as the row Id (the key used when merging on 'Id')
agg_train_online_booking['Id'] = agg_train_online_booking['tanggal'].dt.month
agg_test_online_booking['Id'] = agg_test_online_booking['tanggal'].dt.month

In [None]:
# Load the TPK datasets for the training (2020) and testing (2021) periods
tpk_hotel_berbintang_train, tpk_hotel_berbintang_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train-TPK_Hotel_berbintang_2020.csv", "test-TPK_Hotel_berbintang_2021.csv")
)

In [None]:
# Left-join the TPK data onto the monthly booking aggregates using the shared 'Id' key
df_full_train = agg_train_online_booking.merge(tpk_hotel_berbintang_train, how='left', on='Id')
df_full_test = agg_test_online_booking.merge(tpk_hotel_berbintang_test, how='left', on='Id')

In [None]:
# Preview the first rows of the merged frames (train first, then test)
for merged_frame in (df_full_train, df_full_test):
    print(merged_frame.head())

In [None]:
# Fill missing aggregate values with the mean 'room_used' of each row's own date
agg_value_train = df_full_train.groupby('tanggal')['room_used'].mean()
agg_value_test = df_full_test.groupby('tanggal')['room_used'].mean()

# Look up each row's per-date mean and assign the filled column back to the frame.
# Calling fillna(..., inplace=True) on a `.loc[...]` slice operates on a temporary
# object and is not guaranteed to modify the frame; assigning the result is.
# Mapping by date also avoids positional integer lookups on a date-indexed Series
# and hard-coded row counts.
df_full_train['Aggregate_var'] = df_full_train['Aggregate_var'].fillna(
    df_full_train['tanggal'].map(agg_value_train)
)
df_full_test['Aggregate'] = df_full_test['Aggregate'].fillna(
    df_full_test['tanggal'].map(agg_value_test)
)

In [None]:
# Two-column modelling frames: the 'tpk_online' rate and the 'TPK' column
model_columns = ['tpk_online', 'TPK']
train_ds = df_full_train[model_columns]
test_ds = df_full_test[model_columns]

In [None]:
# Three-column variants that additionally carry the aggregate column
# (named 'Aggregate_var' in the train frame and 'Aggregate' in the test frame)
training_columns = ['Aggregate_var', 'tpk_online', 'TPK']
testing_columns = ['Aggregate', 'tpk_online', 'TPK']

training = df_full_train[training_columns]
testing = df_full_test[testing_columns]

In [None]:
# Single feature ('tpk_online') and target ('TPK') taken from the two-column train frame
X_training, y_training = train_ds['tpk_online'], train_ds['TPK']

In [None]:
# Target is the 'TPK' column; the features are every remaining column of `training`
y_train_ds = training['TPK']
X_train_ds = training.drop(columns='TPK')

In [None]:
# Display the shape of the feature frame as the cell output
X_train_ds.shape

In [None]:
# Inspect the distribution of 'tpk_online' in the test frame
import seaborn as sns
import matplotlib.pyplot as plt

# displot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind.
# The 10x8 inch size is requested through height/aspect instead (width = height * aspect).
sns.displot(test_ds, x='tpk_online', height=8, aspect=1.25)
plt.show()

In [None]:
# Box plots of the training columns to look for outliers
fig, ax = plt.subplots(figsize=(10, 8))
train_ds.boxplot(ax=ax)
plt.show()

In [None]:
# Model 1: Ridge regression, created with its default parameters
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

ridge = Ridge()

In [None]:
# Display the Ridge estimator's current parameters as the cell output
ridge.get_params()

In [None]:
# Grid-search Ridge over the regularisation strength and the solver, then
# report the score of the fitted search on the training data itself
ridge_param_grid = {
    'alpha': np.logspace(3.16, 4, 5),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr'],
}
grid = GridSearchCV(ridge, param_grid=ridge_param_grid, n_jobs=-1)

# Both the single feature and the target are passed as column vectors
ridge_features = X_training.values.reshape(-1, 1)
ridge_target = y_training.values.reshape(-1, 1)
model = grid.fit(ridge_features, ridge_target)

y_pred = model.predict(ridge_features)
print("Model score:", model.score(ridge_features, ridge_target))

In [None]:
# Model 2: ordinary linear regression, created with its default parameters
# NOTE(review): the variable name `dtr` is reassigned to an SVR in a later cell
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

dtr = LinearRegression()

In [None]:
# Fit the estimator held in `dtr` on the single feature and report its
# score on the same training data
linear_features = X_training.values.reshape(-1, 1)
linear_target = y_training.values.reshape(-1, 1)
model = dtr.fit(linear_features, linear_target)

y_pred = model.predict(linear_features)
print("Model score:", model.score(linear_features, linear_target))

In [None]:
# Model 3: support vector regression, created with its default parameters
# (this rebinds the name `dtr`)
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

dtr = SVR()

In [None]:
# Grid-search the estimator held in `dtr` over C (linear kernel only), then
# report the score of the fitted search on the training data itself
svr_features = X_training.values.reshape(-1, 1)
# SVR expects a 1-D target; passing a column vector makes scikit-learn emit a
# DataConversionWarning on every cross-validation fit.
svr_target = y_training.values.ravel()

grid = GridSearchCV(
    dtr,
    param_grid=dict(
        C=np.linspace(1, 10, 10),
        kernel=['linear'],
    ),
    n_jobs=-1,
)
model = grid.fit(svr_features, svr_target)

y_pred = model.predict(svr_features)
print("Model score:", model.score(svr_features, svr_target))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Model 4: random forest regressor.
# A fixed random_state makes the forest (and everything derived from its
# predictions) reproducible across kernel restarts.
rfc = RandomForestRegressor(random_state=42)

In [None]:
# Display the random forest's current parameters as the cell output
rfc.get_params()

In [None]:
# Grid-search the random forest over the pruning strength and the split
# criterion, then report the score of the fitted search on the training data
rf_features = X_training.values.reshape(-1, 1)
# Tree ensembles expect a 1-D target; a column vector triggers a
# DataConversionWarning on every fit.
rf_target = y_training.values.ravel()

grid = GridSearchCV(
    rfc,
    param_grid=dict(
        ccp_alpha=np.linspace(1, 10, 10),
        # 'squared_error' / 'absolute_error' are the current scikit-learn names
        # of the former 'mse' / 'mae' criteria, which are no longer accepted.
        criterion=['squared_error', 'absolute_error'],
        # The estimator's own n_jobs is deliberately not searched: it only
        # controls parallelism, so it multiplied the number of fits without
        # producing a different model.
    ),
    n_jobs=-1,
)
model = grid.fit(rf_features, rf_target)

y_pred = model.predict(rf_features)
print("Model score:", model.score(rf_features, rf_target))

In [None]:
# Root mean squared error of the latest model's predictions on the training data
from sklearn.metrics import mean_squared_error
import numpy as np

# The error is measured between the true target and the predictions; comparing
# the input feature with the predictions would not describe the model's error.
mse = mean_squared_error(y_training, y_pred)
rmse = np.sqrt(mse)
print(rmse)

In [None]:
# Predict TPK for the test frame from its 'tpk_online' feature and flatten
# the result to one dimension
test_features = test_ds['tpk_online'].values.reshape(-1, 1)
tpk_predict = model.predict(test_features).ravel()
tpk_predict

In [None]:
# Fill the missing TPK values with the model's predictions
# Align the predictions with the frame's rows, then assign the filled column back.
# Calling fillna(..., inplace=True) on a `.loc[...]` slice operates on a temporary
# object and is not guaranteed to modify the frame; this form also works for any
# number of rows instead of a hard-coded six.
predicted_tpk = pd.Series(tpk_predict, index=df_full_test.index)
df_full_test['TPK'] = df_full_test['TPK'].fillna(predicted_tpk)

In [None]:
# Display the test frame's 'TPK' column as the cell output
df_full_test['TPK']

In [None]:
# Assemble the result table from the 'Id' and 'TPK' columns of the test frame
hasil = pd.DataFrame({
    'Id': df_full_test['Id'],
    'TPK': df_full_test['TPK'],
})
hasil[['Id', 'TPK']]

In [None]:
# Write the 'Id' and 'TPK' columns to the result CSV without the row index
hasil.to_csv('HASIL_MODEL.csv', columns=['Id', 'TPK'], index=False)

In [None]:
# Prediction-error plot (actual vs. predicted target) for the tuned regression model
from yellowbrick.regressor import PredictionError

# Visualise the tuned estimator selected by the grid search held in `model`.
# `dtr` at this point is a separate SVR instance, not the model whose
# predictions were written to the result table.
pe = PredictionError(model.best_estimator_)
pe.fit(X_training.values.reshape(-1,1), y_training.values.ravel())
pe.score(X_training.values.reshape(-1,1), y_training.values.ravel())
pe.show()