In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import json
from datetime import datetime, timedelta
import xgboost as xgb

# 讀取等候時間數據
with open('/Users/chianlee/Desktop/disney/data/day_hour_avg_data_sea.json') as f:
    wait_times = pd.read_json(f, lines=True)

# # 檢查等候時間數據
# print("Wait times head:")
# print(wait_times.head())

# 讀取遊客量預測數據
with open('/Users/chianlee/Desktop/disney/data/disneysea_predict.json') as f:
    visitor_predictions = pd.read_json(f)

# 將時間戳轉換為日期
wait_times['Date'] = pd.to_datetime(wait_times['Date'], unit='ms')
wait_times['Day'] = wait_times['Date'].dt.normalize()  # 確保日期部分無時間

# # 檢查等候時間數據
# print("Wait times after date conversion:")
# print(wait_times.head())

# 將 visitor_predictions 中的日期轉換為 datetime 格式
visitor_predictions['date'] = pd.to_datetime(visitor_predictions['date']).dt.normalize()

# 篩選出與 wait_times 日期範圍匹配的 visitor_predictions
min_date = wait_times['Day'].min()
max_date = wait_times['Day'].max()
filtered_visitor_predictions = visitor_predictions[(visitor_predictions['date'] >= min_date) & (visitor_predictions['date'] <= max_date)]

# # 檢查篩選後的遊客量預測數據
# print("Filtered visitor predictions head:")
# print(filtered_visitor_predictions.head())

# # 檢查唯一值，確認匹配
# print("Unique days in wait_times:")
# print(wait_times['Day'].unique())
# print("Unique dates in filtered_visitor_predictions:")
# print(filtered_visitor_predictions['date'].unique())

# 擴展 filtered_visitor_predictions 為每個日期生成所有小時
expanded_predictions = []
for _, row in filtered_visitor_predictions.iterrows():
    for hour in range(8, 22):  # 生成從 8 到 21 小時的數據
        expanded_predictions.append({
            'region': row['region'],
            'date': row['date'],
            'weekday': row['weekday'],
            'prediction': row['prediction'],
            'Hour': hour
        })

expanded_predictions = pd.DataFrame(expanded_predictions)

# # 檢查擴展後的 visitor_predictions
# print("Expanded visitor predictions head:")
# print(expanded_predictions.head())

# 合併數據，確保合併的鍵是正確的
data = wait_times.merge(expanded_predictions, left_on=['Day', 'Hour'], right_on=['date', 'Hour'], how='left')

# # 檢查合併後的數據
# print("Merged data head:")
# print(data.head())

# 將星期幾分類為週間（1）和週末（0）
weekday_map = {
    'Monday': 1, 'Tuesday': 1, 'Wednesday': 1, 'Thursday': 1, 'Friday': 1, 
    'Saturday': 0, 'Sunday': 0
}
data['IsWeekend'] = data['weekday'].map(weekday_map)

# 將星期幾轉換為數字
weekday_to_num = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 
    'Saturday': 5, 'Sunday': 6
}
data['DayOfWeek'] = data['weekday'].map(weekday_to_num)

# 將時間劃分為不同的時間窗口
def assign_time_window(hour):
    if 8 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 22:
        return 'Evening'
    else:
        return 'Unknown'

data['TimeWindow'] = data['Hour'].apply(assign_time_window)
data['TimeWindow'] = data['TimeWindow'].astype('category').cat.codes  # 將時間窗口轉換為數值編碼

# # 檢查合併後的數據
# print("Data after adding IsWeekend and DayOfWeek:")
# print(data.head())

# 將設施名稱轉換為分類編碼
data['FacilityCode'] = data['FacilityEnglish'].astype('category').cat.codes
facility_mapping = dict(enumerate(data['FacilityEnglish'].astype('category').cat.categories))

# # 檢查轉換後的數據
# print("Facility mapping:")
# print(data[['FacilityEnglish', 'FacilityCode']].drop_duplicates())

In [2]:
# 特徵和標籤
features = ['Hour', 'prediction', 'IsWeekend', 'DayOfWeek', 'FacilityCode', 'TimeWindow']
X = data[features]
y = data['StandbyTime']

# 檢查特徵的變異性
print("Feature description:")
print(X.describe())

Feature description:
              Hour   prediction    IsWeekend    DayOfWeek  FacilityCode  \
count  3744.000000  3744.000000  3744.000000  3744.000000   3744.000000   
mean     14.000000    21.333333     0.777778     2.888889     15.500000   
std       3.742157     1.764070     0.415795     1.791852      9.234326   
min       8.000000    19.000000     0.000000     0.000000      0.000000   
25%      11.000000    21.000000     1.000000     2.000000      7.750000   
50%      14.000000    21.000000     1.000000     3.000000     15.500000   
75%      17.000000    21.000000     1.000000     4.000000     23.250000   
max      20.000000    26.000000     1.000000     6.000000     31.000000   

        TimeWindow  
count  3744.000000  
mean      0.846154  
std       0.863575  
min       0.000000  
25%       0.000000  
50%       1.000000  
75%       2.000000  
max       2.000000  


In [3]:
# 切分數據
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# 切分數據
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#XGBoost
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 將預測值限制在0以上
y_pred = np.maximum(y_pred, 0)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

MSE: 34.83880793357865, RMSE: 5.902440845411214


In [5]:
import pandas as pd
import numpy as np
import json

def predict_future_wait_times_range(start_date_str, end_date_str):
    # 將輸入的日期字符串轉換為 datetime 格式
    start_date = pd.to_datetime(start_date_str).normalize()
    end_date = pd.to_datetime(end_date_str).normalize()

    # 生成日期範圍
    date_range = pd.date_range(start=start_date, end=end_date)

    all_predictions = []

    for date in date_range:
        visitor_prediction = visitor_predictions[visitor_predictions['date'] == date]
        if visitor_prediction.empty:
            print(f"No visitor prediction found for date: {date.strftime('%Y-%m-%d')}")
            continue

        prediction_value = visitor_prediction['prediction'].values[0]
        is_weekend = visitor_prediction['weekday'].map(weekday_map).values[0]
        day_of_week = visitor_prediction['weekday'].map(weekday_to_num).values[0]

        facilities = data['FacilityCode'].unique()

        future_data = []
        for facility in facilities:
            for hour in range(8, 22):
                time_window = assign_time_window(hour)
                future_data.append({
                    'Hour': hour,
                    'prediction': prediction_value,
                    'IsWeekend': is_weekend,
                    'DayOfWeek': day_of_week,
                    'FacilityCode': facility,
                    'TimeWindow': time_window,
                })

        future_data = pd.DataFrame(future_data)
        future_data['TimeWindow'] = future_data['TimeWindow'].astype('category').cat.codes

        print(f"Future data description for {date.strftime('%Y-%m-%d')}:")
        print(future_data.describe())

        # 預測之前移除Date欄位
        future_wait_times = model.predict(future_data)

        future_wait_times = np.maximum(future_wait_times, 0)

        future_data['PredictedWaitTime'] = future_wait_times
        future_data['FacilityEnglish'] = future_data['FacilityCode'].map(facility_mapping)
        future_data['Date'] = date.strftime('%Y-%m-%d')
        future_data = future_data[['Date', 'FacilityEnglish', 'Hour', 'PredictedWaitTime']]

        all_predictions.extend(future_data.to_dict(orient='records'))

    output_file = f'/Users/chianlee/Desktop/disney/data/future_sea_{start_date_str}_to_{end_date_str}.json'
    with open(output_file, 'w') as f:
        json.dump(all_predictions, f, indent=4)

    print(f"預測結果已存儲到 {output_file}")

In [6]:
# 示例：預測日期範圍的等候時間
predict_future_wait_times_range('2024-08-31', '2024-09-02')

Future data description for 2024-08-31:
             Hour  prediction  IsWeekend  DayOfWeek  FacilityCode  TimeWindow
count  448.000000       448.0      448.0      448.0    448.000000  448.000000
mean    14.500000        20.0        0.0        5.0     15.500000    0.857143
std      4.035635         0.0        0.0        0.0      9.243415    0.833924
min      8.000000        20.0        0.0        5.0      0.000000    0.000000
25%     11.000000        20.0        0.0        5.0      7.750000    0.000000
50%     14.500000        20.0        0.0        5.0     15.500000    1.000000
75%     18.000000        20.0        0.0        5.0     23.250000    2.000000
max     21.000000        20.0        0.0        5.0     31.000000    2.000000
Future data description for 2024-09-01:
             Hour  prediction  IsWeekend  DayOfWeek  FacilityCode  TimeWindow
count  448.000000       448.0      448.0      448.0    448.000000  448.000000
mean    14.500000        20.0        0.0        6.0     15.500

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
from sklearn.svm import SVR

model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
#Stacking(決策樹+XGBoost)
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# 定義基模型
base_models = [
    ('xgb', xgb.XGBRegressor(n_estimators=100, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
]

# 定義元模型
meta_model = LinearRegression()

# 定義堆疊模型
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# 訓練堆疊模型
stacking_model.fit(X_train, y_train)

# 預測和評估
y_pred = stacking_model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
#決策樹
# 訓練模型
model = RandomForestRegressor(n_estimators=100, random_state=1)
model.fit(X_train, y_train)

# 檢查特徵重要性
feature_importances = model.feature_importances_
feature_names = features
print("Feature importances:")
for name, importance in zip(feature_names, feature_importances):
    print(f'{name}: {importance}')
    
# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}')

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# 切分數據集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 定義參數網格
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# 定義模型和GridSearchCV
xgb_model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# 執行GridSearchCV
grid_search.fit(X_train, y_train)

# 最佳參數和最佳模型
best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# 使用最佳模型進行預測
y_pred = best_xgb_model.predict(X_test)
y_pred = np.maximum(y_pred, 0)

# 評估模型性能
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Best params: {best_params}')
print(f'MSE: {mse}, RMSE: {rmse}')