In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib

# Per-file feature frames and label series; they are concatenated once after the loop.
all_features_list = []
all_labels_list = []

# Progress is reported every `batch_size` successfully loaded files.
batch_size = 1000
counter = 0

# Columns copied unchanged from each CSV into the feature matrix.
base_feature_columns = ['sno', 'total', 'latitude', 'longitude', 'act']

# Probe every candidate id in the range; ids without a CSV file are skipped.
for i in range(500101001, 500119092):
    file_path = f'rentbike/{i}.csv'
    if not os.path.exists(file_path):
        continue

    # Load one CSV file.
    df = pd.read_csv(file_path)

    # Parse the srcUpdateTime column once; all time features derive from it.
    update_time = pd.to_datetime(df['srcUpdateTime'])

    # Build the feature frame as a new object (no column writes into a slice of
    # `df`), keeping the column order: base columns, then hour, minute, second, weekday.
    features = df[base_feature_columns].assign(
        hour=update_time.dt.hour,
        minute=update_time.dt.minute,
        second=update_time.dt.second,
        weekday=update_time.dt.weekday,  # day of week
    )
    labels = df['available_rent_bikes']  # regression target

    all_features_list.append(features)
    all_labels_list.append(labels)
    counter += 1

    if counter % batch_size == 0:
        print(f"Processed {counter} files.")

# Fail with a clear message instead of an undefined-name error at the split below.
if not all_features_list:
    raise FileNotFoundError("No CSV files were found under 'rentbike/' for the scanned id range.")

# A single concat keeps the rows of every loaded file; rebinding `all_features`
# once per batch would discard all batches except the last one.
all_features = pd.concat(all_features_list, ignore_index=True)
all_labels = pd.concat(all_labels_list, ignore_index=True)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

# Hyper-parameter grid for the gradient-boosting regressor (5 x 5 x 5 = 125 candidates).
gbm_param_grid = {
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'max_depth': [5, 10, 15, 20, 25]
}

# Exhaustive grid search with 5-fold cross-validation, scored on negative MSE.
gbm = GradientBoostingRegressor(random_state=42)
gbm_grid_search = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbm_grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_gbm = gbm_grid_search.best_estimator_
y_pred_gbm = best_gbm.predict(X_test)
mse_gbm = mean_squared_error(y_test, y_pred_gbm)
# RMSE is derived from the MSE rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_gbm = mse_gbm ** 0.5
mae_gbm = mean_absolute_error(y_test, y_pred_gbm)
r2_gbm = r2_score(y_test, y_pred_gbm)
print(f"GBM Mean Squared Error: {mse_gbm}")
print(f"GBM Root Mean Squared Error: {rmse_gbm}")
print(f"GBM Mean Absolute Error: {mae_gbm}")
print(f"GBM R^2 Score: {r2_gbm}")
print(f"GBM Best Parameters: {gbm_grid_search.best_params_}")

# Persist the selected model to disk.
joblib.dump(best_gbm, 'rent_bike_gbm_model.pkl')


# Hyper-parameter grid for the XGBoost regressor.
# NOTE(review): 10 parameters x 5 values each = 5**10 = 9,765,625 candidates,
# i.e. 48,828,125 fits with cv=5. Consider shrinking the grid or sampling it
# before running this cell.
# NOTE(review): XGBoost documents `scale_pos_weight` as a class-imbalance
# control; confirm it is meant to be tuned for this regression target.
xgb_param_grid = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

# Exhaustive grid search with 5-fold cross-validation, scored on negative MSE.
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_xgb = xgb_grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
# RMSE is derived from the MSE rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_xgb = mse_xgb ** 0.5
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"XGBoost Root Mean Squared Error: {rmse_xgb}")
print(f"XGBoost Mean Absolute Error: {mae_xgb}")
print(f"XGBoost R^2 Score: {r2_xgb}")
print(f"XGBoost Best Parameters: {xgb_grid_search.best_params_}")

# Persist the selected model to disk.
joblib.dump(best_xgb, 'rent_bike_xgb_model.pkl')


In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib

# Per-file feature frames and label series; they are concatenated once after the loop.
all_features_list = []
all_labels_list = []

# Progress is reported every `batch_size` successfully loaded files.
batch_size = 1000
counter = 0

# Columns copied unchanged from each CSV into the feature matrix.
base_feature_columns = ['sno', 'total', 'latitude', 'longitude', 'act']

# Probe every candidate id in the range; ids without a CSV file are skipped.
for i in range(500101001, 500119092):
    file_path = f'returnbike/{i}.csv'
    if not os.path.exists(file_path):
        continue

    # Load one CSV file.
    df = pd.read_csv(file_path)

    # Parse the srcUpdateTime column once; all time features derive from it.
    update_time = pd.to_datetime(df['srcUpdateTime'])

    # Build the feature frame as a new object (no column writes into a slice of
    # `df`), keeping the column order: base columns, then hour, minute, second, weekday.
    features = df[base_feature_columns].assign(
        hour=update_time.dt.hour,
        minute=update_time.dt.minute,
        second=update_time.dt.second,
        weekday=update_time.dt.weekday,  # day of week
    )
    labels = df['available_return_bikes']  # regression target

    all_features_list.append(features)
    all_labels_list.append(labels)
    counter += 1

    if counter % batch_size == 0:
        print(f"Processed {counter} files.")

# Fail with a clear message instead of an undefined-name error at the split below.
if not all_features_list:
    raise FileNotFoundError("No CSV files were found under 'returnbike/' for the scanned id range.")

# A single concat keeps the rows of every loaded file; rebinding `all_features`
# once per batch would discard all batches except the last one.
all_features = pd.concat(all_features_list, ignore_index=True)
all_labels = pd.concat(all_labels_list, ignore_index=True)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

# Hyper-parameter grid for the gradient-boosting regressor (5 x 5 x 5 = 125 candidates).
gbm_param_grid = {
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'max_depth': [5, 10, 15, 20, 25]
}

# Exhaustive grid search with 5-fold cross-validation, scored on negative MSE.
gbm = GradientBoostingRegressor(random_state=42)
gbm_grid_search = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbm_grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_gbm = gbm_grid_search.best_estimator_
y_pred_gbm = best_gbm.predict(X_test)
mse_gbm = mean_squared_error(y_test, y_pred_gbm)
# RMSE is derived from the MSE rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_gbm = mse_gbm ** 0.5
mae_gbm = mean_absolute_error(y_test, y_pred_gbm)
r2_gbm = r2_score(y_test, y_pred_gbm)
print(f"GBM Mean Squared Error: {mse_gbm}")
print(f"GBM Root Mean Squared Error: {rmse_gbm}")
print(f"GBM Mean Absolute Error: {mae_gbm}")
print(f"GBM R^2 Score: {r2_gbm}")
print(f"GBM Best Parameters: {gbm_grid_search.best_params_}")

# Persist the selected model to disk.
joblib.dump(best_gbm, 'return_bike_gbm_model.pkl')


# Hyper-parameter grid for the XGBoost regressor.
# NOTE(review): 10 parameters x 5 values each = 5**10 = 9,765,625 candidates,
# i.e. 48,828,125 fits with cv=5. Consider shrinking the grid or sampling it
# before running this cell.
# NOTE(review): XGBoost documents `scale_pos_weight` as a class-imbalance
# control; confirm it is meant to be tuned for this regression target.
xgb_param_grid = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

# Exhaustive grid search with 5-fold cross-validation, scored on negative MSE.
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_xgb = xgb_grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
# RMSE is derived from the MSE rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_xgb = mse_xgb ** 0.5
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"XGBoost Root Mean Squared Error: {rmse_xgb}")
print(f"XGBoost Mean Absolute Error: {mae_xgb}")
print(f"XGBoost R^2 Score: {r2_xgb}")
print(f"XGBoost Best Parameters: {xgb_grid_search.best_params_}")

# Persist the selected model to disk.
joblib.dump(best_xgb, 'return_bike_xgb_model.pkl')
