In [None]:
import os

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Train two XGBoost regressors on the processed dataset -- one predicting
# `energy_need`, one predicting `departure_time` -- evaluate both on a held-out
# 20% split, and persist the models and their MSE / R2 scores.

# Make sure the output directories exist before any expensive work is done, so
# a missing folder does not discard a finished training run at save time.
os.makedirs('models/xgboost', exist_ok=True)
os.makedirs('results', exist_ok=True)

# Load the processed dataset (Parquet format)
df = pd.read_parquet('data/processed/processed_data.parquet')

# Define features and target variables
# NOTE(review): `end_soc` combined with `start_soc` may leak the energy target
# -- confirm that it is actually known at prediction time.
# NOTE(review): if `charging_method` is stored as a string/categorical column,
# DMatrix needs it numerically encoded (or enable_categorical=True) -- confirm
# against the preprocessing step.
features = [
    'start_soc', 'end_soc', 'duration',
    'location_latitude', 'location_longitude',
    'is_weekend', 'trip_count_per_weekday', 'avg_distance_per_dayofweek',
    'mean_consumption', 'mean_duration', 'mean_dep_time', 'charging_method',
]
target_energy = 'energy_need'
target_departure = 'departure_time'

X = df[features]
y_energy = df[target_energy]
y_departure = df[target_departure]

# Split the data into training and testing sets.
# A single call splits the features and both targets with the same shuffled
# indices, so the energy and departure models share identical train/test rows.
(
    X_train, X_test,
    y_train_energy, y_test_energy,
    y_train_departure, y_test_departure,
) = train_test_split(X, y_energy, y_departure, test_size=0.2, random_state=42)

# Both models use the same feature split; keep the `_d` names as aliases for
# any later code that refers to them.
X_train_d, X_test_d = X_train, X_test

# Create DMatrix for XGBoost
dtrain_energy = xgb.DMatrix(X_train, label=y_train_energy)
dtest_energy = xgb.DMatrix(X_test, label=y_test_energy)

dtrain_departure = xgb.DMatrix(X_train_d, label=y_train_departure)
dtest_departure = xgb.DMatrix(X_test_d, label=y_test_departure)

# Define model parameters.
# The number of trees is NOT a booster parameter in the native `xgb.train`
# API (`n_estimators` belongs to the scikit-learn wrapper and is ignored
# here); it is controlled solely by `num_boost_round` below.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build and is deprecated
# from XGBoost 2.0 in favour of tree_method='hist' with device='cuda' -- switch
# once the installed version is confirmed.
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'tree_method': 'gpu_hist',  # Use GPU for training
}
num_boost_round = 100

# Train one XGBoost model per target
xgb_energy_model = xgb.train(
    params, dtrain_energy, num_boost_round=num_boost_round
)
xgb_departure_model = xgb.train(
    params, dtrain_departure, num_boost_round=num_boost_round
)

# Save models
xgb_energy_model.save_model('models/xgboost/xgb_energy_model.json')
xgb_departure_model.save_model('models/xgboost/xgb_departure_model.json')

# Predictions on the held-out test split
y_pred_energy = xgb_energy_model.predict(dtest_energy)
y_pred_departure = xgb_departure_model.predict(dtest_departure)

# Evaluate the models
mse_energy = mean_squared_error(y_test_energy, y_pred_energy)
mse_departure = mean_squared_error(y_test_departure, y_pred_departure)

r2_energy = r2_score(y_test_energy, y_pred_energy)
r2_departure = r2_score(y_test_departure, y_pred_departure)

print(f"Energy Need - MSE: {mse_energy}, R2: {r2_energy}")
print(f"Departure Time - MSE: {mse_departure}, R2: {r2_departure}")

# Save results as one-row Parquet tables (one file per target)
results_energy = {'MSE': mse_energy, 'R2': r2_energy}
results_departure = {'MSE': mse_departure, 'R2': r2_departure}

pd.DataFrame([results_energy]).to_parquet('results/xgboost_energy_results.parquet', index=False)
pd.DataFrame([results_departure]).to_parquet('results/xgboost_departure_results.parquet', index=False)
