# CO₂ Emissions Prediction with XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

# Select features and target
# Predictor columns fed to the model; `target` is the value being predicted.
features = [
    'PROPERTY_TYPE', 'BUILT_FORM', 'MAIN_FUEL',
    'TOTAL_FLOOR_AREA', 'ENERGY_CONSUMPTION_CURRENT',
    'CONSTRUCTION_AGE_BAND', 'WALLS_ENERGY_EFF',
    'WINDOWS_ENERGY_EFF', 'ROOF_ENERGY_EFF',
    'HOT_WATER_ENERGY_EFF', 'SHEATING_ENV_EFF',
    'LIGHTING_ENERGY_EFF', 'MAINS_GAS_FLAG',
    'SOLAR_WATER_HEATING_FLAG'
]
target = 'CO2_EMISSIONS_CURRENT'

# Load dataset
# Only the columns used below are parsed (`usecols`), so the unused columns of
# the CSV are never materialised in memory.
df = pd.read_csv('../data/raw/certificates.csv', usecols=features + [target])

# `usecols` does not control column order, so re-select to get the order of
# `features`; then drop every row with a missing value in any kept column.
df = df[features + [target]].dropna()

X = df[features]
y = df[target]

# Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessor
# Every column that is neither numeric nor boolean is treated as categorical.
# Selecting by exclusion (rather than include='object') also catches text
# columns stored with a dedicated string or category dtype, which would
# otherwise pass through un-encoded and reach the regressor as raw strings.
categorical = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
preprocessor = ColumnTransformer(
    [
        # handle_unknown='ignore': categories seen only at predict time are
        # encoded as all-zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ],
    # Remaining (numeric/boolean) columns are forwarded unchanged.
    remainder='passthrough',
)

# Pipeline
# Encoding and regression are chained so the encoder is fitted on the
# training data only when `model.fit` is called.
model = Pipeline([
    ('prep', preprocessor),
    ('xgb', XGBRegressor(objective='reg:squarederror'))
])

# Train
# Fit the full pipeline on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Evaluate
# RMSE is obtained as the square root of the mean squared error.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Report the three metrics, one per line, rounded to two decimals.
print(
    f'MAE: {mae:.2f}',
    f'RMSE: {rmse:.2f}',
    f'R² Score: {r2:.2f}',
    sep='\n',
)