In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import numpy as np
import pandas as pd

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Load the raw dataset
df = pd.read_csv('store_data.csv')

# Number of most frequent cities that keep their own one-hot column;
# every other city is folded into a single "Other" bucket to limit the
# width of the encoded feature matrix.
TOP_N_CITIES = 50

# Group low-frequency cities into an "Other" category.
# `df_cleaned` is derived from the freshly loaded `df` (instead of relying on a
# variable left over from an earlier kernel session) and is built without
# mutating its input, so re-running this cell always gives the same result.
# NOTE(review): if additional cleaning steps were expected before this point,
# apply them to `df` here — none are visible in this notebook.
# NOTE(review): the top-city list is computed on the full dataset, i.e. before
# the train/test split; confirm this is acceptable for the evaluation.
top_cities = df['city'].value_counts().nlargest(TOP_N_CITIES).index
df_cleaned = df.assign(
    # Missing cities are not in `top_cities`, so they also end up as "Other".
    city=df['city'].where(df['city'].isin(top_cities), 'Other')
)

# Features used by the model (column order kept as-is for downstream cells)
selected_features = ['store_area', 'footfall', 'avg_temperature', 'city',
                     'precipitation_mm', 'air_pressure_hpa', 'wind_speed_kmh']
categorical_features = ['city']
# Everything that is not categorical is treated as numeric; derived from
# `selected_features` so the two lists cannot drift apart.
numeric_features = [col for col in selected_features if col not in categorical_features]

# Extract the relevant features and target
X = df_cleaned[selected_features]
y = df_cleaned['turnover']

# Preprocessing: mean-impute numeric columns, one-hot encode the city.
# The encoder's `sparse=` keyword is deliberately omitted: it was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, while sparse output is
# the default under either name — so omitting it works on old and new versions.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        # Categories unseen during fit are encoded as all-zeros at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Define the XGBoost model
xgboost_model = xgb.XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=6)

# Chain preprocessing and model so both are fitted on the training data only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgboost_model)
])

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the held-out set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R² is the coefficient of determination (share of variance explained),
# not a classification-style accuracy, despite the label printed below.
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"XGBoost Model Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR² (Accuracy): {r2:.2f}")
