In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Load Data ---
# Read the ride-sharing dataset from the CSV one directory above the notebook.
data = pd.read_csv('../ride_sharing_data.csv')

# --- 2. Define Features (X) and Target (y) ---
# The target is 'Demand'. Every other column, 'Price' included, stays in X
# as a candidate feature.
y = data['Demand']
X = data.drop(columns='Demand')

# --- 3. Define Preprocessing Steps ---
# Candidate model inputs, grouped by how they are preprocessed.
# 'Price' is a numerical input here because the target is 'Demand'.
numerical_features = ['Price', 'Distance', 'Base_Price', 'Weather_Multiplier']
categorical_features = ['Time_of_Day', 'Weather']

# Keep only the candidates that actually exist in X, so a CSV lacking one of
# these columns still trains. Report anything that gets skipped: a renamed or
# misspelled column would otherwise silently shrink the feature set.
all_features_in_data = list(X.columns)
missing_features = [
    col for col in numerical_features + categorical_features
    if col not in all_features_in_data
]
if missing_features:
    print(f"Warning: expected feature columns not found in data (skipped): {missing_features}")

numerical_features = [col for col in numerical_features if col in all_features_in_data]
categorical_features = [col for col in categorical_features if col in all_features_in_data]

# Each feature type gets its own single-step preprocessing pipeline:
# numeric columns are standardized, categorical columns are one-hot encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([
    # handle_unknown='ignore': categories not seen during fit do not raise
    # at transform time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer. Columns that appear in
# neither list are dropped rather than passed through.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers, remainder='drop')

# --- 4. Create the Full Model Pipeline ---
# Chain preprocessing and a linear regressor so both are fitted together
# and the same transformations are applied at prediction time.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(model_steps)

# --- 5. Split Data into Training and Testing Sets ---
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 6. Train the Demand Model ---
# Fit the preprocessing steps and the regressor on the training split only.
print("Training DEMAND model...")
model.fit(X_train, y_train)
print("Demand model training complete!")

# --- 7. Evaluate the Demand Model ---
# Score the fitted pipeline on the held-out test split.
print("\nEvaluating demand model performance:")
y_pred = model.predict(X_test)

# RMSE is the square root of the MSE, which puts it in the same units as Demand.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"  R-squared (R2): {r2:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")

# --- 8. Also, let's look at the stats for Demand ---
# Summary statistics of the training target give a scale reference for
# interpreting the RMSE above.
print("\n--- Demand Data Stats ---")
demand_summary = y_train.describe()
print(demand_summary)

Training DEMAND model...
Demand model training complete!

Evaluating demand model performance:
  R-squared (R2): 0.4050
  Root Mean Squared Error (RMSE): 1.9832

--- Demand Data Stats ---
count    800.000000
mean       5.462456
std        2.577378
min        1.014086
25%        3.281818
50%        5.397125
75%        7.637072
max        9.985128
Name: Demand, dtype: float64
