In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- 1. Load Data ---
# The path is relative: '..' steps up one level from the 'notebooks' folder
# to the project root, where the CSV is expected to live.
data = pd.read_csv('../ride_sharing_data.csv')

# --- 2. Define Features (X) and Target (y) ---
# 'Price' is the value to predict; every other column is a candidate feature.
y = data['Price']
X = data.drop(columns='Price')

# --- 3. Define Preprocessing Steps ---
# Columns the model is designed to use, grouped by the preprocessing they get.
expected_numerical = ['Distance', 'Base_Price', 'Demand', 'Weather_Multiplier']
expected_categorical = ['Time_of_Day', 'Weather']

# Keep only the expected columns that are actually present in X. Absent
# columns are still skipped (best effort), but they are reported so that a
# renamed or missing column does not silently change what the model learns.
all_features_in_data = list(X.columns)
missing_features = [
    col for col in expected_numerical + expected_categorical
    if col not in all_features_in_data
]
if missing_features:
    warnings.warn(
        "Expected feature column(s) not found in the data and skipped: "
        f"{missing_features}"
    )

numerical_features = [col for col in expected_numerical if col in all_features_in_data]
categorical_features = [col for col in expected_categorical if col in all_features_in_data]

# With no usable columns the model cannot be trained; fail here with a clear
# message rather than later inside the fit call.
if not (numerical_features or categorical_features):
    raise ValueError(
        "None of the expected feature columns are present in the data; "
        f"available columns: {all_features_in_data}"
    )

# One small pipeline per column type.
# Numeric columns are standardized so they share a comparable scale.
scaling_steps = [('scaler', StandardScaler())]
numerical_transformer = Pipeline(steps=scaling_steps)

# Text columns are one-hot encoded; handle_unknown='ignore' lets categories
# that were not seen during fitting pass through without raising.
encoding_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=encoding_steps)

# Route each group of columns to its transformer and join the results into a
# single "preprocessor". Only the columns listed here are passed to a
# transformer.
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# --- 4. Create the Full Model Pipeline ---
# Preprocessing and the regressor are chained into one estimator, so a single
# fit/predict call runs both stages in order.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=model_steps)

# --- 5. Split Data into Training and Testing Sets ---
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# --- 6. Train the Model ---
# Fitting the pipeline fits the preprocessing steps and the regressor on the
# training rows only.
print("Training model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 7. Evaluate the Model ---
# Score the fitted pipeline on the held-out test rows.
print("\nEvaluating model performance:")
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for metric_line in (
    f"  R-squared (R2): {r2:.4f}",
    f"  Root Mean Squared Error (RMSE): {rmse:.4f}",
):
    print(metric_line)

# --- 8. See it in Action (Example Predictions) ---
# Compare actual and predicted prices for the first five test rows.
print("\n--- Example Predictions ---")
sample_data = X_test.iloc[:5]
sample_targets = y_test.iloc[:5]
sample_predictions = model.predict(sample_data)

# Start from the actual prices (keeping their original row index) and attach
# the predictions as a second column.
predictions_df = sample_targets.to_frame(name='Actual Price').assign(
    **{'Predicted Price': sample_predictions}
)

print(predictions_df)

Training model...
Model training complete!

Evaluating model performance:
  R-squared (R2): 0.8869
  Root Mean Squared Error (RMSE): 59.2412

--- Example Predictions ---
     Actual Price  Predicted Price
521        158.56       153.747175
737        334.75       387.115280
740        142.17       133.631762
660        104.75        58.689492
411        863.70       661.442838
