In [1]:
# Config
import sys
from pathlib import Path
import importlib

# Make the project root importable so `config`, `utils` and `scripts` resolve.
# Assumes the notebook's working directory sits one level below the project
# root (hence `.parent`) — TODO confirm if the notebook is moved.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    # Guarded so that re-running this cell does not keep appending the same
    # entry to sys.path.
    sys.path.append(str(project_root))

# Project-local modules
import config
import utils
from scripts.feature_engineering import FeatureEngineer

# Reload so edits made to config.py / utils.py since the kernel started are
# picked up. `config` is reloaded before `utils`.
importlib.reload(config)
importlib.reload(utils)

# Sanity-check the configured paths before attempting to read any data
print(f"Data directory: {config.DATA_DIR}")
print(f"Train file path: {config.TRAIN_FILE}")
print(f"File exists: {config.TRAIN_FILE.exists()}")

# Load the four datasets returned by utils.load_data()
train_df, test_df, weather_df, sample_submission = utils.load_data()

# Preprocessing pipelines
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping
import numpy as np
import pandas as pd

# Hold-out configuration
TEST_SIZE = 0.2      # fraction of the training rows kept aside for validation
RANDOM_SEED = 42     # shared seed for the split and the models

# Separate the label ('Target') from the features; 'trip_id' is dropped too
y = train_df['Target']
X = train_df.drop(columns=['trip_id', 'Target'])

# Train / validation split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)


def rmse(true, pred):
    """Return the root mean squared error between `true` and `pred`."""
    return np.sqrt(mean_squared_error(true, pred))


print("Data loaded successfully!")
print(f"Training features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {list(X.columns)}")

Data directory: /Users/brunellaquaye/Documents/yango-accra-mobility-prediction/data/raw
Train file path: /Users/brunellaquaye/Documents/yango-accra-mobility-prediction/data/raw/Train.csv
File exists: True
Data loaded successfully!
Train: (57596, 10), Test: (24684, 9)
Weather: (744, 5), Sample: (24684, 2)
Data loaded successfully!
Training features shape: (57596, 8)
Target shape: (57596,)
Feature columns: ['destination_lat', 'destination_lon', 'lcl_start_transporting_dt', 'lcl_start_transporting_dttm', 'origin_lat', 'origin_lon', 'str_distance_km', 'transporting_distance_fact_km']
Data loaded successfully!
Training features shape: (57596, 8)
Target shape: (57596,)
Feature columns: ['destination_lat', 'destination_lon', 'lcl_start_transporting_dt', 'lcl_start_transporting_dttm', 'origin_lat', 'origin_lon', 'str_distance_km', 'transporting_distance_fact_km']


In [2]:
# Column groups for preprocessing: object/category columns are treated as
# categorical, every other column as numerical.
# NOTE(review): per the recorded output the categorical group consists of the
# two `lcl_start_transporting_*` timestamp columns, so timestamps are
# ordinal-encoded as strings and any value not seen during fit maps to -1.
# Consider parsing them into numeric time features instead.
categorical_dtypes = ["object", "category"]
cat_cols = list(X.select_dtypes(include=categorical_dtypes).columns)
num_cols = list(X.select_dtypes(exclude=categorical_dtypes).columns)

# Categorical branch: fill gaps with the literal "missing", then map each
# category to an integer code (unknown categories become -1).
cat_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
cat_pipe = Pipeline(cat_steps)

# Numerical branch: median imputation only
num_pipe = SimpleImputer(strategy="median")

# Numerical columns come first in the transformed output, then categorical
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)

# Fit on the training split only, then apply the fitted transform to validation
X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc = preprocessor.transform(X_val)

for label, value in (
    ("Training data shape", X_train_enc.shape),
    ("Validation data shape", X_val_enc.shape),
    ("Categorical columns", cat_cols),
    ("Numerical columns", num_cols),
):
    print(f"{label}: {value}")

Training data shape: (46076, 8)
Validation data shape: (11520, 8)
Categorical columns: ['lcl_start_transporting_dt', 'lcl_start_transporting_dttm']
Numerical columns: ['destination_lat', 'destination_lon', 'origin_lat', 'origin_lon', 'str_distance_km', 'transporting_distance_fact_km']


In [3]:
def _fit_and_score(estimator):
    """Fit `estimator` behind the shared preprocessor and score it on validation.

    Returns the fitted pipeline together with its validation RMSE.
    """
    # Every pipeline built here wraps the same `preprocessor` object, so each
    # fit refits that object on X_train.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", estimator),
    ])
    pipe.fit(X_train, y_train)
    return pipe, rmse(y_val, pipe.predict(X_val))


# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=300, random_state=RANDOM_SEED, n_jobs=-1)
rf_pipe, rf_rmse = _fit_and_score(rf_model)
print(f"Random Forest       RMSE: {rf_rmse:.4f}")

# Linear Regression Model
lr_model = LinearRegression()
lr_pipe, lr_rmse = _fit_and_score(lr_model)
print(f"Linear Regression   RMSE: {lr_rmse:.4f}")

Random Forest       RMSE: 4.6367
Linear Regression   RMSE: 19.2243


In [4]:
# LightGBM Model
# Wrap the encoded matrices in LightGBM datasets; the validation set is tied
# to the training set through `reference`.
lgb_train = lgb.Dataset(X_train_enc, label=y_train)
lgb_val = lgb.Dataset(X_val_enc, label=y_val, reference=lgb_train)

lgb_params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    num_threads=0,
    random_state=RANDOM_SEED,
)

# Stop once the validation metric has not improved for 20 rounds and log the
# metric every 10 rounds.
lgb_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=10),
]

lgb_model = lgb.train(
    lgb_params,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_val],
    callbacks=lgb_callbacks,
)

# Score with the best iteration found during early stopping
lgb_val_pred = lgb_model.predict(X_val_enc, num_iteration=lgb_model.best_iteration)
lgb_rmse = rmse(y_val, lgb_val_pred)
print(f"LightGBM (CPU)      RMSE: {lgb_rmse:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1817
[LightGBM] [Info] Number of data points in the train set: 46076, number of used features: 8
[LightGBM] [Info] Start training from score 10.095750
Training until validation scores don't improve for 20 rounds
[10]	valid_0's rmse: 6.30884
[20]	valid_0's rmse: 5.29908
[30]	valid_0's rmse: 4.93001
[40]	valid_0's rmse: 4.79448
[50]	valid_0's rmse: 4.7308
[60]	valid_0's rmse: 4.7382
[70]	valid_0's rmse: 4.74863
Early stopping, best iteration is:
[56]	valid_0's rmse: 4.72548
LightGBM (CPU)      RMSE: 4.7255
[40]	valid_0's rmse: 4.79448
[50]	valid_0's rmse: 4.7308
[60]	valid_0's rmse: 4.7382
[70]	valid_0's rmse: 4.74863
Early stopping, best iteration is:
[56]	valid_0's rmse: 4.72548
LightGBM (CPU)      RMSE: 4.7255


In [None]:
# XGBoost Model (scikit-learn estimator API)
# The estimator is fitted on the encoded arrays directly, so no xgb.DMatrix
# objects are built: the previous `dtrain` / `dval` were never used.

# Early stopping and the evaluation metric are configured on the estimator
# itself rather than passed to fit(), which is what the installed XGBoost
# version expects.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    n_estimators=1000,          # upper bound; early stopping may end sooner
    learning_rate=0.05,
    max_depth=8,
    n_jobs=0,
    random_state=RANDOM_SEED,
    early_stopping_rounds=30,   # rounds without validation improvement
    eval_metric="rmse"
)

# The validation split is the evaluation set that early stopping monitors
xgb_model.fit(
    X_train_enc,
    y_train,
    eval_set=[(X_val_enc, y_val)],
    verbose=False
)

xgb_rmse = rmse(y_val, xgb_model.predict(X_val_enc))
print(f"XGBoost (CPU)       RMSE: {xgb_rmse:.4f}")

# Model comparison: validation RMSE per model, keyed by display name.
# These keys are reused by later cells to select the best model.
rmse_scores = {
    "Random Forest": rf_rmse,
    "Linear Regression": lr_rmse,
    "LightGBM (CPU)": lgb_rmse,
    "XGBoost (CPU)": xgb_rmse
}

print("\nRMSE summary:", {k: f"{v:.4f}" for k, v in rmse_scores.items()})

XGBoost (CPU)       RMSE: 4.8375

RMSE summary: {'Random Forest': '4.6367', 'Linear Regression': '19.2243', 'LightGBM (CPU)': '4.7255', 'XGBoost (CPU)': '4.8375'}


<!-- # Model Training Notebook

This notebook is dedicated to training machine learning models for the Yango Accra Mobility Prediction Hackathon. The goal is to predict ride times using trip and weather data.

**Key Steps:**
1. Import required libraries
2. Load and preprocess data
3. Train/validation split
4. Train baseline models
5. Train advanced models (e.g., LightGBM, XGBoost)
6. Evaluate models using RMSE
7. Save the best model -->

In [9]:
# Pick the entry with the lowest validation RMSE (first one wins on ties)
best_model, best_score = min(rmse_scores.items(), key=lambda item: item[1])
print(f"Best model: {best_model}")
print(f"Best RMSE: {best_score:.4f}")

# Map each display name to the fitted object that should be used for
# prediction. RF and LR map to their full pipelines (preprocessing included),
# while the boosters map to the bare models trained on encoded data.
fitted_models = {
    "Random Forest": rf_pipe,
    "Linear Regression": lr_pipe,
    "LightGBM (CPU)": lgb_model,
    "XGBoost (CPU)": xgb_model,
}

# Falls back to None when the name is not in the table
best_model_obj = fitted_models.get(best_model)

print(f"Selected model object: {type(best_model_obj)}")

Best model: Random Forest
Best RMSE: 4.6367
Selected model object: <class 'sklearn.pipeline.Pipeline'>


In [10]:
# Generate submission file using the best model
# Prepare test data: same feature layout as training, minus the identifier
X_test = test_df.drop(columns=['trip_id'])

# Make predictions - handle both pipeline and direct models
if best_model in ["Random Forest", "Linear Regression"]:
    # Pipeline models (RF and LR) embed the preprocessor, so they take raw data
    test_predictions = best_model_obj.predict(X_test)
else:
    # Direct models (LightGBM and XGBoost) were trained on encoded data
    X_test_enc = preprocessor.transform(X_test)
    if best_model == "LightGBM (CPU)":
        test_predictions = best_model_obj.predict(X_test_enc, num_iteration=lgb_model.best_iteration)
    else:  # XGBoost
        test_predictions = best_model_obj.predict(X_test_enc)

# Create submission file
# NOTE(review): the training label column is named 'Target'; confirm that
# 'travel_time' matches the header expected by `sample_submission` before
# uploading.
submission = pd.DataFrame({
    'trip_id': test_df['trip_id'],
    'travel_time': test_predictions
})

# Save submission file. The output directory is created first because to_csv
# fails if it does not exist (e.g. on a fresh clone).
submission_path = config.PROJECT_ROOT / "outputs" / "submission.csv"
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission file saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print("\nFirst 5 predictions:")
print(submission.head())


Submission file saved to: /Users/brunellaquaye/Documents/yango-accra-mobility-prediction/outputs/submission.csv
Submission shape: (24684, 2)

First 5 predictions:
         trip_id  travel_time
0  ID_PV4QVE2H2X     4.582467
1  ID_SUOBMO2E7V     7.763400
2  ID_Q5KSL38U9B    15.543367
3  ID_1G08NWYA35    27.140967
4  ID_H7IZ8JL8YT     8.282500
