In [1]:
# Config
import sys
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent if __name__ == "__main__" else Path.cwd().parent
sys.path.append(str(project_root))

import config
import utils
from scripts.feature_engineering import FeatureEngineer

# Load and clean data
train_df, test_df = utils.load_data()

# Preprocessing pipelines
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping
import numpy as np
import pandas as pd

# Constants
TEST_SIZE = 0.2
RANDOM_SEED = 42

# Train / Val Split
X = train_df.drop(columns=['trip_id', 'travel_time'])
y = train_df['travel_time']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# RMSE helper function
rmse = lambda true, pred: np.sqrt(mean_squared_error(true, pred))

NameError: name '__file__' is not defined

In [None]:
    "# Generate submission file
",
# Load test data
test_df = pd.read_csv("../data/raw/Test.csv")

# Store trip_ids first before any preprocessing
trip_ids = test_df['trip_id'].copy()

# Load weather data for test set
weather_df = pd.read_csv("../data/raw/Accra_weather.csv")
weather_df['lcl_datetime'] = pd.to_datetime(weather_df['lcl_datetime'])
weather_df['weather_hour'] = weather_df['lcl_datetime'].dt.floor('H')

# Convert datetime columns in test data
test_df['lcl_start_transporting_dttm'] = pd.to_datetime(test_df['lcl_start_transporting_dt'])
test_df['trip_hour'] = test_df['lcl_start_transporting_dttm'].dt.floor('H')

# Extract time features
test_df['hour'] = test_df['lcl_start_transporting_dttm'].dt.hour
test_df['day_of_week'] = test_df['lcl_start_transporting_dttm'].dt.dayofweek
test_df['day_of_month'] = test_df['lcl_start_transporting_dttm'].dt.day
test_df['is_weekend'] = (test_df['day_of_week'] >= 5).astype(int)

# Define rush hours
def get_rush_hour(hour):
    if 7 <= hour <= 9 or 17 <= hour <= 19:
        return 1
    return 0

test_df['is_rush_hour'] = test_df['hour'].apply(get_rush_hour)

# Calculate haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    from math import radians, cos, sin, asin, sqrt
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Earth radius in kilometers
    return c * r

test_df['haversine_distance_km'] = test_df.apply(
    lambda row: haversine_distance(
        row['origin_lat'], row['origin_lon'], 
        row['destination_lat'], row['destination_lon']
    ), axis=1
)

# Merge with weather data
test_df = test_df.merge(
    weather_df[['weather_hour', 'precipitation_type', 'prev_hour_precipitation_mm', 'temperature_C']],
    left_on='trip_hour',
    right_on='weather_hour',
    how='left'
)

# Drop unnecessary columns and ensure same structure as training data
test_df = test_df.drop(columns=[c for c in DROP_COLS if c in test_df.columns], errors="ignore")

# Preprocess test data using the same preprocessor
test_enc = preprocessor.transform(test_df)

# Make predictions using the XGBoost model
test_predictions = xgb_model.predict(test_enc)

# Create submission DataFrame
submission = pd.DataFrame({
    "trip_id": trip_ids,
    "Target": test_predictions
})

# Save submission file
submission_path = "../outputs/submission.csv"
submission.to_csv(submission_path, index=False)
print(f"\nSubmission file saved to: {submission_path}")
print(f"Shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head())

  weather_df['weather_hour'] = weather_df['lcl_datetime'].dt.floor('H')
  test_df['trip_hour'] = test_df['lcl_start_transporting_dttm'].dt.floor('H')


NotFittedError: need to call fit or load_model beforehand

In [None]:
# # ==============================================================
# #  Yango Accra Mobility – Step 2: Model Training (Apple‑Silicon)
# # ==============================================================

# import pandas as pd, numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline, make_pipeline
# from sklearn.metrics import mean_squared_error
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression

# import lightgbm as lgb
# import xgboost as xgb

# # ----------------- Config ----------------------------------------------------
# RANDOM_SEED   = 42
# TARGET        = "Target"
# FILE_PATH     = "../data/raw/train_with_weather.csv"
# DROP_COLS     = ["trip_id", "ID"]             # drop if present
# TEST_SIZE     = 0.20

# # ----------------- 1. Load Dataset ------------------------------------------
# df = pd.read_csv(FILE_PATH)

# # ----------------- 2. Separate Target & Drop IDs ----------------------------
# y = df[TARGET]
# X = df.drop(columns=[TARGET] + [c for c in DROP_COLS if c in df.columns],
#             errors="ignore")

# # ----------------- 3. Column Groups -----------------------------------------
# cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# num_cols = [c for c in X.columns if c not in cat_cols]

# # ----------------- 4. Preprocessor (Impute + Ordinal Encode) -----------------
# cat_pipe = make_pipeline(
#     SimpleImputer(strategy="constant", fill_value="missing"),
#     OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
# )
# num_pipe = SimpleImputer(strategy="median")

# pre = ColumnTransformer(
#     transformers=[
#         ("num", num_pipe, num_cols),
#         ("cat", cat_pipe, cat_cols)
#     ]
# )

# # ----------------- 5. Train / Validation Split ------------------------------
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
# )

# rmse = lambda t, p: np.sqrt(mean_squared_error(t, p))

# # ----------------- 6. Random Forest -----------------------------------------
# rf_pipe = Pipeline([
#     ("pre", pre),
#     ("model", RandomForestRegressor(
#         n_estimators=300,
#         n_jobs=-1,                 # use all CPU cores on M‑series
#         random_state=RANDOM_SEED))
# ])
# rf_pipe.fit(X_train, y_train)
# rf_rmse = rmse(y_val, rf_pipe.predict(X_val))
# print(f"Random Forest       RMSE: {rf_rmse:.4f}")

# # ----------------- 7. Linear Regression -------------------------------------
# lr_pipe = Pipeline([("pre", pre), ("model", LinearRegression())])
# lr_pipe.fit(X_train, y_train)
# lr_rmse = rmse(y_val, lr_pipe.predict(X_val))
# print(f"Linear Regression   RMSE: {lr_rmse:.4f}")

# # ----------------- 8. Prepare Encoded Matrices for Boosters ------------------
# X_train_enc = pre.fit_transform(X_train)
# X_val_enc   = pre.transform(X_val)

# # keep sparse matrices sparse for speed / memory
# if hasattr(X_train_enc, "toarray"):  # (it’s sparse if OneHot used; safe check)
#     X_train_enc = X_train_enc
#     X_val_enc   = X_val_enc

# # ----------------- 9. LightGBM (CPU) ----------------------------------------
# lgb_train = lgb.Dataset(X_train_enc, label=y_train)
# lgb_val   = lgb.Dataset(X_val_enc,   label=y_val, reference=lgb_train)

# lgb_params = dict(
#     objective      = "regression",
#     metric         = "rmse",
#     boosting_type  = "gbdt",
#     num_leaves     = 31,
#     learning_rate  = 0.05,
#     num_threads    = 0,        # 0 = all CPU cores
#     random_state   = RANDOM_SEED
# )

# lgb_model = lgb.train(
#     lgb_params,
#     lgb_train,
#     valid_sets        = [lgb_val],
#     num_boost_round   = 500,
#     early_stopping_rounds = 20,
#     verbose_eval      = False
# )
# lgb_rmse = rmse(y_val, lgb_model.predict(X_val_enc))
# print(f"LightGBM (CPU)     RMSE: {lgb_rmse:.4f}")

# # ----------------- 10. XGBoost (CPU‑hist) ------------------------------------
# xgb_model = xgb.XGBRegressor(
#     objective          = "reg:squarederror",
#     eval_metric        = "rmse",
#     tree_method        = "hist",      # fastest CPU algorithm
#     learning_rate      = 0.05,
#     n_estimators       = 1000,
#     max_depth          = 8,
#     enable_categorical = True,        # works with ordinal ints
#     n_jobs             = 0,           # all cores
#     random_state       = RANDOM_SEED
# )
# xgb_model.fit(
#     X_train_enc, y_train,
#     eval_set=[(X_val_enc, y_val)],
#     early_stopping_rounds=30,
#     verbose=False
# )
# xgb_rmse = rmse(y_val, xgb_model.predict(X_val_enc))
# print(f"XGBoost (CPU)      RMSE: {xgb_rmse:.4f}")

# # ----------------- 11. Summary ----------------------------------------------
# rmse_scores = {
#     "Random Forest"     : rf_rmse,
#     "Linear Regression" : lr_rmse,
#     "LightGBM (CPU)"    : lgb_rmse,
#     "XGBoost (CPU)"     : xgb_rmse
# }
# best_model = min(rmse_scores, key=rmse_scores.get)
# print("\nRMSE summary:", {k: f"{v:.4f}" for k, v in rmse_scores.items()})
# print(f"Best model: {best_model}")


Random Forest       RMSE: 4.4665
Linear Regression   RMSE: 12.2044
Linear Regression   RMSE: 12.2044


TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

<!-- # Model Training Notebook

This notebook is dedicated to training machine learning models for the Yango Accra Mobility Prediction Hackathon. The goal is to predict ride times using trip and weather data.

**Key Steps:**
1. Import required libraries
2. Load and preprocess data
3. Train-test split
4. Train baseline models
5. Train advanced models (e.g., LightGBM, XGBoost)
6. Evaluate models using RMSE
7. Save the best model -->

In [None]:
# # Import required libraries
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# import lightgbm as lgb
# import xgboost as xgb
# import joblib

# print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
best_model = min(rmse_scores, key=rmse_scores.get)
print(f"Best model: {best_model}")
print(f"Best RMSE: {rmse_scores[best_model]:.4f}")

# Save the best model
best_model_obj = None
if best_model == "Random Forest":
    best_model_obj = rf_model
elif best_model == "Linear Regression": 
    best_model_obj = lr_model
elif best_model == "LightGBM (CPU)":
    best_model_obj = lgb_model
elif best_model == "XGBoost (CPU)":
    best_model_obj = xgb_model

Dataset loaded successfully!
Dataset shape: (57596, 21)
         trip_id  destination_lat  destination_lon lcl_start_transporting_dt  \
0  ID_S3BD1V9G53         5.630927        -0.169211                2024-05-05   
1  ID_ZJM7LMN65Q         5.645044        -0.156482                2024-05-21   
2  ID_SZ3BP6V01V         5.711156        -0.141063                2024-05-05   
3  ID_5IPHXDCMKF         5.677497        -0.183350                2024-05-26   
4  ID_BYZEJ0B5RA         5.601700        -0.173589                2024-05-30   

  lcl_start_transporting_dttm  origin_lat  origin_lon  str_distance_km  \
0         2024-05-05 09:56:32    5.630979   -0.164760            0.529   
1         2024-05-21 10:53:32    5.686892   -0.118931            6.230   
2         2024-05-05 21:21:21    5.706008   -0.164999            2.705   
3         2024-05-26 21:23:33    5.665943   -0.182602            1.236   
4         2024-05-30 14:02:13    5.565401   -0.160919            4.312   

   transporting_di

In [None]:
# # Preprocess data
# # Ensure all features are numeric and handle missing values
# data = data.select_dtypes(include=[np.number]).dropna()

# # Define target and features
# target = 'Target'
# features = [col for col in data.columns if col != target]

# X = data[features]
# y = data[target]

# print("Data preprocessing completed!")
# print(f"Features: {features}")

Data preprocessing completed!
Features: ['destination_lat', 'destination_lon', 'origin_lat', 'origin_lon', 'str_distance_km', 'transporting_distance_fact_km', 'hour', 'day_of_week', 'day_of_month', 'is_weekend', 'is_rush_hour', 'haversine_distance_km', 'prev_hour_precipitation_mm', 'temperature_C']


In [None]:
# Create submission file
test_predictions = best_model_obj.predict(X_test_enc)
submission = pd.DataFrame({
    'trip_id': test_df['trip_id'],
    'travel_time': test_predictions
})

# Save submission file
submission_path = config.PROJECT_ROOT / "submission.csv"
submission.to_csv(submission_path, index=False)

print(f"\nSubmission file saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print("\nFirst 5 predictions:")
print(submission.head())

Train-test split completed!
Training set size: 46071 samples
Test set size: 11518 samples


In [None]:
# # Train baseline models
# # Linear Regression
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train)
# lr_predictions = lr_model.predict(X_test)
# lr_mse = mean_squared_error(y_test, lr_predictions)
# lr_rmse = np.sqrt(lr_mse)

# print(f"Linear Regression RMSE: {lr_rmse:.2f}")

# # Random Fores\t
# rf_model = RandomForestRegressor(random_state=42)
# rf_model.fit(X_train, y_train)
# rf_predictions = rf_model.predict(X_test)
# rf_mse = mean_squared_error(y_test, rf_predictions)
# rf_rmse = np.sqrt(rf_mse)

# print(f"Random Forest RMSE: {rf_rmse:.2f}")

Linear Regression RMSE: 5.63

Random Forest RMSE: 6.07
Random Forest RMSE: 6.07


In [None]:
# # # Train advanced models
# # # LightGBM
# lgb_train = lgb.Dataset(X_train, label=y_train)
# lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# lgb_params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'boosting_type': 'gbdt',
#     'learning_rate': 0.1,
#     'num_leaves': 31,
#     'random_state': 42,
#     'verbosity': -1
# }

# # Train LightGBM model
# evals_result = {}
# lgb_model = lgb.train(
#     params=lgb_params,
#     train_set=lgb_train,
#     valid_sets=[lgb_train, lgb_test],
#     num_boost_round=100,
#     callbacks=[
#         lgb.callback.early_stopping(stopping_rounds=20),
#         lgb.callback.record_evaluation(evals_result)
#     ]
# )

# lgb_predictions = lgb_model.predict(X_test)
# lgb_mse = mean_squared_error(y_test, lgb_predictions)
# lgb_rmse = np.sqrt(lgb_mse)

# print(f"LightGBM RMSE: {lgb_rmse:.2f}")

# # # XGBoost
# xgb_model = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     random_state=42,
#     n_estimators=100,
#     learning_rate=0.1
# )
# xgb_model.fit(
#     X_train,
#     y_train,
#     eval_set=[(X_test, y_test)],
#     early_stopping_rounds=20,
#     verbose=False
# )
# xgb_predictions = xgb_model.predict(X_test)
# xgb_mse = mean_squared_error(y_test, xgb_predictions)
# xgb_rmse = np.sqrt(xgb_mse)

# print(f"XGBoost RMSE: {xgb_rmse:.2f}")

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: lcl_start_transporting_dt: object, lcl_start_transporting_dttm: object, trip_hour: object, weather_hour: object, precipitation_type: object

In [None]:
# # Save the best model
# best_model = min([(lr_model, lr_rmse), (rf_model, rf_rmse), (lgb_model, lgb_rmse), (xgb_model, xgb_rmse)], key=lambda x: x[1])[0]
# joblib.dump(best_model, '../outputs/best_model.pkl')

# print("Best model saved successfully!")

In [None]:
# # Generate submission file
# submission = pd.DataFrame({
#     'trip_id': X_test.index,
#     'Target': xgb_predictions
# })

# submission_file_path = '../outputs/submission.csv'
# submission.to_csv(submission_file_path, index=False)

# print(f"Submission file saved to {submission_file_path}")