In [4]:
# Bootstrap: set repo root, show env, & auto-reload src/ code
import os, pathlib, sys, pandas as pd

# Ensure CWD is the repo root (parent of notebooks/)
if pathlib.Path().resolve().name == "notebooks":
    os.chdir("..")
print("CWD:", os.getcwd())
print("Python:", sys.version.split()[0], "| pandas:", pd.__version__)

# Auto-reload edited modules in src/ without full kernel restarts
%load_ext autoreload
%autoreload 2

CWD: /Users/abdulrahmanaboluhom/Documents/GitHub/Projects/indepth-driving-data-analysis
Python: 3.9.6 | pandas: 2.3.1
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from pathlib import Path
from src.ingest import load_journey_event_with_config
from src.validate import validate_journey_event

In [6]:
# Loader settings for the sample sheet, passed as `cfg` to
# load_journey_event_with_config in the next cell.
CFG = dict(
    sheet_name="Journey_Event_Sample",
    header="auto",
    usecols="B:BC",
    nrows=400,  # sample cap; drop it or use None when loading full data
    date_cols=["RTC Date Time", "GPS Date Time", "Event Time Stamp"],
    dayfirst=True,
    keep_cols=[
        # identifiers
        "Policy Number",
        "Voucher ID",
        "Journey ID",
        # timestamps
        "Event Time Stamp",
        "GPS Date Time",
        # position
        "Latitude",
        "Longitude",
        # event and speed
        "Event Type",
        "Horizontal Speed",
        "Road Speed Limit",
        # accumulated trip counters
        "Accumulated Trip Distance",
        "Accumulated Trip Idle Time",
        "Accumulated Trip Run Time",
    ],
    validate=True,  # run P-T-R on the keep set
)

# Workbook location, relative to the repo root
file = "data/raw/driving_sample.xlsx"

In [7]:
# Load the workbook using the CFG settings; the interim parquet path is handed
# to the loader via write_interim (presumably where the kept frame is saved -
# confirm in src/ingest).
df_keep = load_journey_event_with_config(
    cfg=CFG,
    xlsx_path=file,
    write_interim=Path("data/interim/journey_event_sample_keep.parquet"),
)

In [9]:
# Post-load sanity check: shape, the first 8 column names, and the dtypes of
# the timestamp/coordinate columns.
print("shape:", df_keep.shape)
print("columns:", list(df_keep.columns)[:8], "…")
print(df_keep[["Event Time Stamp","Latitude","Longitude"]].dtypes)

# validate_journey_event comes from the notebook's imports cell, so it is not
# re-imported here. The success line is printed unconditionally after the call,
# so this presumably relies on the validator raising on a bad schema - confirm
# in src/validate.
validate_journey_event(df_keep)
print("✅ schema ok")

shape: (384, 13)
columns: ['Policy Number', 'Voucher ID', 'Journey ID', 'Event Time Stamp', 'GPS Date Time', 'Latitude', 'Longitude', 'Event Type'] …
Event Time Stamp    datetime64[ns]
Latitude                   float64
Longitude                  float64
dtype: object
✅ schema ok


# Exploration

# Dev helpers (Archive)

In [None]:
# Archived, self-contained version of the config-driven load.
# Path is imported here because the call below uses it; without this the cell
# only works when an earlier cell has already imported Path.
from pathlib import Path

from src.ingest import load_journey_event_with_config

CFG = {
    "sheet_name": "Journey_Event_Sample",
    "header": "auto",
    "usecols": "B:BC",
    "nrows": 400,  # sample for speed; remove later for full load
    "date_cols": ["RTC Date Time","GPS Date Time","Event Time Stamp"],
    "dayfirst": True,
    "keep_cols": [
        "Policy Number","Voucher ID","Journey ID",
        "Event Time Stamp","GPS Date Time",
        "Latitude","Longitude",
        "Event Type","Horizontal Speed","Road Speed Limit",
        "Accumulated Trip Distance","Accumulated Trip Idle Time","Accumulated Trip Run Time",
    ],
    "validate": True,  # run P-T-R on the keep set
}

file = "data/raw/driving_sample.xlsx"

df_keep = load_journey_event_with_config(
    xlsx_path=file,
    cfg=CFG,
    write_interim=Path("data/interim/journey_event_sample_keep.parquet")  # or None if you don't want to write
)

# Displayed as a (frame preview, shape) tuple
df_keep.head(2), df_keep.shape

In [None]:
import importlib
import types

import src.ingest as ingest

# Force a re-import so edits to src/ingest.py are visible in this kernel
importlib.reload(ingest)

# List the module's public names (anything not starting with an underscore)
print(
    "Attributes on ingest:",
    [name for name in dir(ingest) if name[:1] != "_"],
)

In [None]:
from src.ingest import load_journey_event

In [None]:
# Manual smoke test of the lower-level loader, with every option spelled out
from src.ingest import load_journey_event

file = "data/raw/driving_sample.xlsx"
sheet = "Journey_Event_Sample"

df = load_journey_event(
    xlsx_path=file,
    sheet_name=sheet,
    # "auto" header handling; pass 0 instead to force it
    header="auto",
    # start at column B to skip the blank column A
    usecols="B:BC",
    # small sample so the cell runs quickly
    nrows=400,
    date_cols=["RTC Date Time", "GPS Date Time", "Event Time Stamp"],
    dayfirst=True,
)

# Preview of the first two rows together with the frame's shape
(df.head(2), df.shape)

In [None]:
# Data path & sheet list (Code)

from pathlib import Path
DATA = Path("data/raw")
file = DATA / "driving_sample.xlsx"  # <-- change to actual filename
print("File exists:", file.exists())

# Open the workbook in a context manager so its file handle is closed as soon
# as the sheet names are read, instead of staying open for the kernel's
# lifetime. `xls` remains bound afterwards but refers to a closed workbook.
with pd.ExcelFile(file) as xls:
    sheet_names = xls.sheet_names
sheet_names  # check the exact messy sheet name here

In [None]:
# Overview of the loaded frame
print("shape:", df.shape)
print("first 12 columns:", df.columns[:12].tolist())

# Split the three expected date columns into those found in df and those not
date_cols = ["RTC Date Time", "GPS Date Time", "Event Time Stamp"]
present = [name for name in date_cols if name in df.columns]
missing = [name for name in date_cols if name not in present]
print("date cols present:", present, "| missing:", missing)

# dtypes for the date columns that exist plus whichever key columns are available
cols_probe = present + [
    name
    for name in ("Event Type", "Latitude", "Longitude", "Horizontal Speed", "Road Speed Limit")
    if name in df.columns
]
df[cols_probe].dtypes

In [None]:
# Column audit: one record per column with its dtype, percentage of nulls
# (rounded to 2 decimals) and count of distinct non-null values.

rows = []
for col in df.columns:
    s = df[col]
    rows.append(
        dict(
            column=col,
            dtype=str(s.dtype),
            null_pct=round(100 * (1 - s.notna().mean()), 2),
            n_unique=int(s.nunique(dropna=True)),
        )
    )

# Order by dtype, then column name, and renumber the index from 0
audit = pd.DataFrame(rows)
audit = audit.sort_values(["dtype", "column"]).reset_index(drop=True)
audit.head(15)

In [None]:
# Peek at a few additional columns. The candidates are filtered down to those
# that exist in df first, and the same filtered list is used for the preview,
# so an absent column no longer raises KeyError.
probe = ["Event Type","Delta Trip Distance","Direction","Altitude","GPS Accuracy"]
probe_present = [c for c in probe if c in df.columns]
probe_present, df[probe_present].head(2)