In [None]:
# 01_exploration_and_cleaning.ipynb

from pathlib import Path
import sys

import pandas as pd
import matplotlib.pyplot as plt

# Make plots show inline in the notebook (rendered directly below the cell that
# draws them). Recent Jupyter/IPython versions usually do this by default, so the
# explicit magic is harmless and kept for older environments.
%matplotlib inline

# --- Make sure we can import from src/ ---

# Directory the kernel was started in. This is normally
# .../cs439-final-project/notebooks, but the lookup below no longer depends on it.
NOTEBOOK_DIR = Path.cwd()

# Project root = the nearest directory (working directory first, then each
# ancestor) that contains a src/ folder. This keeps the notebook working when the
# kernel is launched from the project root or from a subfolder of notebooks/.
# If no src/ folder is found, fall back to the parent of the working directory,
# which is the layout this notebook originally assumed.
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / "src").is_dir()
    ),
    NOTEBOOK_DIR.parent,
)

# Add project root to Python path so `src` is importable. The membership check
# keeps repeated runs of this cell from appending duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.cleaning import load_and_clean_epa, load_and_clean_sports
from src.aggregates import compute_epa_yearly, compute_sports_yearly

# Echo the resolved directories so a wrong root is visible immediately.
PROJECT_ROOT, NOTEBOOK_DIR

In [None]:
# Raw input files live under <project root>/data/raw
raw_data_dir = PROJECT_ROOT / "data" / "raw"
epa_raw_path = raw_data_dir / "all-vehicles-model.csv"
sports_raw_path = raw_data_dir / "Sport car price.csv"

print("EPA raw path:    ", epa_raw_path)
print("Sports raw path: ", sports_raw_path)

# Both loaders from src/cleaning.py receive the same year bounds.
# NOTE(review): presumably these bounds filter rows by model year — confirm in src/cleaning.py.
year_bounds = {"year_min": 2000, "year_max": 2025}
epa_clean = load_and_clean_epa(epa_raw_path, **year_bounds)
sports_clean = load_and_clean_sports(sports_raw_path, **year_bounds)

# Show the shape of each cleaned dataset as the cell output
epa_clean.shape, sports_clean.shape


In [None]:
# EPA data: first rows, then the first rows of the transposed describe() summary
display(
    epa_clean.head(),
    epa_clean.describe(include="all").T.head(),
)

# Sorted distinct years; only the five earliest and five latest are printed
epa_years = sorted(epa_clean["Year"].unique())
print("\nEPA years:", epa_years[:5], "...", epa_years[-5:])

# Sports car data: same two views
display(
    sports_clean.head(),
    sports_clean.describe(include="all").T.head(),
)

# Every distinct year is printed for the sports data
sports_years = sorted(sports_clean["Year"].unique())
print("\nSports years:", sports_years)


In [None]:
# Build the per-year tables with the helpers from src/aggregates.py
epa_yearly = compute_epa_yearly(epa_clean)
sports_yearly = compute_sports_yearly(sports_clean)

# Show the first and last rows of each table, EPA first
display(epa_yearly.head(), epa_yearly.tail())
display(sports_yearly.head(), sports_yearly.tail())


In [None]:
# --- EPA: fuel economy & CO2 over time ---

# One entry per figure: (column in epa_yearly, y-axis label, figure title).
# NOTE(review): the double space in "Co2  Tailpipe ..." presumably mirrors the
# column name produced upstream — confirm before normalizing it.
epa_trend_specs = [
    (
        "Combined Mpg For Fuel Type1",
        "Average Combined MPG",
        "Average Combined MPG (EPA Vehicles) by Year",
    ),
    (
        "Co2  Tailpipe For Fuel Type1",
        "Average CO₂ Tailpipe Emissions (g/mile)",
        "Average CO₂ Tailpipe Emissions by Year",
    ),
]

# Draw each series against Year on its own figure, shown in list order
for column, y_label, title in epa_trend_specs:
    fig, ax = plt.subplots()
    ax.plot(epa_yearly["Year"], epa_yearly[column])
    ax.set_xlabel("Year")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)
    plt.show()


In [None]:
# --- Sports cars: horsepower & 0–60 over time ---

# One entry per figure: (column in sports_yearly, y-axis label, figure title)
sports_trend_specs = [
    (
        "Horsepower",
        "Average Horsepower",
        "Average Sports Car Horsepower by Year",
    ),
    (
        "0-60 MPH Time (seconds)",
        "Average 0–60 Time (s)",
        "Average Sports Car 0–60 Time by Year",
    ),
]

# Draw each series against Year on its own figure, shown in list order
for column, y_label, title in sports_trend_specs:
    fig, ax = plt.subplots()
    ax.plot(sports_yearly["Year"], sports_yearly[column])
    ax.set_xlabel("Year")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)
    plt.show()
