<a href="https://colab.research.google.com/github/atoholj/used-car-price-analysis/blob/main/02_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ==============================
# Used Cars - Modeling
# ==============================

# --- Step 1: Upload the cleaned dataset (vehicles_clean.csv) ---
import io

from google.colab import files

print("Upload vehicles_clean.csv (downloaded from Notebook 1)…")
# files.upload() blocks until the dialog is closed and returns a mapping of
# {file name: file contents as bytes} for everything that was selected.
uploaded = files.upload()  # select vehicles_clean.csv from your computer

# --- Step 2: Load cleaned data ---
import pandas as pd
import numpy as np
from pathlib import Path

# Read the bytes that were just uploaded instead of re-opening a fixed path.
# When a file called vehicles_clean.csv already exists in the runtime, Colab
# stores the new upload under a different name ("vehicles_clean (1).csv"), so
# reading "vehicles_clean.csv" from disk would silently load the older copy.
if uploaded:
    if "vehicles_clean.csv" in uploaded:
        upload_name = "vehicles_clean.csv"
    else:
        # The file was renamed locally before upload: take the first one selected.
        upload_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_name]))
else:
    # Nothing was selected in the dialog: fall back to a copy already on disk.
    df = pd.read_csv("vehicles_clean.csv")

print("Data shape:", df.shape)
# A bare expression in the middle of a cell is not displayed, so print the preview.
print(df.head(3))

# --- Step 3: Prepare features and target ---
target = "price"
y = df[target]

# Candidate predictors; only the ones actually present in the loaded file are used.
candidate_cols = [
    "year", "odometer", "age", "miles_per_year", "brand_tier",
    "condition_bucket", "fuel", "transmission", "drive", "type", "state",
]
use_cols = [c for c in candidate_cols if c in df.columns]

X = df[use_cols].copy()

# Route every selected feature to exactly one group. Matching on "number" covers
# all numeric dtypes (not just int64/float64), and everything else is treated as
# categorical, so no selected column can fall outside both lists and be left out
# of the preprocessing step.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [c for c in use_cols if c not in num_cols]

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

# --- Step 4: Train/test split + pipeline ---
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Numeric columns go through untouched; categorical columns are one-hot encoded
# into a dense array, ignoring categories that were not seen during fitting.
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", cat_encoder, cat_cols),
    ]
)

# 200-tree random forest using all available cores, seeded for reproducibility.
model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200)

# Preprocessing and model are chained so both are fitted on the training rows only.
pipe = Pipeline(steps=[("pre", pre), ("mdl", model)])
pipe.fit(X_train, y_train)

# --- Step 5: Evaluate model ---
preds = pipe.predict(X_test)
mae = mean_absolute_error(y_test, preds)
# Take the square root of the MSE explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
r2 = r2_score(y_test, preds)

print(f"Holdout MAE:  ${mae:,.0f}")
print(f"Holdout RMSE: ${rmse:,.0f}")
print(f"Holdout R²:   {r2:.3f}")

# --- Step 6: Permutation importance ---
from sklearn.inspection import permutation_importance

# The full pipeline is scored while shuffling the raw columns of X_test, so the
# result holds one importance value per input column (not per one-hot feature).
r = permutation_importance(pipe, X_test, y_test, n_repeats=3, random_state=42, n_jobs=-1)

# Label the scores with the raw input column names. The encoder's expanded
# feature names have a different length than importances_mean and would make
# the Series constructor raise a length-mismatch ValueError.
feat_names = X_test.columns.tolist()
importances = pd.Series(r.importances_mean, index=feat_names).sort_values(ascending=False)

print("\nTop 20 important features:")
print(importances.head(20))

# --- Step 7: Save artifacts ---
import joblib

# Output files written to the runtime's working directory.
model_path = "price_model.joblib"
importances_path = "top_permutation_importances.csv"

# Persist the fitted pipeline (preprocessing + model) and the top-20 importances.
joblib.dump(pipe, model_path)
top_importances = importances.head(20)
top_importances.to_csv(importances_path)
print("\nSaved: price_model.joblib, top_permutation_importances.csv")

# --- Step 8: Download artifacts back to your computer (optional) ---
for artifact_path in (model_path, importances_path):
    files.download(artifact_path)


Upload vehicles_clean.csv (downloaded from Notebook 1)…


Saving vehicles_clean.csv to vehicles_clean (1).csv
Data shape: (371009, 22)
Numerical columns: ['year', 'odometer', 'age', 'miles_per_year']
Categorical columns: ['brand_tier', 'condition_bucket', 'fuel', 'transmission', 'drive', 'type', 'state']


KeyboardInterrupt: 