# Step 1: Load & Explore Dataset

In [4]:
import pandas as pd

# Read the raw listings CSV (expected in the current working directory)
df = pd.read_csv("vehicles.csv")

# Show the frame's (rows, columns) shape, then a preview of its first rows
print(df.shape, df.head(), sep="\n")


(426880, 26)
           id                                                url  \
0  7222695916  https://prescott.craigslist.org/cto/d/prescott...   
1  7218891961  https://fayar.craigslist.org/ctd/d/bentonville...   
2  7221797935  https://keys.craigslist.org/cto/d/summerland-k...   
3  7222270760  https://worcester.craigslist.org/cto/d/west-br...   
4  7210384030  https://greensboro.craigslist.org/cto/d/trinit...   

                   region                         region_url  price  year  \
0                prescott    https://prescott.craigslist.org   6000   NaN   
1            fayetteville       https://fayar.craigslist.org  11900   NaN   
2            florida keys        https://keys.craigslist.org  21000   NaN   
3  worcester / central MA   https://worcester.craigslist.org   1500   NaN   
4              greensboro  https://greensboro.craigslist.org   4900   NaN   

  manufacturer model condition cylinders  ... size  type paint_color  \
0          NaN   NaN       NaN       NaN  .

# Step 2: Keep Useful Columns

In [5]:
# Keep only the target ("price") and the predictor columns, then discard
# every row that has a missing value in any of those columns.
df = (
    df[["price", "year", "manufacturer", "model", "fuel", "odometer", "transmission"]]
    .dropna()
)


# Step 3: Filter Outliers

In [6]:
# Keep rows whose price lies strictly between 1,000 and 100,000 and whose
# odometer reading lies strictly between 0 and 300,000 (all bounds exclusive).
df = df[
    df["price"].gt(1000) & df["price"].lt(100000)
    & df["odometer"].gt(0) & df["odometer"].lt(300000)
]


# Step 4: Take a Sample of 200,000 Rows

In [7]:
# Draw a reproducible random sample of up to 200,000 rows.
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap the request at the number of
# rows that survived the earlier filtering steps.
sample_size = min(200000, len(df))
df_sample = df.sample(sample_size, random_state=42)
print(df_sample.shape)


(200000, 7)


# Step 5: Encode Categorical Features

In [8]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that must be turned into integer codes before training
categorical_cols = ["manufacturer", "model", "fuel", "transmission"]

# One independent encoder per column, keyed by column name, so each fitted
# mapping stays available after this cell has run.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# Fit each encoder on its column and overwrite the column with the integer codes
for col, le in label_encoders.items():
    df_sample[col] = le.fit_transform(df_sample[col])


# Step 6: Train/Test Split & Model Training

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pickle

# Features and target: every column except "price" is a predictor
X = df_sample.drop("price", axis=1)
y = df_sample["price"]

# Hold out 20% of the sampled rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest with 100 trees (n_jobs=-1 uses all available processors)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Evaluate on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")


# Save the model, the feature order, and the fitted encoders

with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

# Column order the model was trained on; inference inputs must match it
with open("features.pkl", "wb") as f:
    pickle.dump(list(X.columns), f)

# The model was trained on the integer codes produced by the fitted
# LabelEncoders. Persist them as well, otherwise raw categorical values
# cannot be mapped to the same codes when the saved model is used later.
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

print("Model and feature names saved (model.pkl, features.pkl)")


Model Evaluation:
MAE: 2001.31
RMSE: 4272.86
R² Score: 0.9079
Model and feature names saved (model.pkl, features.pkl)


# Future work: Use an XGBoost model