In [2]:
# Adam Robles
# CS4680 - Assignment 1: Machine Learning Exercise 

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Let pandas print up to 200 columns so wide tables are not truncated.
pd.set_option("display.max_columns", 200)

# Load the dataset (relative path, resolved against the working directory).
DATA_PATH = "car_sales_data.csv"
df = pd.read_csv(DATA_PATH)

# Convert strings with numbers (like "$12,345" or "150,000 mi") into floats
def to_float(x):
    """Parse a loosely formatted numeric value into a float.

    Every character other than digits, ``.`` and ``-`` is stripped before
    parsing, so currency symbols, thousands separators and unit suffixes
    are ignored.

    Args:
        x: Any scalar; it is stringified before cleaning.

    Returns:
        The parsed float, or ``np.nan`` when ``x`` is missing or when the
        characters left after cleaning do not form a valid number
        (for example ``""``, ``"-"``, ``"1.2.3"`` or ``"10-20"``).
    """
    if pd.isna(x):
        return np.nan
    s = re.sub(r"[^0-9.\-]", "", str(x))
    try:
        return float(s)
    except ValueError:
        # One malformed cell should become a missing value rather than
        # abort the conversion of the whole column.
        return np.nan

# Apply conversion to numeric columns
num_cols = ["Engine size", "Year of manufacture", "Mileage", "Price"]
for c in num_cols:
    df[c] = df[c].apply(to_float)

# Clean text columns and fill missing values.
# astype(str) can turn a missing cell into the literal string "nan", which
# fillna() does not treat as missing. Remember which cells held a value,
# blank out the rest again after stripping, and only then apply "Unknown".
cat_cols = ["Manufacturer", "Model", "Fuel type"]
for c in cat_cols:
    was_present = df[c].notna()
    stripped = df[c].astype(str).str.strip()
    df[c] = stripped.where(was_present).replace({"": np.nan})
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Drop rows without price and fill missing numbers with the median
df = df.dropna(subset=["Price"]).reset_index(drop=True)
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

# Medians used as the reference point of the simple estimated price formula
med_price  = df["Price"].median()
med_engine = df["Engine size"].median()
med_year   = df["Year of manufacture"].median()
med_miles  = df["Mileage"].median()

def naive_price(row):
    """Estimate a price from the distance to the dataset medians.

    The estimate starts at the median price and adds three linear
    adjustments relative to the module-level medians: +2000 per unit of
    engine size, +800 per year of manufacture, and -500 per 10,000 units
    of mileage.

    Args:
        row: Object indexable by "Engine size", "Year of manufacture" and
            "Mileage" whose values support arithmetic.

    Returns:
        The unclipped estimate; the adjustments can push it below zero.
    """
    engine_adj = 2000 * (row["Engine size"] - med_engine)
    year_adj = 800 * (row["Year of manufacture"] - med_year)
    mileage_adj = -500 * ((row["Mileage"] - med_miles) / 10000.0)
    # Summed left to right, starting from the median price.
    return med_price + engine_adj + year_adj + mileage_adj

# Estimate every price in one vectorized pass. naive_price only indexes its
# argument by column name and does arithmetic on the result, so passing the
# whole frame yields the same per-row values as a row-by-row apply without
# the per-row Python overhead. Estimates are floored at 0.
df["Estimated price"] = naive_price(df).clip(lower=0)

# Mark deals as good or bad depending on $5,000 tolerance: a listing whose
# price is more than 5,000 away from the estimate, in either direction, is
# labelled "Bad deal".
diff = (df["Price"] - df["Estimated price"]).abs()
df["Good or bad deal"] = np.where(diff > 5000, "Bad deal", "Good deal")

# Features are the listing attributes; the target is the actual price.
feature_cols = [
    "Manufacturer",
    "Model",
    "Engine size",
    "Fuel type",
    "Year of manufacture",
    "Mileage",
]
X = df.loc[:, feature_cols].copy()
y = df.loc[:, "Price"].copy()

# One-hot encode the text columns, dropping the first level of each.
X_enc = pd.get_dummies(
    X,
    columns=["Manufacturer", "Model", "Fuel type"],
    drop_first=True,
)

# Hold out 20% of the rows for testing. The row labels are split alongside
# the data so the naive estimate can be looked up for the same test rows.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_enc,
    y,
    df.index,
    test_size=0.2,
    random_state=42,
)

# Fit both regressors on the training split.
lin = LinearRegression()
lin.fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Predictions on the held-out rows, plus the naive estimate for those rows.
pred_lin = lin.predict(X_test)
pred_rf = rf.predict(X_test)
pred_naive = df.loc[idx_test, "Estimated price"].values

# Define metric functions and print results
def rmse_compat(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The square root is taken explicitly instead of passing
    ``squared=False`` to ``mean_squared_error``. That keyword is deprecated
    in newer scikit-learn releases, so relying on it warns or raises
    depending on the installed version. This form does not depend on it.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def report_metrics(name, y_true, y_pred):
    """Print one aligned summary line of MAE, RMSE and R² for a model.

    Args:
        name: Label for the row; padded to 14 characters in the output.
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        None. The line is written to standard output.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = rmse_compat(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Errors are shown as whole numbers with thousands separators,
    # R² with three decimals.
    line = f"{name:14s} | MAE: {mae:,.0f} | RMSE: {rmse:,.0f} | R²: {r2:0.3f}"
    print(line)

# Compare the three predictors on the same held-out rows. The header spells
# out the direction of each metric, because R² improves upwards while the
# two error metrics improve downwards.
print(
    "\n=== Test-set price prediction metrics "
    "(MAE/RMSE: lower is better; R²: higher is better) ==="
)
for model_name, model_pred in (
    ("Naive baseline", pred_naive),
    ("LinearReg", pred_lin),
    ("RandomForest", pred_rf),
):
    report_metrics(model_name, y_test, model_pred)

# Build final table with results: the listing attributes first, followed by
# the actual price, the naive estimate and the deal label.
result = df[
    [
        "Manufacturer",
        "Model",
        "Engine size",
        "Fuel type",
        "Year of manufacture",
        "Mileage",
    ]
].copy()

# (column name in the table, column it is taken from in df)
for out_col, src_col in (
    ("Real price", "Price"),
    ("Estimated price", "Estimated price"),
    ("Good or bad deal", "Good or bad deal"),
):
    result[out_col] = df[src_col]

# Show the final table (bare expression as the last statement of the cell)
result


=== Test-set price prediction metrics (lower is better) ===
Naive baseline | MAE: 5,285 | RMSE: 11,462 | R²: 0.516
LinearReg      | MAE: 5,786 | RMSE: 8,868 | R²: 0.710
RandomForest   | MAE: 293 | RMSE: 647 | R²: 0.998


Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Real price,Estimated price,Good or bad deal
0,Ford,Fiesta,1.0,Petrol,2002.0,127300.0,3074.0,3855.875,Good deal
1,Porsche,718 Cayman,4.0,Petrol,2016.0,57850.0,49704.0,24528.375,Bad deal
2,Ford,Mondeo,1.6,Diesel,2014.0,39190.0,24072.0,19061.375,Bad deal
3,Toyota,RAV4,1.8,Hybrid,1988.0,210814.0,1705.0,0.000,Good deal
4,VW,Polo,1.0,Petrol,2006.0,127869.0,4101.0,7027.425,Good deal
...,...,...,...,...,...,...,...,...,...
49995,BMW,M5,5.0,Petrol,2018.0,28664.0,113006.0,29587.675,Bad deal
49996,Toyota,Prius,1.8,Hybrid,2003.0,105120.0,9430.0,7364.875,Good deal
49997,Ford,Mondeo,1.6,Diesel,2022.0,4030.0,49852.0,27219.375,Bad deal
49998,Ford,Focus,1.0,Diesel,2016.0,26468.0,23630.0,20097.475,Good deal
