In [57]:
import pandas as pd
import numpy as np

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [58]:
data = pd.read_csv(r"C:\Users\anish\Downloads\Projects-20240722T093004Z-001\Projects\vehicle_price_prediction\Vehicle Price Prediction\dataset.csv")  

# See the first 5 rows
data.head()

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive


In [59]:
# 1️⃣ List columns
print(data.columns)

# 2️⃣ Check data types
print(data.dtypes)

# 3️⃣ Check for missing values
print(data.isnull().sum())


Index(['name', 'description', 'make', 'model', 'year', 'price', 'engine',
       'cylinders', 'fuel', 'mileage', 'transmission', 'trim', 'body', 'doors',
       'exterior_color', 'interior_color', 'drivetrain'],
      dtype='object')
name               object
description        object
make               object
model              object
year                int64
price             float64
engine             object
cylinders         float64
fuel               object
mileage           float64
transmission       object
trim               object
body               object
doors             float64
exterior_color     object
interior_color     object
drivetrain         object
dtype: object
name                0
description        56
make                0
model               0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
ex

In [60]:
# Drop the description column (as it's not needed for prediction)
data = data.drop(columns=['description'])

# Fill numerical columns
for col in ['cylinders', 'mileage', 'doors', 'price']:
    data[col] = data[col].fillna(data[col].median())

# Fill categorical columns
cat_columns = data.select_dtypes(include='object').columns
for col in cat_columns:
    data[col] = data[col].fillna(data[col].mode()[0])

# ✅ Check if any missing values remain
print(data.isnull().sum())


name              0
make              0
model             0
year              0
price             0
engine            0
cylinders         0
fuel              0
mileage           0
transmission      0
trim              0
body              0
doors             0
exterior_color    0
interior_color    0
drivetrain        0
dtype: int64


In [61]:
# Identify categorical columns
cat_columns = data.select_dtypes(include='object').columns

# Perform one-hot encoding
data_encoded = pd.get_dummies(data, columns=cat_columns, drop_first=True)

# Inspect the new data
print(data_encoded.shape)
data_encoded.head()


(1002, 1241)


Unnamed: 0,year,price,cylinders,mileage,doors,name_2023 Buick Envision Preferred,name_2023 Chrysler 300 Touring,name_2023 Chrysler 300 Touring L,name_2023 Chrysler Pacifica Touring L,name_2023 Dodge Charger SXT,...,interior_color_Tupelo,interior_color_Tupelo/Black,interior_color_Volcano Brown,interior_color_White,interior_color_Wicker Beige/Black,interior_color_Wicker Beige/Global Black,interior_color_gray,drivetrain_Four-wheel Drive,drivetrain_Front-wheel Drive,drivetrain_Rear-wheel Drive
0,2024,74600.0,6.0,10.0,4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,2024,50170.0,6.0,1.0,4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,2024,96410.0,8.0,0.0,4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,2023,46835.0,8.0,32.0,4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2024,81663.0,6.0,10.0,4.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Split the data
X = data_encoded.drop('price', axis=1)
y = data_encoded['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Build and train the model
model = RandomForestRegressor(random_state=42, n_estimators=100, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: ${mae:.2f}")
print(f"R^2 Score: {r2:.4f}")


Mean Absolute Error: $3947.33
R^2 Score: 0.7861


In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Our current X_train, X_test, y_train, y_test are assumed to be already defined


In [64]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}


In [65]:
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_absolute_error'
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [66]:
from sklearn.metrics import mean_absolute_error, r2_score

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Model MAE: ${mae:.2f}")
print(f"Best Model R^2 Score: {r2:.4f}")


Best Model MAE: $3964.71
Best Model R^2 Score: 0.8126
