## Import Libraries and Data

In [44]:
from google.colab import files
uploaded = files.upload()

# Upgrade pip
%pip install --upgrade pip

# Install specific versions of the packages to avoid conflicts
%pip install shap==0.46.0 xgboost==2.1.1

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


Saving cars_us_2022.csv to cars_us_2022 (3).csv


## Data Loading and Cleaning

In [45]:
# Load the CSV file
file_path = 'cars_us_2022.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head(15)

# Display general information about the dataframe
data.info()
print()

# Display the number of missing values in each column
print(data.isnull().sum())

# Extract year from 'Model.Number' if available, else set to NaN
data['Year'] = data['Model.Number'].str.extract(r'(\d{4})')
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')

# Keep only the rows where the Year is either 2022 or 2023
data = data[data['Year'].isin([2022, 2023])].copy()

# Add 'Age' column based on the 'Year' column
current_year = 2024
data.loc[:, 'Age'] = current_year - data['Year']

# Check if 'Mileage' column is present and fill missing values with 0
if 'Mileage' in data.columns:
    data['Mileage'] = data['Mileage'].fillna(0)
else:
    data['Mileage'] = 0

# Feature Engineering: Create new features based on existing data
data['Power_to_Weight_Ratio'] = data['Power.hp'] / data['Displacement.l']
data['Torque_to_Weight_Ratio'] = data['Torque.lbft'] / data['Displacement.l']
data['Fuel_Efficiency'] = (data['MPG.City'] + data['MPG.Highway']) / 2

# Fill missing values with median for numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].apply(lambda col: col.fillna(col.median()))

# Convert categorical variables to dummy variables if they exist in the data
categorical_columns = ['Brand', 'Engine.Type', 'Body.Type', 'Fuel.Type', 'Gearbox.Type']
available_categorical_columns = [col for col in categorical_columns if col in data.columns]
data = pd.get_dummies(data, columns=available_categorical_columns, drop_first=True)

print()

# Verify cleaned data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2602 entries, 0 to 2601
Data columns (total 76 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         2602 non-null   int64  
 1   Brand                              2602 non-null   object 
 2   Model.Number                       2602 non-null   object 
 3   Price                              2602 non-null   float64
 4   Photo                              2602 non-null   object 
 5   Engine.Type                        2527 non-null   object 
 6   Body.Type                          2564 non-null   object 
 7   Fuel.Type                          2602 non-null   object 
 8   Gearbox.Type                       2602 non-null   object 
 9   Power.hp                           2256 non-null   float64
 10  Displacement.l                     2023 non-null   float64
 11  Torque.lbft                        2168 non-null   float

## Data Splitting

In [46]:
# Define features (X) and target (y)
X = data.drop(columns=['Unnamed: 0', 'Photo', 'Price', 'Model.Number', 'Year'])
y = data['Price']

# Ensure all features are numeric
X = X.apply(pd.to_numeric, errors='coerce')
X.fillna(0, inplace=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scaling Data

In [47]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, 'scaler_with_new_features.pkl')

['scaler_with_new_features.pkl']

## Define and Train the Voting Regressor with Weighted Models

In [60]:
# Define and train the Voting Regressor with weighted models
voting_regressor_weighted = VotingRegressor(
    estimators=[
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('xgb', XGBRegressor(n_estimators=100, random_state=42))
    ],
    weights=[0.3, 0.3, 0.4]
)

voting_regressor_weighted.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_voting_weighted = voting_regressor_weighted.predict(X_test_scaled)

# Evaluate the Voting Regressor
mae_voting_weighted = mean_absolute_error(y_test, y_pred_voting_weighted)
mse_voting_weighted = mean_squared_error(y_test, y_pred_voting_weighted)
rmse_voting_weighted = np.sqrt(mse_voting_weighted)
r2_voting_weighted = r2_score(y_test, y_pred_voting_weighted)
accuracy_voting_weighted = (1 - (mae_voting_weighted / y_test.mean())) * 100

print(f"Voting Regressor (weighted) - MAE: {mae_voting_weighted}")
print(f"Voting Regressor (weighted) - MSE: {mse_voting_weighted}")
print(f"Voting Regressor (weighted) - RMSE: {rmse_voting_weighted}")
print(f"Voting Regressor (weighted) - R²: {r2_voting_weighted}")
print(f"Voting Regressor (weighted) - Accuracy: {accuracy_voting_weighted:.2f}%")
print()

# Save the Voting Regressor with the new features
joblib.dump((voting_regressor_weighted, X.columns), 'best_voting_regressor.pkl')

Voting Regressor (weighted) - MAE: 35448.59906923415
Voting Regressor (weighted) - MSE: 23450684049.802467
Voting Regressor (weighted) - RMSE: 153136.16179662617
Voting Regressor (weighted) - R²: 0.7997491321520126
Voting Regressor (weighted) - Accuracy: 69.35%



['best_voting_regressor.pkl']

## Price Prediction Example

In [49]:
# Load the trained Voting Regressor model
pipeline_voting_with_columns = joblib.load('best_voting_regressor_with_columns_weighted_with_new_features.pkl')
loaded_voting_model, loaded_voting_columns = pipeline_voting_with_columns

# Load the scaler used for training the models
scaler = joblib.load('scaler_with_new_features.pkl')

# Function to predict future prices and calculate accuracy
def predict_future_prices_and_accuracy(car_features, model, scaler, columns, actual_prices, years=5, depreciation_rate=0.15):
    predictions = []
    original_age = car_features['Age']
    original_mileage = car_features['Mileage']

    for i in range(1, years + 1):
        # Increment the 'Age' of the car by 1 year
        car_features['Age'] = original_age + i
        # Assume an additional 10,000 miles driven each year (adjust as needed)
        car_features['Mileage'] = original_mileage + (10000 * i)

        # Ensure the features used during training are consistent
        car_features_df = pd.DataFrame([car_features]).reindex(columns=columns, fill_value=0)

        # Fill any remaining NaN values with 0
        car_features_df = car_features_df.apply(pd.to_numeric, errors='coerce').fillna(0)

        # Scale the features
        car_features_scaled = scaler.transform(car_features_df)
        predicted_price = model.predict(car_features_scaled)[0]

        # Apply depreciation rate
        predicted_price *= (1 - depreciation_rate) ** i

        predictions.append(predicted_price)

    # Convert actual_prices to a NumPy array
    actual_prices_np = np.array(actual_prices)

    # Calculate accuracy
    mae = mean_absolute_error(actual_prices_np[:years], predictions)
    accuracy = (1 - (mae / actual_prices_np.mean())) * 100

    return predictions, accuracy

# Select a random car from the dataset
random_car = data.sample(1).iloc[0]
car_features = random_car.drop(['Unnamed: 0', 'Photo', 'Price', 'Model.Number', 'Year']).to_dict()
car_price = random_car['Price']
car_model = random_car['Model.Number']
car_year = random_car['Year']

# Handle categorical variables in the car features
for key, value in car_features.items():
    if isinstance(value, str):
        car_features[key] = 0  # Or some appropriate numerical value

# Ensure the car features match the trained model's columns
car_features_df = pd.DataFrame([car_features]).reindex(columns=loaded_voting_columns, fill_value=0)

# Convert all features to numeric and fill NaN values with 0
car_features_df = car_features_df.apply(pd.to_numeric, errors='coerce').fillna(0)

# Scale the example features
car_features_scaled = scaler.transform(car_features_df)

# Predict the current price using the Voting Regressor model
current_price_voting = loaded_voting_model.predict(car_features_scaled)[0]

# Define the actual prices for the next 5 years for evaluation purposes
actual_prices = [car_price * (1 - 0.15) ** i for i in range(1, 6)]

# Predict future prices for the next 5 years
future_prices_voting, accuracy_voting = predict_future_prices_and_accuracy(car_features, loaded_voting_model, scaler, loaded_voting_columns, actual_prices, years=5, depreciation_rate=0.15)

# Display the results for the Voting Regressor
print(f"Model: {car_model}")
print(f"Year: {car_year}")
print(f"Initial Price: {round(car_price)}")
print("Predictions:")
for i, price in enumerate(future_prices_voting, start=1):
    print(f" {car_year + i}: {round(price)}")
print(f"Accuracy of Predictions: {accuracy_voting:.2f}%")
print(f"Model used: Voting Regressor")

Model: GV60 Advanced 2023
Year: 2023.0
Initial Price: 59290
Predictions:
 2024.0: 46635
 2025.0: 39640
 2026.0: 33694
 2027.0: 28640
 2028.0: 24344
Accuracy of Predictions: 92.54%
Model used: Voting Regressor


## Test Prediction with 3 Cars Over 5 Years

In [57]:
# Load the trained Voting Regressor model
pipeline_voting_with_columns = joblib.load('best_voting_regressor_with_columns_weighted_with_new_features.pkl')
loaded_voting_model, loaded_voting_columns = pipeline_voting_with_columns

# Load the scaler used for training the models
scaler = joblib.load('scaler_with_new_features.pkl')

# Function to predict prices for the next 5 years
def predict_prices(car_features, model, scaler, columns, start_year, years=5, depreciation_rate=0.15):
    predictions = {}
    original_age = car_features['Age']
    original_mileage = car_features['Mileage']

    for i in range(1, years + 1):
        car_features['Age'] = original_age + i
        car_features['Mileage'] = original_mileage + (10000 * i)

        car_features_df = pd.DataFrame([car_features]).reindex(columns=columns, fill_value=0)

        # Fill any remaining NaN values with 0
        car_features_df = car_features_df.apply(pd.to_numeric, errors='coerce').fillna(0)

        car_features_scaled = scaler.transform(car_features_df)
        predicted_price = model.predict(car_features_scaled)[0]

        predicted_price *= (1 - depreciation_rate) ** i
        predictions[start_year + i] = round(predicted_price)

    return predictions

# Function to calculate prediction accuracy
def calculate_accuracy(predictions, actual_prices):
    actual_prices_np = np.array(actual_prices)
    predicted_prices_np = np.array(list(predictions.values())[:len(actual_prices)])
    mae = mean_absolute_error(actual_prices_np, predicted_prices_np)
    accuracy = (1 - (mae / actual_prices_np.mean())) * 100
    return accuracy

# Define the depreciation rate
depreciation_rate = 0.15

# Select 3 random cars from the dataset
sample_cars = data.sample(3)

# Display the predictions for the selected cars
for index, car in sample_cars.iterrows():
    model = car['Model.Number']
    year = int(car['Model.Number'].split()[-1])
    initial_price = car['Price']

    # Handle any categorical variables in the features
    car_features = car.drop(['Unnamed: 0', 'Photo', 'Price', 'Model.Number', 'Year']).to_dict()
    for key, value in car_features.items():
        if isinstance(value, str):
            car_features[key] = 0  # Or some appropriate numerical value

    predictions = predict_prices(car_features, loaded_voting_model, scaler, loaded_voting_columns, year, depreciation_rate=depreciation_rate)

    # Define the actual prices for the next 5 years for evaluation purposes
    actual_prices = [initial_price * (1 - depreciation_rate) ** i for i in range(1, 6)]

    accuracy = calculate_accuracy(predictions, actual_prices)

    print(f"Model: {model}")
    print(f"Year: {year}")
    print(f"Initial Price: {round(initial_price)}")
    print("Predictions:")
    for pred_year, price in predictions.items():
        print(f" {pred_year}: {price}")
    print(f"Accuracy of Predictions: {accuracy:.2f}%")
    print(f"Model used: Voting Regressor")
    print()

Model: 300 Touring L 2023
Year: 2023
Initial Price: 36740
Predictions:
 2024: 33583
 2025: 28545
 2026: 24263
 2027: 20624
 2028: 17530
Accuracy of Predictions: 92.46%
Model used: Voting Regressor

Model: Quattroporte Modena Q4 2022
Year: 2022
Initial Price: 110300
Predictions:
 2023: 82791
 2024: 70372
 2025: 59816
 2026: 50844
 2027: 43217
Accuracy of Predictions: 88.31%
Model used: Voting Regressor

Model: 500X Trekking Plus 2022
Year: 2022
Initial Price: 31860
Predictions:
 2023: 26185
 2024: 22257
 2025: 18919
 2026: 16081
 2027: 13669
Accuracy of Predictions: 96.69%
Model used: Voting Regressor

