This is an extension of the Car Analysis project, focused on building a predictive model using machine learning and neural networks.

This approach employs a data pipeline that standardises numeric data and encodes categorical data, which is then input into a tuned Random Forest regressor as well as a neural network.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


Importing and preparing the data following the same method outlined in the Car Analysis Report

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil import parser
car23=pd.read_csv('cardata_feb_apr_cleaned.csv')
car23df=pd.DataFrame(car23)
# Replace '---' in NCT.Expiry with NaN
car23df['NCT Expiry'] = car23df['NCT Expiry'].replace('---', np.nan)

# Parse dates in NCT.Expiry
car23df['NCT Expiry'] = pd.to_datetime(car23df['NCT Expiry'], format='%b %Y', errors='coerce')

# Parse dates in Date.Uploaded with multiple formats
car23df['Date Uploaded'] = pd.to_datetime(car23df['Date Uploaded'], errors='coerce',
                                       format='%Y-%m-%d').fillna(
                                       pd.to_datetime(car23df['Date Uploaded'], format='%d-%m-%Y', errors='coerce')).fillna(
                                       pd.to_datetime(car23df['Date Uploaded'], format='%b %Y', errors='coerce'))

# Calculate NCT.Duration in months
car23df['NCT Duration'] = (
    (car23df['Date Uploaded'].dt.year * 12 + car23df['Date Uploaded'].dt.month) -
    (car23df['NCT Expiry'].dt.year * 12 + car23df['NCT Expiry'].dt.month)
)

# Clean and convert Engine.Size..Litres. to numeric, filter invalid sizes
car23df['Engine Size (Litres)'] = pd.to_numeric(car23df['Engine Size (Litres)'].str.replace('litre', '', regex=False), errors='coerce')
car23df.loc[(car23df['Engine Size (Litres)'] > 4) | (car23df['Engine Size (Litres)'] < 0.05), 'Engine Size (Litres)'] = np.nan

# Adjust Year
car23df['Year'] = car23df['Year'] - 2000

# Adjust Mileage
car23df['Mileage (km)'] = car23df['Mileage (km)'] / 1000

# Convert categorical columns to pandas categories
car23df['Model'] = car23df['Model'].astype('category')
car23df['Fuel Type'] = car23df['Fuel Type'].astype('category')

# Convert Previous.Owners to numeric
car23df['Previous Owners'] = pd.to_numeric(car23df['Previous Owners'], errors='coerce')

# Filter out rows with missing values in specified columns
data_NA = car23df.dropna(subset=['Price', 'Year', 'Mileage (km)', 'NCT Duration', 'Engine Size (Litres)', 'Previous Owners'])

# Create Sold.Indicator column
data_NA['Sold Indicator'] = pd.Categorical(
    np.where(data_NA['Date Sold'].notna(), 'Sold', 'Not Sold'),
    categories=['Not Sold', 'Sold']
)

# Filter data and group by Model
filtered_data_23 = (data_NA[(data_NA['Mileage (km)'] <= 500) & (data_NA['Mileage (km)'] > 10) &
                            (data_NA['Price'] < 7000) & (data_NA['Price'] > 500) &
                            (data_NA['NCT Duration'] >= -30) & (data_NA['NCT Duration'] <= 30)]
                    .groupby('Model')
                    .filter(lambda x: len(x) >= 10)
                    .reset_index(drop=True))


float64


In [4]:
import pandas as pd
import numpy as np

# Load your data (assuming it's in a DataFrame named data_2024)
car24=pd.read_csv('cardata_Sept2024_cleaned.csv')
car24df=pd.DataFrame(car24)
# Parse dates in Date.Uploaded with multiple formats
car24df['Date Uploaded'] = pd.to_datetime(car24df['Date Uploaded'], errors='coerce',
                                            format='%Y-%m-%d').fillna(
                                            pd.to_datetime(car24df['Date Uploaded'], format='%d-%m-%Y', errors='coerce')).fillna(
                                            pd.to_datetime(car24df['Date Uploaded'], format='%b %Y', errors='coerce'))

# Convert Model and Fuel.Type to pandas categorical type
car24df['Model'] = car24df['Model'].astype('category')
car24df['Fuel Type'] = car24df['Fuel Type'].astype('category')

# Clean and convert Engine.Size to numeric
car24df['Engine Size (Litres)'] = pd.to_numeric(
    car24df['Engine Size'].str.replace('[^0-9.]', '', regex=True),
    errors='coerce'
)
car24df.loc[(car24df['Engine Size (Litres)'] > 4) | (car24df['Engine Size (Litres)'] < 0.05), 'Engine Size (Litres)'] = np.nan

car24df['Year'] = pd.to_numeric(car24df['Year'], errors='coerce')
car24df['Year'] = car24df['Year'] - 2000

# Adjust Mileage
car24df['Mileage (km)'] = car24df['Mileage (km)'] / 1000

# Convert Previous Owners to numeric
car24df['Previous Owners'] = pd.to_numeric(car24df['Previous Owners'], errors='coerce')

# Filter out rows with missing values in specified columns
data_NA = car24df.dropna(subset=['Price', 'Year', 'Mileage (km)', 'Engine Size (Litres)'])

# Create Sold.Indicator column
data_NA['Sold Indicator'] = pd.Categorical(
    np.where(data_NA['Date Sold'].notna(), 'Sold', 'Not Sold'),
    categories=['Not Sold', 'Sold']
)

# Filter data and group by Model
filtered_data_24 = (data_NA[(data_NA['Mileage (km)'] <= 500) & (data_NA['Mileage (km)'] > 10) &
                            (data_NA['Price'] < 7000) & (data_NA['Price'] > 500)]
                    .groupby('Model')
                    .filter(lambda x: len(x) >= 10)
                    .reset_index(drop=True))


Combining 2023 and 2024 Data

In [5]:
# Add a DatasetYear column to distinguish between 2023 and 2024 datasets
filtered_data_23['DatasetYear'] = '2023'
filtered_data_24['DatasetYear'] = '2024'

combined_data = pd.concat([filtered_data_23, filtered_data_24], ignore_index=True)

In [6]:
combined_data.head(5)


Unnamed: 0,Link,Date Uploaded,Price,Make,Model,Trim,Trim Level,Year,Mileage (km),Fuel Type,...,Country of Registration,Previous Owners,Road Tax,NCT Expiry,Date Sold,NCT Duration,Sold Indicator,DatasetYear,Engine Size,Horsepower
0,https://www.donedeal.ie/cars-for-sale/2013-for...,2023-02-28,6995,Ford,KA,Edge,Base Trim,13.0,130.0,Petrol,...,Ireland,4.0,200,2023-09-01,,-7.0,Not Sold,2023,,
1,https://www.donedeal.ie/cars-for-sale/sale-sal...,2023-02-28,6995,SEAT,Ibiza,S.E.,Medium Trim,14.0,263.0,Diesel,...,Ireland,3.0,190,2022-06-01,,8.0,Not Sold,2023,,
2,https://www.donedeal.ie/cars-for-sale/sale-sal...,2023-02-28,6995,Opel,Astra,SC,Medium Trim,14.0,215.0,Diesel,...,Ireland,4.0,190,2024-12-01,,-22.0,Not Sold,2023,,
3,https://www.donedeal.ie/cars-for-sale/sale-sal...,2023-02-28,5995,Renault,Megane,EXPRESSION,Base Trim,12.0,170.0,Diesel,...,Ireland,3.0,190,2023-09-01,,-7.0,Not Sold,2023,,
4,https://www.donedeal.ie/cars-for-sale/sale-sal...,2023-02-28,5500,Kia,Ceed,TX,Medium Trim,11.0,273.587,Diesel,...,Ireland,1.0,200,2023-11-01,,-9.0,Not Sold,2023,,


Model Evaluation Function

In [7]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Evaluates the performance of a trained model on both the training and test sets.

    Parameters:
    - model: Trained model
    - X_train: Training set features
    - y_train: Training set labels
    - X_test: Test set features
    - y_test: Test set labels

    Returns:
    None (prints the MSE and R² scores for both training and test sets)
    """

    # Predict on the test set
    y_test_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Predict on the training set
    y_train_pred = model.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    # Print the evaluation metrics for both test and training sets
    print(f"Test Set - Mean Squared Error: {test_mse:.2f}, R^2 Score: {test_r2:.2f}")
    print(f"Training Set - Mean Squared Error: {train_mse:.2f}, R^2 Score: {train_r2:.2f}")

Feature and Pipeline Setup

In [8]:
categorical_features = ['Make', 'Model','Transmission','Engine Size (Litres)','Fuel Type','Sold Indicator']
numeric_features = ['Year', 'Mileage (km)']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)  # Handle unknown categories
    ]
)
features = ['Make', 'Model','Transmission','Engine Size (Litres)','Fuel Type','Sold Indicator','Year', 'Mileage (km)']
target = 'Price'
X = combined_data[features]
y = combined_data[target]

In [9]:
# Perform stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=combined_data['Model']
)

In [None]:
basic_modelRF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])
basic_modelRF.fit(X_train, y_train)

evaluate_model(basic_modelRF, X_train, y_train, X_test, y_test)

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'regressor__max_depth': [10, 30],
    'regressor__min_samples_leaf': [1, 4],
    'regressor__min_samples_split': [2, 10],
    'regressor__n_estimators': [100, 500],
}
modelRF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Set up GridSearchCV
grid_search = GridSearchCV(
    modelRF,
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='r2',  # Use R² as the evaluation metric
    n_jobs=-1,  # Use all processors for parallelism
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
evaluate_model(best_model, X_train, y_train, X_test, y_test)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'regressor__max_depth': 30, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500}
Best Cross-Validation Score: 0.6691694709457031
Test Set - Mean Squared Error: 944370.29, R^2 Score: 0.70
Training Set - Mean Squared Error: 424720.25, R^2 Score: 0.86


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'regressor__max_depth': 30, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500}
Best Cross-Validation Score: 0.6691694709457031
Test Set - Mean Squared Error: 944370.29, R^2 Score: 0.70
Training Set - Mean Squared Error: 424720.25, R^2 Score: 0.86

In [30]:
param_grid = {
    'regressor__max_depth': [25, 30],
    'regressor__min_samples_split': [6,8, 10],
    'regressor__n_estimators': [400, 500],
}
modelRF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Set up GridSearchCV
grid_search = GridSearchCV(
    modelRF,
    param_grid,
    cv=3,  # 5-fold cross-validation
    scoring='r2',  # Use R² as the evaluation metric
    n_jobs=-1,  # Use all processors for parallelism
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
evaluate_model(best_model, X_train, y_train, X_test, y_test)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'regressor__max_depth': 25, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500}
Best Cross-Validation Score: 0.6674601455141046
Test Set - Mean Squared Error: 944549.43, R^2 Score: 0.70
Training Set - Mean Squared Error: 465042.03, R^2 Score: 0.85


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'regressor__max_depth': 25, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500}
Best Cross-Validation Score: 0.6674601455141046
Test Set - Mean Squared Error: 944549.43, R^2 Score: 0.70
Training Set - Mean Squared Error: 465042.03, R^2 Score: 0.85

In [14]:
modelRF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42,max_depth=25, min_samples_leaf= 1, min_samples_split= 10, n_estimators=500))
])
modelRF.fit(X_train, y_train)

evaluate_model(modelRF, X_train, y_train, X_test, y_test)

Test Set - Mean Squared Error: 944549.43, R^2 Score: 0.70
Training Set - Mean Squared Error: 465042.03, R^2 Score: 0.85


In [10]:
combined_data.iloc[41]

Unnamed: 0,41
Link,https://www.donedeal.ie/cars-for-sale/volkswag...
Date Uploaded,2023-02-28 00:00:00
Price,5250
Make,Volkswagen
Model,Golf
Trim,
Trim Level,
Year,10.0
Mileage (km),280.025
Fuel Type,Diesel


Prediction and 95% prediction interval using the Random Forest Model.

In [15]:
new_data = {
    'Make': ['Volkswagen'],
    'Model': ['Golf'],
    'Transmission': ['Unknown'],
    'Engine Size (Litres)': [1.6],
    'Fuel Type': ['Diesel'],
    'Sold Indicator': ['Sold'],
    'Year': [10],
    'Mileage (km)': [280]
}

# Create a DataFrame for the custom data
prediction_data_df = pd.DataFrame(new_data)

# Extract the RandomForestRegressor from the pipeline
rf_model = modelRF.named_steps['regressor']

# Preprocess the custom data using the pipeline's preprocessor
processed_data = modelRF.named_steps['preprocessor'].transform(prediction_data_df)

# Get predictions from each tree in the Random Forest
individual_tree_predictions = np.array([tree.predict(processed_data) for tree in rf_model.estimators_])

# Calculate the mean prediction
mean_prediction = individual_tree_predictions.mean(axis=0)[0]

# Compute the 95% prediction interval
lower_bound = np.percentile(individual_tree_predictions, 2.5, axis=0)[0]
upper_bound = np.percentile(individual_tree_predictions, 97.5, axis=0)[0]

# Display the results
print(f"Predicted Price: {mean_prediction:.2f}")
print(f"95% Prediction Interval: [{lower_bound:.2f}, {upper_bound:.2f}]")


Predicted Price: 5159.69
95% Prediction Interval: [4109.67, 6200.00]


Neural Network Model

In [11]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


X_train_dense = X_train_transformed.toarray()
X_test_dense = X_test_transformed.toarray()

X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 2. Define the Neural Network Model
class CarPriceModel(nn.Module):
    def __init__(self, input_size):
        super(CarPriceModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 8)
        self.fc4 = nn.Linear(8, 1)  # Output layer for regression
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)  # No activation on output for regression
        return x

# Instantiate the model
input_size = X_train_dense.shape[1]  # Number of features
model = CarPriceModel(input_size)

# 3. Loss Function and Optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 4. Training Loop
epochs = 100
model.train()  # Set model to training mode

for epoch in range(epochs):
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate batch loss
        epoch_loss += loss.item()
    if (epoch + 1) % 10 == 0:
     print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    # Print epoch loss


# 5. Evaluate the Model
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f"Test Loss: {test_loss.item():.4f}")


Epoch 10/100, Loss: 384543043.8125
Epoch 20/100, Loss: 370162161.8750
Epoch 30/100, Loss: 365844794.9062
Epoch 40/100, Loss: 364500295.7188
Epoch 50/100, Loss: 362712590.7500
Epoch 60/100, Loss: 361368759.2812
Epoch 70/100, Loss: 359549921.5312
Epoch 80/100, Loss: 357876999.2812
Epoch 90/100, Loss: 356032673.7500
Epoch 100/100, Loss: 353575941.1250
Test Loss: 945021.6875


In [12]:
from sklearn.metrics import r2_score

# Evaluation
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test_tensor)
    test_loss = mean_squared_error(y_test_tensor.numpy(), y_test_pred.numpy())

    # Calculate R² score
    r2 = r2_score(y_test_tensor.numpy(), y_test_pred.numpy())  # Convert tensors to numpy arrays
    print(f"Test MSE: {test_loss}")
    print(f"R² Score: {r2}")


Test MSE: 945021.625
R² Score: 0.6982784271240234


In [13]:
new_data = {
    'Make': ['Volkswagen'],
    'Model': ['Golf'],
    'Transmission': ['Unknown'],
    'Engine Size (Litres)': [1.6],
    'Fuel Type': ['Diesel'],
    'Sold Indicator': ['Sold'],
    'Year': [10],  # Adjusted year
    'Mileage (km)': [280]  # Mileage in thousands
}
new_data_df = pd.DataFrame(new_data)
new_data_transformed = preprocessor.transform(new_data_df)
new_data_tensor = torch.tensor(new_data_transformed.toarray(), dtype=torch.float32)

# Make prediction
with torch.no_grad():
    custom_prediction = model(new_data_tensor)
    print(f"Predicted Price: {custom_prediction.item():.2f}")

Predicted Price: 4655.71
