In [169]:
import pandas as pd

# This library lets us work with decision trees
from sklearn.tree import DecisionTreeRegressor

# Metrics let us measure how far the predictions are from the true values
from sklearn.metrics import mean_absolute_error

#Train_test_split is fundamental to divide our data, test it and try our models
from sklearn.model_selection import train_test_split

# Imports Random Forests
from sklearn.ensemble import RandomForestRegressor


import matplotlib.pyplot as plt


from sklearn.tree import plot_tree

import numpy as np


# Load the car price dataset and discard the 'Car ID' column right away,
# so it is not carried into the feature matrix built later.
df = pd.read_csv('data/car_price_prediction_.csv').drop(columns='Car ID')

In [170]:


from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns that get rescaled
numerical_cols = ['Year', 'Engine Size', 'Mileage']

# Learn each column's min/max, then rescale the columns in place
# (MinMaxScaler with its default feature range).
# NOTE(review): the scaler is fitted on the whole DataFrame, i.e. before the
# train/test split, so test rows influence the scaling — consider fitting on
# the training split only.
scaler = MinMaxScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])




In [171]:
# Target: the price column
Y = df['Price']

# Features: every other column, with non-numeric columns one-hot encoded.
# drop_first=True drops the first level of each encoded column.
X = pd.get_dummies(df.drop(columns='Price'), drop_first=True)

In [172]:
# Hold out a test set; the fixed seed keeps the split identical between runs.
# No test_size is given, so scikit-learn's default proportions apply.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y, random_state=0)


In [173]:

# Baseline: an unconstrained decision tree.
# The seed is fixed (same value get_mae uses) so that the metrics reported for
# this baseline are reproducible on a fresh Restart & Run All; without it the
# tree can differ from run to run.
model = DecisionTreeRegressor(random_state=0)

# Fit the model using the training data
model.fit(X_train, Y_train)

# Get predicted prices on the testing data
Y_pred = model.predict(X_test)

In [174]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the baseline tree's predictions against the held-out targets.
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
r2 = r2_score(Y_test, Y_pred)

print(f"""
RMSE: {rmse}
R2: {r2}
""")

mae_test = mean_absolute_error(Y_test, Y_pred)
print("Mean Absolute Error on test data:", mae_test)


RMSE: 37246.202076282614
R2: -0.7717253173010035

Mean Absolute Error on test data: 29819.811824


In [175]:
# I'm going to create a function that tests different max nodes
# to see which number of nodes is best

def get_mae(max_leaf_nodes, X_train, X_test, Y_train, Y_test):
    """Return the MAE of a decision tree capped at ``max_leaf_nodes`` leaves.

    A ``DecisionTreeRegressor`` (seeded with ``random_state=0``) is fitted on
    ``X_train`` / ``Y_train``; its predictions for ``X_test`` are then scored
    against ``Y_test`` with mean absolute error.

    Parameters:
        max_leaf_nodes: value passed to the tree's ``max_leaf_nodes`` argument.
        X_train, Y_train: features and targets used for fitting.
        X_test, Y_test: features and targets used for scoring.

    Returns:
        The mean absolute error of the fitted tree on the scoring data.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    predictions = tree.fit(X_train, Y_train).predict(X_test)
    return mean_absolute_error(Y_test, predictions)


In [176]:
# Candidate values for max_leaf_nodes: 5, 50, 500, ... up to 5 * 10**7
Mae_compare = [5 * 10**number for number in range(8)]

# Best result seen so far; starting at infinity means the first candidate
# always replaces it, so no special case is needed for the first iteration.
lowest_mae = float('inf')
lowest_node = None

# Try each tree size and keep the one with the lowest Mean Absolute Error.
# NOTE(review): candidates are scored on the same test split that is used for
# the final evaluation, so the winning MAE is an optimistic estimate —
# consider a separate validation split or cross-validation.
for max_leaf_nodes in Mae_compare:
    # Get the Mean Absolute Error for the current max_leaf_nodes value
    my_mae = get_mae(max_leaf_nodes, X_train, X_test, Y_train, Y_test)

    # Print the results in a formatted string
    print(f"Max leaf nodes: {max_leaf_nodes} \t Mean Absolute Error: {my_mae:.2f}")

    # Strict '<' keeps the earliest (smallest) candidate on ties
    if my_mae < lowest_mae:
        lowest_mae = my_mae
        lowest_node = max_leaf_nodes

print(f"Therefore the best node is {lowest_node} with {lowest_mae:.2f}")


Max leaf nodes: 5 	 Mean Absolute Error: 24409.38
Max leaf nodes: 50 	 Mean Absolute Error: 24781.90
Max leaf nodes: 500 	 Mean Absolute Error: 28024.03
Max leaf nodes: 5000 	 Mean Absolute Error: 29942.47
Max leaf nodes: 50000 	 Mean Absolute Error: 29942.47
Max leaf nodes: 500000 	 Mean Absolute Error: 29942.47
Max leaf nodes: 5000000 	 Mean Absolute Error: 29942.47
Max leaf nodes: 50000000 	 Mean Absolute Error: 29942.47
Therefore the best node is 5 with 24409.38


### Random Forest

In [177]:
# A random forest follows the same fit / predict workflow as the single
# decision tree above, but is a more powerful model.
# Build the regressor (seeded for reproducibility) and train it in one step.
forest_model = RandomForestRegressor(random_state=1).fit(X_train, Y_train)

# Predict prices for the held-out rows
forest_preds = forest_model.predict(X_test)

# Measure how far those predictions are from the true prices of unseen data
forest_mae = mean_absolute_error(Y_test, forest_preds)

In [178]:
# Mean of the target on the test split (the same rows the MAE was computed on)
mean_y = Y_test.mean()

# Express the forest's MAE as a percentage of that mean, to give the error a scale
mae_percentage = (forest_mae / mean_y) * 100

print(f"The Mean Absolute Error is {forest_mae:.2f}, which is {mae_percentage:.2f}% of the mean of Y.")
# Relative MAE describes the typical size of the error; it is not a confidence
# or "trustworthiness" figure, so the message states only what was measured.
print(f"This means that, on average, a prediction is off by about {mae_percentage:.2f}% of the mean of Y.")


The Mean Absolute Error is 24574.47, which is 46.89% of the mean of Y.
This means if our model makes a prediction it'll be +-46.89% trustworthy
