In [None]:
"""
(13) Regressão com Random Forest

Crie um modelo de Random Forest para prever o preço de carros. Compare os
resultados com um modelo de regressão linear usando MSE e R².
__________________________________________________________________________________________

(13) Regression with Random Forest

Create a Random Forest model to predict the price of cars. Compare the
results with a linear regression model using MSE and R².

"""

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
used_cars_df = pd.read_csv("used-cars-data.csv", encoding="utf-8")

print(used_cars_df.info())
print("\n")
used_cars_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB
None




Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [3]:
# PRE-PROCESSING:
# We have to turn the columns with type 'object' into numerical values

# First, we have to remove the special characters from the 'price' and 'milage' columns
used_cars_df['price'] = used_cars_df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)

used_cars_df['milage'] = used_cars_df['milage'].replace({' mi.': '', ',': ''}, regex=True).astype(int)

# Transforming the columns with One-hot Encoding
# This turns each category into binary
used_cars_df = pd.get_dummies(used_cars_df, columns=['brand', 'model', 'fuel_type',
                                                     'transmission', 'ext_col', 'int_col',
                                                     'accident', 'clean_title'], drop_first=True)

In [4]:
print(used_cars_df.info())
print("\n")
used_cars_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Columns: 2498 entries, model_year to accident_None reported
dtypes: bool(2494), float64(1), int64(2), object(1)
memory usage: 9.7+ MB
None




Unnamed: 0,model_year,milage,engine,price,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,...,int_col_Tupelo,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_None reported
0,2013,51000,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,10300.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2021,34742,3.8L V6 24V GDI DOHC,38005.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2022,22372,3.5 Liter DOHC,54598.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,2015,88900,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,15500.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,2021,9835,2.0L I4 16V GDI DOHC Turbo,34999.0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [10]:
# Defining the study features and the prediction feature
x = used_cars_df.drop(['engine', 'price'], axis=1)
y = used_cars_df['price']

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
# Initializing Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Predicting
rf_pred = rf_model.predict(x_test)

In [14]:
# RESULTS 1: RANDOM FOREST TRAINING
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print("RANDOM FOREST MSE:")
print(rf_mse)
print("\n")
print("RANDOM FOREST R²:")
print(rf_r2)

RANDOM FOREST MSE:
18145760724.567825


RANDOM FOREST R²:
0.11222399517629544


In [15]:
# Initializing Linear Regression
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

# Predicting
lr_pred = lr_model.predict(x_test)

In [16]:
# RESULTS 2: LINEAR REGRESSION TRAINING
lr_mse = mean_squared_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)

print("LINEAR REGRESSION MSE:")
print(lr_mse)
print("\n")
print("LINEAR REGRESSION R²:")
print(lr_r2)

LINEAR REGRESSION MSE:
19406180037.462116


LINEAR REGRESSION R²:
0.05055835111823492
