In [40]:
import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder


In [41]:
df = pd.read_csv('data/cleaned_car_data.csv')

In [42]:
df['Model']

0        Expedition
1           Durango
2          Eldorado
3            Celica
4                TL
            ...    
21279       Voyager
21280         Prizm
21281          328i
21282         Metro
21283         ES300
Name: Model, Length: 21284, dtype: object

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21284 entries, 0 to 21283
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_id         21284 non-null  object 
 1   Date           21284 non-null  object 
 2   Customer Name  21284 non-null  object 
 3   Gender         21284 non-null  object 
 4   Annual Income  21284 non-null  float64
 5   Dealer_Name    21284 non-null  object 
 6   Company        21284 non-null  object 
 7   Model          21284 non-null  object 
 8   Engine         21284 non-null  object 
 9   Transmission   21284 non-null  object 
 10  Color          21284 non-null  object 
 11  Price ($)      21284 non-null  float64
 12  Dealer_No      21284 non-null  object 
 13  Body Style     21284 non-null  object 
 14  Phone          21284 non-null  int64  
 15  Dealer_Region  21284 non-null  object 
 16  Brand Segment  21284 non-null  object 
 17  Brand Country  21284 non-null  object 
dtypes: flo

In [44]:
cols_to_drop = ['Car_id', 'Date', 'Customer Name', 'Gender', 'Annual Income', 'Dealer_Name', 'Company', 'Engine', 'Dealer_No ', 'Phone', 'Dealer_Region']
df.drop(columns=cols_to_drop, inplace=True)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21284 entries, 0 to 21283
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Model          21284 non-null  object 
 1   Transmission   21284 non-null  object 
 2   Color          21284 non-null  object 
 3   Price ($)      21284 non-null  float64
 4   Body Style     21284 non-null  object 
 5   Brand Segment  21284 non-null  object 
 6   Brand Country  21284 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.1+ MB


In [46]:
X = df.drop('Price ($)', axis=1)
y = df['Price ($)']

In [47]:
nominal_cols = ['Color', 'Body Style', 'Brand Country']
one_hot_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

ordinal_cols = ['Transmission', 'Brand Segment']
ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

target_cols = ['Model']
target_enc = TargetEncoder(target_type='continuous')

In [48]:
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', one_hot_enc, nominal_cols),
        ('ordinal', ordinal_enc, ordinal_cols),
        #In ColumnTransformer, it uses ordinal encoder instead of label, both do the same thing, but it's just a different way to do it
        ('target', target_enc, target_cols)
    ],
    verbose_feature_names_out=False, # Keeps names clean
    remainder='drop' # Drop unused columns (to future proof it when I use it in api)
)

In [49]:
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=15))
])

In [50]:
final_pipeline.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,target_type,'continuous'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,15
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
joblib.dump(final_pipeline, 'model/final_car_price_model.pkl')
# The file is above 100MB, I added it to gitignore instead
#but when you run it, it will save it in the model dir normally!

['model/final_car_price_model.pkl']

In [55]:
test_input = pd.DataFrame({
    'Model': ['Expedition'],
    'Transmission': ['Auto'],
    'Color': ['Red'],
    'Body Style': ['SUV'],
    'Brand Segment': ['Luxury'],
    'Brand Country': ['Germany']
})

# Load the model
loaded_model = joblib.load('model/final_car_price_model.pkl')
predicted_price = loaded_model.predict(test_input)

print(f"Predicted Price for your car: ${np.expm1(predicted_price[0]):,.2f}") # Our price is still log transformed so we use np.expm1

Predicted Price for your car: $20,947.97
