In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
from zipfile import ZipFile
from datetime import datetime

In [2]:
with ZipFile('archive.zip', 'r') as zipped_file: 
    zipped_file.extractall('archive')

In [3]:
os.listdir('archive/')

['used_cars.csv']

In [4]:
used_car_data = pd.read_csv('archive/used_cars.csv')
used_car_data.shape

(4009, 12)

In [5]:
used_car_data.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
y_full = used_car_data['price']
X_full = used_car_data.drop('price', axis=1)

In [8]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X_full, y_full, shuffle=True, test_size=0.20)

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, shuffle=True, test_size=0.20)

In [10]:
X_train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
3944,Volkswagen,Beetle 2.0T Final Edition SE,2019,"32,000 mi.",Gasoline,174.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes
3269,Subaru,Impreza WRX,2012,"106,000 mi.",Gasoline,265.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,M/T,Black,Black,None reported,Yes
238,Porsche,Cayman S,2014,"33,088 mi.",Gasoline,325.0HP 3.4L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes
2495,Acura,TLX Type S PMC Edition,2023,"13,600 mi.",Gasoline,355.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,White,None reported,Yes
2552,Mercedes-Benz,AMG GT 53 Base,2022,"2,235 mi.",Hybrid,429.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Red,None reported,Yes


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2565 entries, 3944 to 1263
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         2565 non-null   object
 1   model         2565 non-null   object
 2   model_year    2565 non-null   int64 
 3   milage        2565 non-null   object
 4   fuel_type     2463 non-null   object
 5   engine        2565 non-null   object
 6   transmission  2565 non-null   object
 7   ext_col       2565 non-null   object
 8   int_col       2565 non-null   object
 9   accident      2498 non-null   object
 10  clean_title   2183 non-null   object
dtypes: int64(1), object(10)
memory usage: 240.5+ KB


In [12]:
cat_cols = [col for col in X_train if X_train[col].dtype == 'object']
for col in cat_cols:
    print(f'{col} ==> {X_train[col].nunique()}')

brand ==> 55
model ==> 1457
milage ==> 1899
fuel_type ==> 6
engine ==> 930
transmission ==> 51
ext_col ==> 234
int_col ==> 114
accident ==> 2
clean_title ==> 1


In [13]:
sum(X_train['clean_title'].isnull())

382

In [14]:
X_explore = X_train[:50].copy()

In [15]:
X_explore.head(3)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
3944,Volkswagen,Beetle 2.0T Final Edition SE,2019,"32,000 mi.",Gasoline,174.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes
3269,Subaru,Impreza WRX,2012,"106,000 mi.",Gasoline,265.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,M/T,Black,Black,None reported,Yes
238,Porsche,Cayman S,2014,"33,088 mi.",Gasoline,325.0HP 3.4L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes


In [16]:
X_explore['milage'] = X_explore['milage'].apply(lambda x: int(x.strip('mi.').replace(',', '_')))

In [17]:
X_explore['age'] = datetime.now().year - X_explore['model_year']
X_explore.drop('model_year', axis=1, inplace=True)

In [18]:
X_explore.head(3)

Unnamed: 0,brand,model,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,age
3944,Volkswagen,Beetle 2.0T Final Edition SE,32000,Gasoline,174.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,6
3269,Subaru,Impreza WRX,106000,Gasoline,265.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,M/T,Black,Black,None reported,Yes,13
238,Porsche,Cayman S,33088,Gasoline,325.0HP 3.4L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes,11


In [19]:
X_explore['accident'].value_counts()

accident
None reported                             38
At least 1 accident or damage reported    11
Name: count, dtype: int64

In [20]:
X_explore['accident'] = X_explore['accident'] == 'None reported'

In [21]:
X_explore['accident'] = X_explore['accident'].astype(int)

In [22]:
X_explore.head()

Unnamed: 0,brand,model,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,age
3944,Volkswagen,Beetle 2.0T Final Edition SE,32000,Gasoline,174.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,1,Yes,6
3269,Subaru,Impreza WRX,106000,Gasoline,265.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,M/T,Black,Black,1,Yes,13
238,Porsche,Cayman S,33088,Gasoline,325.0HP 3.4L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,1,Yes,11
2495,Acura,TLX Type S PMC Edition,13600,Gasoline,355.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,White,1,Yes,2
2552,Mercedes-Benz,AMG GT 53 Base,2235,Hybrid,429.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Red,1,Yes,3


In [23]:
X_explore.drop(['model', 'int_col', 'ext_col'], axis=1, inplace=True)

In [24]:
X_explore.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 3944 to 3131
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         50 non-null     object
 1   milage        50 non-null     int64 
 2   fuel_type     47 non-null     object
 3   engine        50 non-null     object
 4   transmission  50 non-null     object
 5   accident      50 non-null     int64 
 6   clean_title   40 non-null     object
 7   age           50 non-null     int64 
dtypes: int64(3), object(5)
memory usage: 3.5+ KB


In [25]:
X_explore['clean_title'].value_counts()

clean_title
Yes    40
Name: count, dtype: int64

In [26]:
X_explore['clean_title'] = X_explore['clean_title'] == 'Yes'

In [27]:
X_explore['clean_title'] = X_explore['clean_title'].astype(int)

In [28]:
X_explore.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 3944 to 3131
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         50 non-null     object
 1   milage        50 non-null     int64 
 2   fuel_type     47 non-null     object
 3   engine        50 non-null     object
 4   transmission  50 non-null     object
 5   accident      50 non-null     int64 
 6   clean_title   50 non-null     int64 
 7   age           50 non-null     int64 
dtypes: int64(4), object(4)
memory usage: 3.5+ KB


In [29]:
X_explore['fuel_type'].value_counts()

fuel_type
Gasoline    38
Hybrid       5
Diesel       3
–            1
Name: count, dtype: int64

In [30]:
y_train.head()

3944     $32,750
3269     $15,000
238      $49,750
2495     $65,000
2552    $110,000
Name: price, dtype: object

In [31]:
y_train[:5].copy().apply(lambda x: x.lstrip('$').replace(',', '_')).astype(float)

3944     32750.0
3269     15000.0
238      49750.0
2495     65000.0
2552    110000.0
Name: price, dtype: float64

In [32]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2565 entries, 3944 to 1263
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         2565 non-null   object
 1   model         2565 non-null   object
 2   model_year    2565 non-null   int64 
 3   milage        2565 non-null   object
 4   fuel_type     2463 non-null   object
 5   engine        2565 non-null   object
 6   transmission  2565 non-null   object
 7   ext_col       2565 non-null   object
 8   int_col       2565 non-null   object
 9   accident      2498 non-null   object
 10  clean_title   2183 non-null   object
dtypes: int64(1), object(10)
memory usage: 240.5+ KB


In [33]:
X_train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
3944,Volkswagen,Beetle 2.0T Final Edition SE,2019,"32,000 mi.",Gasoline,174.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes
3269,Subaru,Impreza WRX,2012,"106,000 mi.",Gasoline,265.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,M/T,Black,Black,None reported,Yes
238,Porsche,Cayman S,2014,"33,088 mi.",Gasoline,325.0HP 3.4L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes
2495,Acura,TLX Type S PMC Edition,2023,"13,600 mi.",Gasoline,355.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,White,None reported,Yes
2552,Mercedes-Benz,AMG GT 53 Base,2022,"2,235 mi.",Hybrid,429.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Red,None reported,Yes


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

In [35]:
def age_feature_name(function_transformer, feature_names_in):
    return ['car_age']
    
def milage_feature_name(function_transformer, feature_names_in): 
    return feature_names_in 

def accident_feature_name(function_transformer, feature_names_in): 
    return feature_names_in

def clean_title_feature_name(function_transformer, feature_names_in): 
    return feature_names_in

def car_age(X: pd.DataFrame):
    return (datetime.now().year - X.iloc[:, 0]).to_frame()

def car_milage(X: pd.DataFrame): 
    return X.iloc[:, 0].astype(str).str.strip('mi.').str.replace(',', '').astype(int).to_frame()

def car_accident(X: pd.DataFrame): 
    return (X.iloc[:, 0] == 'None reported').astype(int).to_frame()

def car_title(X: pd.DataFrame): 
    return (X.iloc[:, 0] == 'Yes').astype(int).to_frame()

In [36]:
def label_preprocessor(X):
    return X.apply(lambda x: x.lstrip('$').replace(',', '_')).astype(float)

age_calculator = FunctionTransformer(
    func=car_age, 
    feature_names_out=age_feature_name
)
milage_transformer = FunctionTransformer(
    func=car_milage , 
    feature_names_out=milage_feature_name
)
accident_transformer = FunctionTransformer(
    func=car_accident, 
    feature_names_out=accident_feature_name
)
clean_title_transformer = FunctionTransformer(
    func=car_title, 
    feature_names_out=clean_title_feature_name
)

ohe_col = ['brand', 'fuel_type', 'transmission']

In [37]:
ohe_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False))
])

age_pipeline = Pipeline(steps=[
    ('age_transform', age_calculator), 
    ('imputer', SimpleImputer(strategy='mean'))
])

In [38]:
preprocessor = ColumnTransformer(transformers=[
    ('car_age', age_pipeline, ['model_year']), 
    ('car_mileage', milage_transformer, ['milage']), 
    ('accident_record', accident_transformer, ['accident']), 
    ('car_clean_title', clean_title_transformer, ['clean_title']),
    ('ohe', ohe_pipeline, ohe_col)
], remainder='drop')

preprocessor

0,1,2
,transformers,"[('car_age', ...), ('car_mileage', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function car...001FEDA29CB80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function age...001FEDA29C900>
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function car...001FEDA29CC20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function mil...001FEDA29C9A0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function car...001FEDA29CCC0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function acc...001FEDA29CA40>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function car...001FEDA29CD60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function cle...001FEDA29CAE0>
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'infrequent_if_exist'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [39]:
X_train_prepared = preprocessor.fit_transform(X_train)
X_valid_prepared = preprocessor.transform(X_valid)

In [40]:
y_train_prepared = label_preprocessor(y_train)
y_valid_prepared = label_preprocessor(y_valid)

In [41]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer

In [42]:
def model_average_score(estimator, X, y):
    return np.mean(cross_val_score(estimator, X, y, n_jobs=-1, scoring=make_scorer(mean_absolute_error)))

In [43]:
rf_avg_score = model_average_score(RandomForestRegressor(random_state=37), X_train_prepared, y_train_prepared)
svr_avg_score = model_average_score(SVR(), X_train_prepared, y_train_prepared)
dtr_avg_score = model_average_score(DecisionTreeRegressor(random_state=37), X_train_prepared, y_train_prepared)
lr_avg_score = model_average_score(LinearRegression(), X_train_prepared, y_train_prepared)

In [44]:
print(f'Random Forest: {rf_avg_score}\n\
        SVR: {svr_avg_score}\n\
        Decision Tree: {dtr_avg_score}\n\
        Logistic Regression: {lr_avg_score}')

Random Forest: 17174.412669451405
        SVR: 27250.01853602769
        Decision Tree: 20918.420272904485
        Logistic Regression: 20738.512281573807


In [45]:
param_grid = {
    'n_estimators': np.arange(100, 300, 20)
}

In [46]:
search_forest = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_grid, cv=3, n_jobs=-1, random_state=37)

In [47]:
search_forest.fit(X_train_prepared, y_train_prepared)

0,1,2
,estimator,RandomForestRegressor()
,param_distributions,"{'n_estimators': array([100, 1...40, 260, 280])}"
,n_iter,10
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,37

0,1,2
,n_estimators,np.int64(180)
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:
search_forest.best_params_

{'n_estimators': np.int64(180)}

In [49]:
predictions = search_forest.predict(X_valid_prepared)

In [50]:
print(f'Mean Absolute Error: {mean_absolute_error(y_valid_prepared, predictions)}\n\
        R2 Score: {r2_score(y_valid_prepared, predictions)}')

Mean Absolute Error: 15104.755076727819
        R2 Score: 0.2430310548167316


In [51]:
final_model = Pipeline(steps=[
    ('preprocess', preprocessor), 
    ('rdf_reg', RandomForestRegressor(random_state=37, n_jobs=-1))
])

In [52]:
y_train_full_prepared = label_preprocessor(y_train_full)

In [53]:
final_model.fit(X_train_full, y_train_full_prepared)

0,1,2
,steps,"[('preprocess', ...), ('rdf_reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('car_age', ...), ('car_mileage', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function car...001FEDA29CB80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function age...001FEDA29C900>
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function car...001FEDA29CC20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function mil...001FEDA29C9A0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function car...001FEDA29CCC0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function acc...001FEDA29CA40>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function car...001FEDA29CD60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function cle...001FEDA29CAE0>
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'infrequent_if_exist'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [54]:
fm_predictions = final_model.predict(X_test)

In [55]:
print(f'Mean Absolute Error: {mean_absolute_error(label_preprocessor(y_test), fm_predictions)}\n\
        R2 Score: {r2_score(label_preprocessor(y_test), fm_predictions)}')

Mean Absolute Error: 15263.365510598504
        R2 Score: 0.6431442599212738


In [56]:
import joblib

In [57]:
joblib.dump( final_model, 'used_car_price_predictor.pkl')

['used_car_price_predictor.pkl']