In [29]:
import pandas as pd
import numpy as np
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# loading data

df = pd.read_csv('../reports/In-Search-of-The-Champion.csv')
train = pd.read_parquet('../data/Exp/train.parquet')
validation = pd.read_parquet('../data/Exp/test.parquet')

In [None]:
xtrain = train.drop(columns='Price')
ytrain = train['Price'].copy()

xvalidation = validation.drop(columns='Price')
yvalidation = validation['Price'].copy()

In [6]:
pd.set_option('display.max_columns', None)

In [None]:
# removing the trails where r2 is less than 0.85 and model is decisionTree
# Also removing the Target encoder, because there is high chance of data leakage

df = df[(df['Value'] > 0.85) & (df['Param encoder_type'] != 'Target') & (df['Param model'] != 'DecisionTree')].sort_values(by='Value', ascending=False).drop(columns=[col for col in df.columns if 'dt_' in col])

In [None]:
# we are left with several RF and only 1 XGBoost model
# taking the trail number 39 as our final model

(df.drop(columns=[col for col in df.columns if 'xgb' in col])
.sort_values(by=['Param rf_n_estimators','Param rf_min_samples_leaf','Param rf_min_samples_split'],ascending=[True,False,False]))

Unnamed: 0,Number,State,Value,Param encoder_type,Param model,Param rf_bootstrap,Param rf_max_depth,Param rf_max_features,Param rf_min_samples_leaf,Param rf_min_samples_split,Param rf_n_estimators
39,39,COMPLETE,0.910706,Freq,RandomForest,True,30.0,,2.0,8.0,100.0
9,9,COMPLETE,0.896934,Count,RandomForest,True,15.0,log2,3.0,3.0,200.0
46,46,COMPLETE,0.923029,Count,RandomForest,False,47.0,sqrt,1.0,6.0,200.0
34,34,COMPLETE,0.921676,Freq,RandomForest,False,36.0,sqrt,1.0,9.0,250.0
57,57,COMPLETE,0.90818,Count,RandomForest,True,44.0,,1.0,9.0,250.0
36,36,COMPLETE,0.918221,Freq,RandomForest,False,27.0,sqrt,2.0,10.0,300.0
47,47,COMPLETE,0.92312,Freq,RandomForest,False,46.0,sqrt,1.0,6.0,300.0
25,25,COMPLETE,0.918032,Count,RandomForest,False,50.0,sqrt,2.0,10.0,350.0
8,8,COMPLETE,0.909246,Count,RandomForest,True,44.0,,1.0,8.0,350.0
1,1,COMPLETE,0.880722,Freq,RandomForest,False,10.0,sqrt,3.0,6.0,500.0


In [None]:
# getting more sure whether it is generalizing or not

transformer = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('frequency',
             CountFrequencyEncoder(encoding_method='frequency'),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ],remainder='passthrough')

model = RandomForestRegressor(
            n_estimators=100,
            max_depth=30,
            bootstrap=True,
            max_features=None,
            min_samples_leaf=2,
            random_state=42,
            min_samples_split=8
        )

pipe = Pipeline(
    [
        ('transformer',transformer),
        ('model',model)
    ]
)

In [None]:
scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error'
    }

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(pipe,xtrain,ytrain,scoring=scoring,verbose=13,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.4min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.4min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.4min finished


In [None]:
# all metrics are more or less showing that model is stable in prediction, showing that model has generalized and ready to be deployed
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_r2,test_mae,test_mse
0,136.034388,1.99869,0.886847,-2665.554114,-71682860.0
1,136.534313,1.499766,0.929083,-2678.050528,-46440360.0
2,137.516283,1.32473,0.932857,-2660.376118,-39792190.0
3,135.926151,2.120445,0.93793,-2622.414812,-41946510.0
4,136.68414,1.486933,0.870449,-2759.864659,-104803800.0


In [20]:
pipe.fit(xtrain,ytrain)

0,1,2
,steps,"[('transformer', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ordinal_encoding', ...), ('frequency', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['New', 'Certified', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,encoding_method,'frequency'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,30
,min_samples_split,8
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
ypred = pipe.predict(xvalidation)



In [None]:
mae = mean_absolute_error(yvalidation,ypred)
mse = mean_squared_error(yvalidation,ypred)
rmse = root_mean_squared_error(yvalidation,ypred)

In [26]:
print('MAE on Validation set : ', mae)
print('MSE on Validation set : ', mse)
print('RMSE on Validation set : ',rmse)

MAE on Validation set :  2599.611624340572
MSE on Validation set :  33219770.525992308
RMSE on Validation set :  5763.659473458882


In [None]:
# random unseen data from cars.com

p = pd.DataFrame(np.array([2022, 50603, 'Honda', 'Civic Sport', 'Used',
       'silver', 'black', 'FWD',15 , 'Gasoline', 0,
       1, 0, 1, 0.0, 0.0,
       0.0, 0.0, 8, 2,
       'I4', 16, 0.0, 0.0, 'Asheville',
       'North California']).reshape(1,-1),columns=xtrain.columns)
p

Unnamed: 0,Model_Year,Mileage,Brand_Name,Model_Name,Stock_Type,Exterior_Color,Interior_Color,Drivetrain,Km/L,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Level2_Charging,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km/L_e_City,Km/L_e_Hwy,City,STATE
0,2022,50603,Honda,Civic Sport,Used,silver,black,FWD,15,Gasoline,0,1,0,1,0.0,0.0,0.0,0.0,8,2,I4,16,0.0,0.0,Asheville,North California


In [None]:
pipe.predict(p) # prediction is very close (actual price is 22k)



array([21720.61364025])