In [48]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV,train_test_split
import dagshub
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [49]:
dagshub.init(repo_owner='akshatsharma2407', repo_name='GMC_motors', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/GMC_motors.mlflow')

In [50]:
mlflow.autolog()
mlflow.set_experiment(experiment_name='RF_BEST_HP')
mlflow.start_run()

2025/03/11 11:08:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/11 11:08:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


<ActiveRun: >

In [51]:
df = pd.read_csv('C:/Users/aksha/Downloads/CLEANED_GMC_DIESEL.csv')

df.drop(columns=['PRICE RANGE','MAKE ORIGIN','PARENT COMPANY','IMAGE','BRAND'],inplace=True)

df['AGE OF CAR'] = df['AGE OF CAR'].astype(str)
df['MODEL'] = df['MODEL'].astype(str)

df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

xtrain,xtest,ytrain,ytest = train_test_split(df.drop(columns='PRICE($)'),df['PRICE($)'],test_size=0.2)

In [52]:
ct1 = ColumnTransformer(
            [
                ('RatingImputer',SimpleImputer(missing_values=-1,strategy='mean'),['RATING']),
                ('OHE',ce.TargetEncoder(verbose=1),['CAR NAME','MODEL/CLASS','DEALER NAME','DEALER LOCATION (CITY)','DEALER LOCATION (STATE)']),
                ('OE',OrdinalEncoder(categories=
                                    [
                                        ["1937", "1951", "1952", "1966", "1968", "1977", "1979", "1984", "1986", "1987", 
                "1988", "1989", "1996", "1998", "1999", "2000", "2001", "2002", "2003", "2004", 
                "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
                "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
            ['Used','GMC Certified','New'],
            ["87", "73", "72", "58", "56", "47", "45", "40", "38", "37", "36", "35", "28", 
                "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", "14", 
                "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "0"]
            ]
            ),['MODEL','STOCK TYPE','AGE OF CAR'])
            ],
            remainder='passthrough'
        )


ct2 = ColumnTransformer(
            [
                ('stdscaler',StandardScaler(),slice(0,13))
            ]
        )

In [53]:
pipe = Pipeline([
    ('ct1',ct1),
    ('ct2',ct2)
])

In [54]:
pipe.set_output(transform='pandas')
xtrain_trans = pipe.fit_transform(xtrain,ytrain)
xtest_trans = pipe.transform(xtest)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"


In [55]:
mlflow.sklearn.log_model(pipe, "pipeline_model")



<mlflow.models.model.ModelInfo at 0x20b86135a00>

In [56]:
rf_hyperparams = {
    "n_estimators": [50, 100, 200, 500],  
    "max_depth": [None, 10, 20, 30, 50], 
    "min_samples_split": [2, 5, 10], 
    "min_samples_leaf": [1, 2, 4, 8], 
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2],  
    "max_features": ["sqrt", "log2", None], 
    "max_leaf_nodes": [None, 10, 50, 100],  
    "min_impurity_decrease": [0.0, 0.01, 0.05], 
    "bootstrap": [True, False],  
    "ccp_alpha": [0.0, 0.01, 0.1]
}


In [57]:
model = RandomForestRegressor()

random_search = RandomizedSearchCV(model,param_distributions=rf_hyperparams,cv=2,verbose=1,scoring='r2')

random_search.fit(xtrain_trans,ytrain)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


2025/03/11 11:09:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


In [58]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_weight_fraction_leaf,param_min_samples_split,param_min_samples_leaf,param_min_impurity_decrease,param_max_leaf_nodes,param_max_features,param_max_depth,param_ccp_alpha,param_bootstrap,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.623173,0.009929,0.08389,0.000785,500,0.2,2,4,0.0,10.0,sqrt,,0.1,True,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.644323,0.634215,0.639269,0.005054,8
1,0.238389,0.00206,0.033878,0.000103,200,0.2,5,4,0.01,,log2,10.0,0.1,True,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.663836,0.633946,0.648891,0.014945,6
2,0.272272,0.001724,0.037503,0.000414,200,0.1,10,4,0.0,,log2,10.0,0.01,False,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.753917,0.733127,0.743522,0.010395,4
3,0.235664,0.003646,0.033252,0.000254,200,0.2,5,2,0.0,10.0,sqrt,,0.0,True,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.66325,0.635023,0.649136,0.014114,5
4,0.066306,0.000491,0.011462,0.000459,50,0.1,5,1,0.01,,log2,50.0,0.0,False,"{'n_estimators': 50, 'min_weight_fraction_leaf...",0.752761,0.736983,0.744872,0.007889,3
5,0.109848,0.002973,0.017044,4.5e-05,100,0.2,5,1,0.0,100.0,log2,20.0,0.01,False,"{'n_estimators': 100, 'min_weight_fraction_lea...",0.63086,0.621036,0.625948,0.004912,9
6,1.587255,0.054935,0.111522,0.064527,200,0.0,5,2,0.05,50.0,,10.0,0.1,False,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.855691,0.866834,0.861263,0.005572,2
7,0.078771,0.007166,0.013622,0.001378,50,0.2,2,1,0.05,50.0,sqrt,30.0,0.0,False,"{'n_estimators': 50, 'min_weight_fraction_leaf...",0.614365,0.613629,0.613997,0.000368,10
8,1.431787,0.023345,0.132079,0.002275,500,0.0,5,4,0.0,50.0,sqrt,20.0,0.01,True,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.861905,0.860912,0.861408,0.000497,1
9,0.060905,0.000793,0.009493,0.000506,50,0.2,2,8,0.01,100.0,log2,50.0,0.1,True,"{'n_estimators': 50, 'min_weight_fraction_leaf...",0.656035,0.641543,0.648789,0.007246,7


In [59]:
best_model = random_search.best_estimator_

ypred = best_model.predict(xtest_trans)

print('mean squared error',mean_squared_error(ytest,ypred))
print('mean_absolute_error',mean_absolute_error(ytest,ypred))
print('r2_score',r2_score(ytest,ypred))

mean squared error 61941366.07610565
mean_absolute_error 5876.733336783799
r2_score 0.8441418650715868


In [60]:
mlflow.end_run()

🏃 View run incongruous-ox-941 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/6/runs/67b05a32212b42f0af2bae36b828e09f
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/6
