In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import dagshub
import mlflow

In [2]:
dagshub.init(repo_owner='akshatsharma2407', repo_name='GMC_motors', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/GMC_motors.mlflow')

In [3]:
mlflow.autolog()
mlflow.set_experiment(experiment_name='Random_forest_best_HP')
mlflow.start_run()

2025/03/11 10:16:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


<ActiveRun: >

In [4]:
data = pd.read_csv('C:/Users/aksha/OneDrive/Desktop/GMC_MLOPS/data/processed/train_processed_df.csv')

xtrain = data.drop(columns='Price($)')
ytrain = data['Price($)']

In [5]:
rf_hyperparams = {
    "n_estimators": [50, 100, 200, 500],  
    "max_depth": [None, 10, 20, 30, 50], 
    "min_samples_split": [2, 5, 10], 
    "min_samples_leaf": [1, 2, 4, 8], 
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2],  
    "max_features": ["sqrt", "log2", None], 
    "max_leaf_nodes": [None, 10, 50, 100],  
    "min_impurity_decrease": [0.0, 0.01, 0.05], 
    "bootstrap": [True, False],  
    "ccp_alpha": [0.0, 0.01, 0.1]
}


In [6]:
model = RandomForestRegressor()

random_search = RandomizedSearchCV(model,param_distributions=rf_hyperparams,cv=2,verbose=1,scoring='r2')

random_search.fit(xtrain,ytrain)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


2025/03/11 10:18:28 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run debonair-duck-983 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5/runs/ca040519318a4426a9b9bf23c3b7a1a1
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5
🏃 View run blushing-shrew-278 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5/runs/f6b6af1516e547078e12d99c35975a47
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5
🏃 View run brawny-grouse-648 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5/runs/6b80ef870ee14935992194ed64b81e96
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5
🏃 View run traveling-moth-56 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5/runs/2356c91f64fd434eadc2182f9b63b434
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5
🏃 View run crawling-sloth-917 at: https://dagshub.com/aksha

In [7]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_weight_fraction_leaf,param_min_samples_split,param_min_samples_leaf,param_min_impurity_decrease,param_max_leaf_nodes,param_max_features,param_max_depth,param_ccp_alpha,param_bootstrap,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.209752,0.007367,0.040674,0.000775,100,0.1,2,1,0.01,,sqrt,30,0.1,False,"{'n_estimators': 100, 'min_weight_fraction_lea...",0.733263,0.750887,0.742075,0.008812,6
1,1.11464,0.027616,0.160316,0.002386,500,0.1,10,4,0.05,100.0,log2,30,0.1,False,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.750733,0.74743,0.749081,0.001652,4
2,0.088934,0.001804,0.015274,0.001648,50,0.2,5,1,0.05,50.0,log2,50,0.1,True,"{'n_estimators': 50, 'min_weight_fraction_leaf...",0.639788,0.632518,0.636153,0.003635,8
3,0.226377,0.015044,0.034034,0.005034,100,0.1,5,2,0.05,100.0,log2,50,0.1,True,"{'n_estimators': 100, 'min_weight_fraction_lea...",0.751803,0.744264,0.748034,0.003769,5
4,1.074044,0.021784,0.143671,0.005443,500,0.1,2,4,0.0,100.0,log2,20,0.0,True,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.748305,0.753267,0.750786,0.002481,3
5,1.366304,0.146272,0.064095,0.003364,200,0.0,5,4,0.05,10.0,,30,0.1,True,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.833532,0.838706,0.836119,0.002587,1
6,2.105185,0.095495,0.138934,0.00064,500,0.1,5,8,0.01,10.0,,30,0.0,True,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.784654,0.800491,0.792573,0.007918,2
7,0.994018,0.213716,0.124848,0.002238,500,0.2,10,8,0.01,50.0,log2,10,0.1,False,"{'n_estimators': 500, 'min_weight_fraction_lea...",0.637034,0.630279,0.633656,0.003377,10
8,0.343321,0.021267,0.0549,0.006665,200,0.2,2,2,0.05,100.0,sqrt,10,0.01,False,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.638449,0.63329,0.63587,0.002579,9
9,0.692738,0.005834,0.062892,0.000521,200,0.2,5,8,0.01,100.0,,30,0.0,True,"{'n_estimators': 200, 'min_weight_fraction_lea...",0.701649,0.720123,0.710886,0.009237,7


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd

# Load data
test_df = pd.read_csv('C:/Users/aksha/OneDrive/Desktop/GMC_MLOPS/data/processed/test_processed_df.csv')

xtest = test_df.drop(columns='Price($)')
ytest = test_df['Price($)']

# Define model
model = RandomForestRegressor()

# Apply cross-validation
cv_scores = cross_val_score(model, xtest, ytest, cv=5, scoring='r2')
mae_scores = cross_val_score(model, xtest, ytest, cv=5, scoring='neg_mean_absolute_error')

print(f'Cross-validated R2 scores: {cv_scores}')
print(f'Mean R2 score: {cv_scores.mean()}')

print(f'Cross-validated MAE scores: {-mae_scores}')
print(f'Mean MAE score: {-mae_scores.mean()}')


Cross-validated R2 scores: [0.87242532 0.85768224 0.84821779 0.87825897 0.87222614]
Mean R2 score: 0.865762094456608
Cross-validated MAE scores: [4802.99143199 5188.67331286 5002.3030525  4954.65069987 4933.41247682]
Mean MAE score: 4976.406194807047


In [10]:
mlflow.end_run()

🏃 View run adaptable-pug-724 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5/runs/5113aa4c43d34275b2a30f03002531a9
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/5


In [13]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}