In [24]:
import numpy as np
import pandas as pd
import mlflow
import dagshub

In [25]:
dagshub.init(repo_owner='akshatsharma2407', repo_name='GMC_motors', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/GMC_motors.mlflow')

In [26]:
mlflow.autolog()
mlflow.set_experiment(experiment_name='GMC_exp_OE_BINARY')
mlflow.start_run()

2025/03/09 20:44:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/09 20:44:34 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2025/03/09 20:44:34 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/03/09 20:44:35 INFO mlflow.tracking.fluent: Experiment with name 'GMC_exp_OE_BINARY' does not exist. Creating a new experiment.


<ActiveRun: >

In [27]:
df = pd.read_csv('C:/Users/aksha/Downloads/CLEANED_GMC_DIESEL.csv')

In [28]:
df.drop(columns=['PRICE RANGE','MAKE ORIGIN','PARENT COMPANY','IMAGE','BRAND'],inplace=True)

In [29]:
df['AGE OF CAR'] = df['AGE OF CAR'].astype(str)
df['MODEL'] = df['MODEL'].astype(str)

In [30]:
df.dropna(inplace=True)

In [31]:
df.drop_duplicates(inplace=True)

In [32]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

In [33]:
xtrain,xtest,ytrain,ytest = train_test_split(df.drop(columns=['PRICE($)']),df['PRICE($)'],random_state=42,test_size=0.2)

In [34]:
ct1 = ColumnTransformer(
    [
        ('RatingImputer',SimpleImputer(missing_values=-1,strategy='mean'),['RATING']),
        ('OHE',ce.BinaryEncoder(return_df=True),['CAR NAME','MODEL/CLASS','DEALER NAME','DEALER LOCATION (CITY)','DEALER LOCATION (STATE)']),
        ('OE',OrdinalEncoder(categories=
                             [
                                 ["1937", "1951", "1952", "1966", "1968", "1977", "1979", "1984", "1986", "1987", 
        "1988", "1989", "1996", "1998", "1999", "2000", "2001", "2002", "2003", "2004", 
        "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
        "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
       ['Used','GMC Certified','New'],
       ["87", "73", "72", "58", "56", "47", "45", "40", "38", "37", "36", "35", "28", 
        "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", "14", 
        "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "0"]
       ]
       ),['MODEL','STOCK TYPE','AGE OF CAR'])
    ],
    remainder='passthrough'
)

In [35]:
ct2 = ColumnTransformer(
    [
        ('stdscaler',StandardScaler(),slice(0,820))
    ]
)

In [36]:
pipe = Pipeline([
    ('ct1',ct1),
    ('ct2',ct2)
])

In [None]:
pipe.set_output(transform='pandas')
xtrain_trans = pipe.fit_transform(xtrain)
xtest_trans = pipe.transform(xtest)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"


🏃 View run brawny-hare-501 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/0490a595c0e548dc9834e5357d471fff
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run hilarious-bat-375 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/ce5a6a61f83240018b8127ac2aa58b5e
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run flawless-hound-905 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/4225eb866a7c404ea893742348ca7a17
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4


In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [39]:
param_grids = {
    'LinearRegression': {},
    'DecisionTreeRegressor': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 200],
        'max_features': ['auto', 'sqrt']
    }
}

In [40]:
#baseline models
models = {
    'LinearRegression' : LinearRegression(),
    'DecisionTreeRegressor' : DecisionTreeRegressor(),
    'SVR' : SVR(),
    'RandomForestRegressor' : RandomForestRegressor()
}

In [41]:
for i,j in models.items():
    print('training ',i,'....')
    j.fit(xtrain_trans,ytrain)
    ypred = j.predict(xtest_trans)
    print('mean squared error',mean_squared_error(ytest,ypred))
    print('mean_absolute_error',mean_absolute_error(ytest,ypred))
    print('r2_score',r2_score(ytest,ypred))
    print('='*20)

    print('\n\n\n')

training  LinearRegression ....
mean squared error 99174815.28269999
mean_absolute_error 7511.66440932148
r2_score 0.7437645315509069




training  DecisionTreeRegressor ....
mean squared error 64693673.610774785
mean_absolute_error 5427.123606911162
r2_score 0.8328525874628853




training  SVR ....
mean squared error 386699069.7550315
mean_absolute_error 16156.59931456459
r2_score 0.0008953683950695401




training  RandomForestRegressor ....




mean squared error 47042847.00838783
mean_absolute_error 4561.039810484759
r2_score 0.8784565828934197






In [42]:
# hyperparameter tuning

for i,j in models.items():
    print('training ',i,'....')
    grid_search = GridSearchCV(j,param_grid=param_grids[i],cv=5,verbose=1,n_jobs=-1,scoring='r2')

    grid_search.fit(xtrain_trans,ytrain)

    print(f'best params for {i} is : ',grid_search.best_params_)
    print(f'best params for {j} is : ',grid_search.best_score_)

    print('='*20,'\n\n')

training  LinearRegression ....
Fitting 5 folds for each of 1 candidates, totalling 5 fits


2025/03/09 20:47:15 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


🏃 View run debonair-mink-682 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/16b844d2cb4a43d3b02d7649fd232c10
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4




best params for LinearRegression is :  {}
best params for LinearRegression() is :  0.7492441689274428


training  DecisionTreeRegressor ....
Fitting 5 folds for each of 9 candidates, totalling 45 fits


2025/03/09 20:47:42 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


🏃 View run marvelous-whale-541 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/b8ceb444e1c64b54a08bbdb3864da922
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run nosy-quail-205 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/291b0bc9b2724bf8a3fdea38c1b77473
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run traveling-newt-186 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/75680925bc624d038571ce3e1d362418
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run sassy-stag-599 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/5984f8eb1cd04c65960e8e649d0377a0
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4




best params for DecisionTreeRegressor is :  {'max_depth': 10, 'min_samples_split': 10}
best params for DecisionTreeRegressor() is :  0.8569174266123394


training  SVR ....
Fitting 5 folds for each of 6 candidates, totalling 30 fits


2025/03/09 20:49:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


🏃 View run inquisitive-colt-279 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/fc7fb2a3d257469e9b9d11b60b1667d6
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run enthused-whale-261 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/ef48ec75b4a2469a99d2ba67d9c7dd49
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run shivering-sloth-765 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/d56c8b607dc94172a8671be35d3db0b0
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run amusing-fawn-277 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/cd1ba2764d8447d78394fdfa771b82d3
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4




best params for SVR is :  {'C': 10, 'kernel': 'linear'}
best params for SVR() is :  0.725715089479128


training  RandomForestRegressor ....
Fitting 5 folds for each of 6 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\s

🏃 View run mysterious-snipe-120 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/bbd7a374d7cb4feea46ac28963cf82b8
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run enchanting-conch-957 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/8d6a0e3539bb4801b9e459c63af59f04
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run hilarious-hawk-748 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/3ea2001e150243fe919f499211c24e42
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
🏃 View run efficient-smelt-811 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/30428755947241219b4218186245dec1
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4




best params for RandomForestRegressor is :  {'max_features': 'sqrt', 'n_estimators': 200}
best params for RandomForestRegressor() is :  0.867517853506028




In [43]:
best_estimator = grid_search.best_estimator_

In [44]:
ypred = best_estimator.predict(xtest_trans)

In [45]:
print('mean squared error',mean_squared_error(ytest,ypred))
print('mean_absolute_error',mean_absolute_error(ytest,ypred))
print('r2_score',r2_score(ytest,ypred))

mean squared error 48689221.92369285
mean_absolute_error 4704.378175679433
r2_score 0.8742028855564142


In [46]:
mlflow.end_run()

🏃 View run handsome-dolphin-607 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4/runs/05e591fef49a45ae9f7a402e3dde067f
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/4
