In [248]:
import numpy as np
import pandas as pd
import mlflow
import dagshub

In [249]:
dagshub.init(repo_owner='akshatsharma2407', repo_name='GMC_motors', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/GMC_motors.mlflow')

In [250]:
mlflow.autolog()
mlflow.set_experiment(experiment_name='GMC_exp1')
mlflow.start_run()

2025/03/09 18:44:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/09 18:44:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2025/03/09 18:44:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/03/09 18:44:18 INFO mlflow.tracking.fluent: Experiment with name 'GMC_exp1' does not exist. Creating a new experiment.


<ActiveRun: >

In [251]:
df = pd.read_csv('C:/Users/aksha/Downloads/CLEANED_GMC_DIESEL.csv')

In [252]:
df.drop(columns=['PRICE RANGE','MAKE ORIGIN','PARENT COMPANY','IMAGE','BRAND'],inplace=True)

In [253]:
df['AGE OF CAR'] = df['AGE OF CAR'].astype(str)
df['MODEL'] = df['MODEL'].astype(str)

In [254]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9929 entries, 0 to 9928
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CAR NAME                 9929 non-null   object 
 1   MODEL/CLASS              9929 non-null   object 
 2   MODEL                    9929 non-null   object 
 3   PRICE($)                 9929 non-null   float64
 4   STOCK TYPE               9929 non-null   object 
 5   MILEAGE                  9929 non-null   int64  
 6   AGE OF CAR               9929 non-null   object 
 7   RATING                   9929 non-null   float64
 8   REVIEW                   9929 non-null   float64
 9   DEALER NAME              9929 non-null   object 
 10  DEALER LOCATION (CITY)   9929 non-null   object 
 11  DEALER LOCATION (STATE)  9928 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 931.0+ KB


In [255]:
df.dropna(inplace=True)

In [256]:
df.drop_duplicates(inplace=True)

In [257]:
df

Unnamed: 0,CAR NAME,MODEL/CLASS,MODEL,PRICE($),STOCK TYPE,MILEAGE,AGE OF CAR,RATING,REVIEW,DEALER NAME,DEALER LOCATION (CITY),DEALER LOCATION (STATE)
0,GMC Sierra 3500 Denali,Sierra 3500,2024,82648.0,New,0,0,3.1,507.0,Kunes Chevrolet GMC of Elkhorn,Elkhorn,Wisconsin
1,GMC Sierra 2500 SLE,Sierra 2500,2022,46989.0,Used,51587,2,4.7,1443.0,Kearns Motor Car Co.,Johnson Creek,Wisconsin
2,GMC Sierra 1500 Elevation,Sierra 1500,2024,57375.0,New,0,0,4.1,49.0,Lafayette GMC Cadillac,Lafayette,Indiana
3,GMC Sierra 1500 AT4X,Sierra 1500,2024,75787.0,New,0,0,4.3,9.0,Pilson Chevrolet Buick GMC,Clinton,Indiana
4,GMC Sierra 1500 Elevation,Sierra 1500,2021,36667.0,Used,41976,3,-1.0,8.0,"H&K Chevrolet, Inc.",Continental,Ohio
...,...,...,...,...,...,...,...,...,...,...,...,...
9922,GMC Sierra 1500 Base,Sierra 1500,2017,26989.0,Used,51537,7,4.2,880.0,Corwin Motors Kalispell,Kalispell,Montana
9923,GMC Sierra 2500 Base,Sierra 2500,2022,69198.0,Used,28975,2,4.4,1188.0,Motor City Buick GMC,Bakersfield,California
9924,GMC Sierra 3500 Base,Sierra 3500,2024,65368.0,New,0,0,4.4,1188.0,Motor City Buick GMC,Bakersfield,California
9926,GMC Sierra 1500 Elevation,Sierra 1500,2024,64579.0,New,0,0,4.5,743.0,Chevrolet GMC of Fairbanks,Fairbanks,Alaska


In [258]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

In [259]:
xtrain,xtest,ytrain,ytest = train_test_split(df.drop(columns=['PRICE($)']),df['PRICE($)'],random_state=42,test_size=0.2)

In [260]:
print(xtrain.shape)
print(xtest.shape)

(6719, 11)
(1680, 11)


In [261]:
ct1 = ColumnTransformer(
    [
        ('RatingImputer',SimpleImputer(missing_values=-1,strategy='mean'),['RATING']),
        ('OHE',OneHotEncoder(drop='first',sparse_output=False,min_frequency=5,handle_unknown='ignore'),['CAR NAME','MODEL/CLASS','DEALER NAME','DEALER LOCATION (CITY)','DEALER LOCATION (STATE)']),
        ('OE',OrdinalEncoder(categories=
                             [
                                 ["1937", "1951", "1952", "1966", "1968", "1977", "1979", "1984", "1986", "1987", 
        "1988", "1989", "1996", "1998", "1999", "2000", "2001", "2002", "2003", "2004", 
        "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
        "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
       ['Used','GMC Certified','New'],
       ["87", "73", "72", "58", "56", "47", "45", "40", "38", "37", "36", "35", "28", 
        "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", "14", 
        "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "0"]
       ]
       ),['MODEL','STOCK TYPE','AGE OF CAR'])
    ],
    remainder='passthrough'
)

In [262]:
ct2 = ColumnTransformer(
    [
        ('stdscaler',StandardScaler(),slice(0,820))
    ]
)

In [263]:
pipe = Pipeline([
    ('ct1',ct1),
    ('ct2',ct2)
])

In [264]:
pipe.set_output(transform='pandas')
xtrain_trans = pipe.fit_transform(xtrain)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [266]:
param_grids = {
    'LinearRegression': {},
    'DecisionTreeRegressor': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 200],
        'max_features': ['auto', 'sqrt']
    }
}

In [267]:
from sklearn.model_selection import GridSearchCV

baseline_models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'SVR': SVR(),
    'RandomForestRegressor': RandomForestRegressor()
}

for name, model in baseline_models.items():
    print(f"Training {name}...")

    grid_search = GridSearchCV(
        model, 
        param_grids[name],
        cv=5, 
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(xtrain_trans, ytrain)

    print(f"Best params for {name}: {grid_search.best_params_}")
    print(f"Best R² score: {grid_search.best_score_}\n")


Training LinearRegression...
Fitting 5 folds for each of 1 candidates, totalling 5 fits


2025/03/09 18:45:13 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best params for LinearRegression: {}
Best R² score: 0.8617482576531474

Training DecisionTreeRegressor...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


2025/03/09 18:45:50 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


Best params for DecisionTreeRegressor: {'max_depth': 20, 'min_samples_split': 10}
Best R² score: 0.8673232668744288

Training SVR...
Fitting 5 folds for each of 6 candidates, totalling 30 fits


2025/03/09 18:52:58 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


Best params for SVR: {'C': 10, 'kernel': 'linear'}
Best R² score: 0.857584467622851

Training RandomForestRegressor...
Fitting 5 folds for each of 6 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\s

Best params for RandomForestRegressor: {'max_features': 'sqrt', 'n_estimators': 200}
Best R² score: 0.8720759468228326



In [268]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [269]:
best_base_model = grid_search.best_estimator_

In [270]:
xtest_trans = pipe.transform(xtest)



In [271]:
ypred = best_base_model.predict(xtest_trans)

In [272]:
print(mean_absolute_error(ytest,ypred),
mean_squared_error(ytest,ypred),
r2_score(ytest,ypred))

4618.7083142433 47311103.83866583 0.8777634944881121


In [274]:
mlflow.end_run()

🏃 View run kindly-chimp-23 at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/1/runs/95b9bbb28ef347279506a874d9790381
🧪 View experiment at: https://dagshub.com/akshatsharma2407/GMC_motors.mlflow/#/experiments/1
