In [1]:
import numpy as np
import pandas as pd
import mlflow
import dagshub

In [2]:
dagshub.init(repo_owner='akshatsharma2407', repo_name='GMC_motors', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/GMC_motors.mlflow')

In [None]:
mlflow.autolog()
mlflow.set_experiment(experiment_name='GMC_exp_OE_BINARY')
mlflow.start_run()

In [3]:
df = pd.read_csv('C:/Users/aksha/Downloads/CLEANED_GMC_DIESEL.csv')

In [4]:
df.drop(columns=['PRICE RANGE','MAKE ORIGIN','PARENT COMPANY','IMAGE','BRAND'],inplace=True)

In [5]:
df['AGE OF CAR'] = df['AGE OF CAR'].astype(str)
df['MODEL'] = df['MODEL'].astype(str)

In [6]:
df.dropna(inplace=True)

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

In [9]:
xtrain,xtest,ytrain,ytest = train_test_split(df.drop(columns=['PRICE($)']),df['PRICE($)'],random_state=42,test_size=0.2)

In [10]:
ct1 = ColumnTransformer(
    [
        ('RatingImputer',SimpleImputer(missing_values=-1,strategy='mean'),['RATING']),
        ('OHE',ce.BinaryEncoder(return_df=True),['CAR NAME','MODEL/CLASS','DEALER NAME','DEALER LOCATION (CITY)','DEALER LOCATION (STATE)']),
        ('OE',OrdinalEncoder(categories=
                             [
                                 ["1937", "1951", "1952", "1966", "1968", "1977", "1979", "1984", "1986", "1987", 
        "1988", "1989", "1996", "1998", "1999", "2000", "2001", "2002", "2003", "2004", 
        "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
        "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
       ['Used','GMC Certified','New'],
       ["87", "73", "72", "58", "56", "47", "45", "40", "38", "37", "36", "35", "28", 
        "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", "14", 
        "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "0"]
       ]
       ),['MODEL','STOCK TYPE','AGE OF CAR'])
    ],
    remainder='passthrough'
)

In [11]:
ct2 = ColumnTransformer(
    [
        ('stdscaler',StandardScaler(),slice(0,820))
    ]
)

In [12]:
pipe = Pipeline([
    ('ct1',ct1),
    ('ct2',ct2)
])

In [15]:
pipe.set_output(transform='pandas')
xtrain_trans = pipe.fit_transform(xtrain)
xtest_trans = pipe.transform(xtest)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [17]:
param_grids = {
    'LinearRegression': {},
    'DecisionTreeRegressor': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 200],
        'max_features': ['auto', 'sqrt']
    }
}

In [18]:
#baseline models
models = {
    'LinearRegression' : LinearRegression(),
    'DecisionTreeRegressor' : DecisionTreeRegressor(),
    'SVR' : SVR(),
    'RandomForestRegressor' : RandomForestRegressor()
}

In [19]:
for i,j in models.items():
    print('training ',i,'....')
    j.fit(xtrain_trans,ytrain)
    ypred = j.predict(xtest_trans)
    print('mean squared error',mean_squared_error(ytest,ypred))
    print('mean_absolute_error',mean_absolute_error(ytest,ypred))
    print('r2_score',r2_score(ytest,ypred))
    print('='*20)

    print('\n\n\n')

training  LinearRegression ....
mean squared error 99174815.28269999
mean_absolute_error 7511.66440932148
r2_score 0.7437645315509069




training  DecisionTreeRegressor ....
mean squared error 64667025.88151421
mean_absolute_error 5404.709212083982
r2_score 0.8329214365287567




training  SVR ....
mean squared error 386699069.7550315
mean_absolute_error 16156.59931456459
r2_score 0.0008953683950695401




training  RandomForestRegressor ....
mean squared error 47298171.10587973
mean_absolute_error 4567.7046396366695
r2_score 0.8777969084635687






In [20]:
# hyperparameter tuning

for i,j in models.items():
    print('training ',i,'....')
    grid_search = GridSearchCV(j,param_grid=param_grids[i],cv=5,verbose=1,n_jobs=-1,scoring='r2')

    grid_search.fit(xtrain_trans,ytrain)

    print(f'best params for {i} is : ',grid_search.best_params_)
    print(f'best params for {j} is : ',grid_search.best_score_)

    print('='*20,'\n\n')

training  LinearRegression ....
Fitting 5 folds for each of 1 candidates, totalling 5 fits
best params for LinearRegression is :  {}
best params for LinearRegression() is :  0.7492441689274428


training  DecisionTreeRegressor ....
Fitting 5 folds for each of 9 candidates, totalling 45 fits
best params for DecisionTreeRegressor is :  {'max_depth': 10, 'min_samples_split': 10}
best params for DecisionTreeRegressor() is :  0.8573591883882962


training  SVR ....
Fitting 5 folds for each of 6 candidates, totalling 30 fits
best params for SVR is :  {'C': 10, 'kernel': 'linear'}
best params for SVR() is :  0.725715089479128


training  RandomForestRegressor ....
Fitting 5 folds for each of 6 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\aksha\AppData\Local\Programs\Python\Python312\Lib\

best params for RandomForestRegressor is :  {'max_features': 'sqrt', 'n_estimators': 200}
best params for RandomForestRegressor() is :  0.8676133820147115




In [21]:
best_estimator = grid_search.best_estimator_

In [22]:
ypred = best_estimator.predict(xtest_trans)

In [23]:
print('mean squared error',mean_squared_error(ytest,ypred))
print('mean_absolute_error',mean_absolute_error(ytest,ypred))
print('r2_score',r2_score(ytest,ypred))

mean squared error 48832930.62228638
mean_absolute_error 4711.191911513683
r2_score 0.8738315890170725


In [None]:
mlflow.end_run()