In [71]:
import pandas as pd
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [None]:
df = pd.read_parquet("C:/Users/aksha/OneDrive/Desktop/CARS-FINAL_YEAR_PROJECT/DATA/04_After_data_preprocessing/CLEANED_IMPUTED_PREPROCESSED.parquet").drop(columns='Image_List')
pd.set_option('display.max_columns', None)

In [18]:
df.head()  

Unnamed: 0,Model_Year,Mileage,Brand_Name,Model_Name,Stock_Type,Price,Exterior_Color,Interior_Color,Drivetrain,Km/L,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Level2_Charging,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km/L_e_City,Km/L_e_Hwy,City,STATE
0,2022,27565,Ford,F-150 Lightning Platinum,Used,48500.0,gray,gray,AWD,0.0,Electric,False,True,True,False,19.0,41.0,131.0,466.6,1,0.0,,0,31.0,26.0,gower,missouri
2,2014,117487,Ford,F-150 STX,Used,16998.0,black,black,4WD,5.0,Flex Fuel,True,True,False,True,0.0,0.0,0.0,0.0,6,5.0,V8,32,0.0,0.0,cortland,ohio
3,2023,63052,Chevrolet,Tahoe 4WD Z71,Used,54495.0,black,black,4WD,7.0,Gasoline,False,True,True,True,0.0,0.0,0.0,0.0,10,6.2,V8,16,0.0,0.0,granbury,texas
4,2024,9531,Audi,Q5 45 S line quattro Premium,Certified,40204.0,gray,black,AWD,11.0,Gasoline,False,True,False,False,0.0,0.0,0.0,0.0,7,2.0,I4,16,0.0,0.0,knoxville,tennessee
5,2017,84516,Toyota,Highlander Limited,Used,21950.0,white,yellow,FWD,4.0,Gasoline,False,True,False,True,0.0,0.0,0.0,0.0,8,3.5,V6,24,0.0,0.0,albuquerque,new mexico


In [46]:
cat_cols = ['Brand_Name','Model_Name','Exterior_Color','Interior_Color',
            'Drivetrain','Fuel_Type','Cylinder_Config','City','STATE','Stock_Type']

df[cat_cols] = df[cat_cols].astype('object')

In [None]:
# dividing data into train, validation and test

train, temp = train_test_split(df,test_size=0.2,random_state=42,shuffle=True)
validation, test = train_test_split(temp, test_size=0.4, random_state=42)

In [None]:
xtrain = train.drop(columns=['Price'])
ytrain = train['Price'].copy()

In [69]:
xvalidation = validation.drop(columns='Price')
yvalidation = validation['Price'].copy()

In [None]:
# saving data for future use

train.to_parquet('train.parquet')
validation.to_parquet('validation.parquet')
test.to_parquet('test.parquet')

In [None]:
# encoding of strings

transformer = ColumnTransformer(
    [
        ('ordinal_encoding',
         OrdinalEncoder(categories=[['New','Certified','Used']]),
         ['Stock_Type']),

       ('count_encoder',
        CountFrequencyEncoder(encoding_method='count'),
        ['Brand_Name','Model_Name','Exterior_Color',
         'Interior_Color','Drivetrain','Fuel_Type',
         'Cylinder_Config','City','STATE'])
    ],
    remainder='passthrough'
)
transformer.set_output(transform='pandas')

0,1,2
,transformers,"[('ordinal_encoding', ...), ('count_encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['New', 'Certified', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'


In [None]:
# encoding + model 

pipeline = Pipeline(
    [
        ('preprocessor',transformer),
        ('model', LinearRegression())
    ]
)

param_grid = [
    {'model' : [RandomForestRegressor()]},
    {'model' : [DecisionTreeRegressor()]},
    {'model' : [XGBRegressor()]},
    {'model' : [SVR()]},
    {'model' : [LinearRegression()]}
]

In [None]:
# tracking the experimentation

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow')
dagshub.init(repo_owner='akshatsharma2407', repo_name='AutoNexusMlOps', mlflow=True)

mlflow.sklearn.autolog()
mlflow.set_experiment('Base Models')

with mlflow.start_run(run_name='all models',nested=True) as parent:
    grid_search = GridSearchCV(estimator=pipeline,param_grid=param_grid,
                               cv=3,scoring='neg_root_mean_squared_error',
                               n_jobs=-1,verbose=2)
    grid_search.fit(xtrain,ytrain)

with mlflow.start_run(run_name='best model') as best:
    signature = mlflow.models.infer_signature(model_input=xtrain.head(),model_output=grid_search.best_estimator_.predict(xtrain))
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('rmse',grid_search.best_score_)
    mlflow.sklearn.log_model(grid_search.best_estimator_, 'model', signature=signature)
    
    y_train_pred = grid_search.best_estimator_.predict(xtrain)
    y_validate_pred = grid_search.best_estimator_.predict(xvalidation)

    train_metric_dict = {
        'train_mae' : mean_absolute_error(ytrain,y_train_pred),
        'train_rmse': root_mean_squared_error(ytrain,y_train_pred),
        'train_r2'  : r2_score(ytrain,y_train_pred),
        'train_mse' : mean_squared_error(ytrain,y_train_pred)
    }

    test_metric_dict = {
        'test_mae' : mean_absolute_error(yvalidation,y_validate_pred),
        'test_rmse': root_mean_squared_error(yvalidation,y_validate_pred),
        'test_r2'  : r2_score(yvalidation,y_validate_pred),
        'test_mse' : mean_squared_error(yvalidation,y_validate_pred)
    }

    mlflow.log_metrics(train_metric_dict)
    mlflow.log_metrics(test_metric_dict)



🏃 View run best model at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/0/runs/6a0ab5b2d6824458ae0a110e2db07b6a
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/0
