## IMPORTING LIBRARIES

In [363]:
import pandas as pd
import numpy as np
import sklearn
from catboost import CatBoostRegressor,Pool
from sklearn.model_selection import train_test_split
import joblib
import mlflow
import mlflow.catboost

## IMPORTING DATASET

In [364]:
df=pd.read_csv("dataset.csv")
df.head(5)

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive


## EXPLORING DATASET AND GATHERING DATA INFORMATION

In [365]:
df.shape # The dataset has 1002 Rows and 17 Columns/Features

(1002, 17)

In [366]:
df.describe() # It gives the basic statistical info about our data.

Unnamed: 0,year,price,cylinders,mileage,doors
count,1002.0,979.0,897.0,968.0,995.0
mean,2023.916168,50202.9857,4.975474,69.033058,3.943719
std,0.298109,18700.392062,1.392526,507.435745,0.274409
min,2023.0,0.0,0.0,0.0,2.0
25%,2024.0,36600.0,4.0,4.0,4.0
50%,2024.0,47165.0,4.0,8.0,4.0
75%,2024.0,58919.5,6.0,13.0,4.0
max,2025.0,195895.0,8.0,9711.0,5.0


In [367]:
df.dtypes

name               object
description        object
make               object
model              object
year                int64
price             float64
engine             object
cylinders         float64
fuel               object
mileage           float64
transmission       object
trim               object
body               object
doors             float64
exterior_color     object
interior_color     object
drivetrain         object
dtype: object

In [368]:
df.isnull().sum()

name                0
description        56
make                0
model               0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [369]:
df['engine'].mode()

0    16V GDI DOHC Turbo
Name: engine, dtype: object

In [370]:
df['price'].fillna(df['price'].mean())
df['engine'].fillna(df['engine'].mode())

0                                 24V GDI DOHC Twin Turbo
1                                                     OHV
2       6.2L V-8 gasoline direct injection, variable v...
3                                            16V MPFI OHV
4                                24V DDI OHV Turbo Diesel
                              ...                        
997                             16V DDI DOHC Turbo Diesel
998     4 gasoline direct injection, DOHC, Multiair va...
999                               24V GDI DOHC Twin Turbo
1000    6 DOHC, variable valve control, regular unlead...
1001    8 gasoline direct injection, variable valve co...
Name: engine, Length: 1002, dtype: object

In [372]:
df['price']=df['price'].fillna(df['price'].mean())
df['name']=df['name'].fillna('unknown')
df['make']=df['make'].fillna('unknown')
df['description']=df['description'].fillna('unknown')
df['model']=df['model'].fillna('unknown')
df['engine']=df['engine'].fillna('unknown')
df['fuel']=df['fuel'].fillna('unknown')
df['transmission']=df['transmission'].fillna('unknown')
df['trim']=df['trim'].fillna('unknown')
df['body']=df['body'].fillna('unknown')
df['exterior_color']=df['exterior_color'].fillna('unknown')
df['interior_color']=df['interior_color'].fillna('unknown')
df['drivetrain']=df['drivetrain'].fillna('unknown')

In [358]:
cat_features=['name','make', 'description','model', 'engine', 'fuel', 'transmission', 'trim',
    'body', 'exterior_color', 'interior_color', 'drivetrain']

In [359]:
X=df.drop('price',axis=1)
Y=df['price']

In [360]:
# Split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [361]:
with mlflow.start_run():
    model=CatBoostRegressor(iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100)

    model.fit(X_train, Y_train,
          cat_features=cat_features,
          eval_set=(X_test, Y_test))

    from sklearn.metrics import mean_squared_error,r2_score
    Y_pred=model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    r2 = r2_score(Y_test, Y_pred)
    print(r2)
    print(rmse)

    # Log the parameter
    mlflow.log_param("iterations",1000)
    mlflow.log_param("learning_rate",0.1)
    mlflow.log_param("depth",6)

    # Log metrics
    mlflow.log_metric("rmse",rmse)
    mlflow.log_metric("r2_score",r2)

    # Log the model
    mlflow.catboost.log_model(model,name="catboost_model")

    print(f"Logged to MLflow: RMSE={rmse:.2f}, R2={r2:.2f}")

ValueError: 'door' is not in list

In [319]:
model.save_model("vehicle_model.cbm")

In [320]:
# with mlflow.start_run():
#     model=CatBoostRegressor(iterations=900,
#     learning_rate=0.2,
#     depth=6,
#     loss_function='RMSE',
#     eval_metric='RMSE',
#     random_seed=42,
#     early_stopping_rounds=50,
#     verbose=100)
#
#     model.fit(X_train, Y_train,
#           cat_features=cat_features,
#           eval_set=(X_test, Y_test))
#
#     from sklearn.metrics import mean_squared_error,r2_score
#     Y_pred=model.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
#     r2 = r2_score(Y_test, Y_pred)
#     print(r2)
#     print(rmse)
#
#     # Log the parameter
#     mlflow.log_param("iterations",900)
#     mlflow.log_param("learning_rate",0.2)
#     mlflow.log_param("depth",6)
#
#     # Log metrics
#     mlflow.log_metric("rmse",rmse)
#     mlflow.log_metric("r2_score",r2)
#
#     # Log the model
#     mlflow.catboost.log_model(model,name="catboost_model_2")
#
#     print(f"Logged to MLflow: RMSE={rmse:.2f}, R2={r2:.2f}")

In [321]:
# new_car=pd.DataFrame([{'name':'scorpio','description':'black charming','make':'Mahindra','model':'scorpioN','year':2025, 'engine':'6.2L V-8 gasoline direct injection','cylinders':4.0, 'fuel':'petrol','mileage':16.0, 'transmission':'automatic', 'trim':'','body':'XUV','doors':'4', 'exterior_color':'black', 'interior_color':'red', 'drivetrain':'4 wheel drive'}])
# new_car

In [322]:
# new_car['trim']=new_car['trim'].fillna('unknown')

In [323]:
# new_pred=model.predict(new_car)[0]
# print(f"Predicted Price: ${new_pred:,.2f}")

In [324]:
# cc=df['price'].mean()
# cc

In [325]:
# dd=df['price'].min()
# dd

In [326]:
# model.save_model('catboost_model.cbm')



## CATBOOST MODEL SAVED AND NOW WE WILL USE MLFLOW TO TRACK THE FLOW OF OUR MACHINE LEARNING MODEL.

In [327]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.set_experiment("Vehicle Price Prediction")


In [328]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import r2_score,mean_squared_error
# from sklearn.model_selection import train_test_split
# import numpy as np


In [329]:
# X_encoded=pd.get_dummies(X,columns=cat_features,drop_first=True)
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X_encoded, Y, test_size=0.5, random_state=42
# )


In [330]:
# rf_model=RandomForestRegressor(
#     n_estimators=100,
#     max_depth=None,
#     random_state=42,
#     n_jobs=1,
# )
#
# rf_model.fit(X_train, Y_train)

In [331]:
# y_pred = rf_model.predict(X_test)
#
# rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
# r2 = r2_score(Y_test, y_pred)
#
# print(f"Random Forest RMSE: {rmse:,.2f}")
# print(f"R² Score: {r2:.4f}")
