In [1]:
import pandas as pd
car_sales = pd.read_csv("Car-sales.csv")
car_sales.head(15)

Unnamed: 0,Make,Color,Odometer(Miles),Doors,Price
0,Toyota,Black,15963.0,5.0,"$22,803.00"
1,Honda,White,65265.0,4.0,"$6,768.00"
2,Toyota,White,58803.0,4.0,"$8,662.00"
3,Honda,Red,79033.0,4.0,"$5,678.00"
4,Nissan,White,226163.0,4.0,"$3,929.00"
5,Toyota,White,60152.0,4.0,"$6,513.00"
6,BMW,Black,,5.0,"$21,287.00"
7,Toyota,Blue,59652.0,3.0,"$7,548.00"
8,Nissan,White,218857.0,4.0,"$4,470.00"
9,Toyota,Blue,53355.0,4.0,"$5,386.00"


In [2]:
car_sales.isna().sum()

Make               249
Color              244
Odometer(Miles)    252
Doors              244
Price              236
dtype: int64

In [3]:
car_sales.dtypes, len(car_sales)

(Make                object
 Color               object
 Odometer(Miles)    float64
 Doors              float64
 Price               object
 dtype: object,
 5000)

In [4]:
car_sales.dropna(subset=["Price"], inplace=True)
len(car_sales)

4764

In [5]:
car_sales["Price"] = car_sales["Price"].str.replace(r'[$,.]', '', regex=True).str[:-2]. astype(int)
car_sales.head()

Unnamed: 0,Make,Color,Odometer(Miles),Doors,Price
0,Toyota,Black,15963.0,5.0,22803
1,Honda,White,65265.0,4.0,6768
2,Toyota,White,58803.0,4.0,8662
3,Honda,Red,79033.0,4.0,5678
4,Nissan,White,226163.0,4.0,3929


In [6]:
import numpy as np
np.random.seed(42)
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

## Define different features and transformer pipeline
categorical_features = ["Make", "Color"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
     ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])
num_feature = ["Odometer(Miles)"]
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

## setup preprocessing steps(fill missing vslues and conver to numberic)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", num_transformer, num_feature)
        
])

## creating a preprocessing and modelling pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor())
])

## split data
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# fit and evaluate model
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.843186418601231

In [7]:
## Using GridSearchCv with regression Pipeline 
from sklearn.model_selection import GridSearchCV
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [10, 100, 1000],
    "model__max_depth": [None, 5, 10, 15],
    "model__max_features": ["sqrt"],
    "model__min_samples_split":[1, 2, 4]
}
gs_model = GridSearchCV(model, pipe_grid, cv=5,verbose=2)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=1, model__n_estimators=10, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=1, model__n_estimators=10, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=1, model__n_estimators=10, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=1, model__n_estimators=10, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=1, model__n_estimators=10, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_s

120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sumanyadav/Desktop/Projects/ML_PROJECTS/Car-sale-prediction-pipeline/env/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sumanyadav/Desktop/Projects/ML_PROJECTS/Car-sale-prediction-pipeline/env/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/sumanyadav/Desktop/Projects/ML_PROJECTS/Car-sale-prediction-pipeline/env/li

In [8]:
gs_model.score(x_test, y_test)

0.8712603641441058

In [9]:
gs_model.best_params_

{'model__max_depth': 10,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 4,
 'model__n_estimators': 100,
 'preprocessor__num__imputer__strategy': 'mean'}

In [10]:
import pickle
pickle.dump(model, open("car-sales-prediction-pipeline.pkl", "wb"))

In [11]:
print("Numpy: ", np.__version__)
print("Pandas: ", pd.__version__)
print("sklearn: ", sklearn.__version__)

Numpy:  2.2.5
Pandas:  2.2.3
sklearn:  1.6.1
