In [1]:
#Importing libraries
import pandas as pd

Step 1 : Loading the dataset

In [2]:
#Reading the dataset (car sales with missing values) directly from GitHub
car_sales_data = pd.read_csv(
    "https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/"
    "car-sales-extended-missing-data.csv"
)

#Previewing the first 5 rows of the dataset
car_sales_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
#Summarising the dataset: per-column non-null counts and dtypes (shows where values are missing)
car_sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [4]:
#info() reports 950 non-null Price values out of 1000 rows, i.e. 50 rows have no label.
#Rows without a target value cannot be used for training or scoring, so they are dropped.
car_sales_data = car_sales_data.dropna(subset=["Price"])

In [5]:
#Checking the summary again: Price should now have no missing values
car_sales_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 950 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           903 non-null    object 
 1   Colour         904 non-null    object 
 2   Odometer (KM)  902 non-null    float64
 3   Doors          903 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 44.5+ KB


Now, we have 950 entries, and all of them are labeled

It is time to start cleaning and handling the data (filling missing values, converting non-numerical data to numbers)

Step 2: Building a Pipeline

A_Importing classes from scikit-learn

In [7]:
#Importing SimpleImputer for handling missing values
from sklearn.impute import SimpleImputer

#Importing OneHotEncoder for converting categorical data to numerical
from sklearn.preprocessing import OneHotEncoder

#Importing ColumnTransformer for applying transformations to specific columns
from sklearn.compose import ColumnTransformer

#Importing Pipeline for creating a sequence of transformations
from sklearn.pipeline import Pipeline

B_ Creating Preprocessing Pipelines

In [18]:
#Columns that hold categorical (text) values
categorical_features = ["Make", "Colour"]

#Categorical pipeline:
#  1. imputer - replaces missing values with the constant string "missing"
#  2. onehot  - one-hot encodes the categories; handle_unknown="ignore" means categories
#               not seen during fit are ignored at transform time instead of raising an error
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [8]:
#Column that holds the number of doors
door_feature = ["Doors"]

#Doors pipeline: missing values are replaced with the constant 4
#(assumption: 4 is the most common door count)
doors_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
    ]
)

In [None]:
#Creating a numerical transformer pipeline
numerical_features = ["Odometer (KM)"] # List of numerical features

#Numerical pipeline: missing values are replaced with the column's median
#(fixed the class name: it was misspelled and raised a NameError on a fresh kernel)
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), # Filling missing values with the median of the column
])


C_ Putting all individual transformer Pipelines into a single ColumnTransformer instance

In [21]:
#Routing each group of columns to its own transformer pipeline.
#Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("doors", doors_transformer, door_feature),
        ("numerical", numerical_transformer, numerical_features),
    ]
)

Step 3: Trying Regression Models

Comparing our data to the [Scikit-Learn machine learning map](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html), we can see there's a handful of different regression models we can try.

* [RidgeRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) - A linear regression model that adds L2 regularization to shrink the coefficients and reduce overfitting
* [SVR(kernel="linear")](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) - A support vector regression model that fits a straight line relationship with a margin of tolerance around predictions
* [SVR(kernel="rbf")](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) -  A support vector regression model that uses a radial basis function (rbf) kernel to capture complex, non-linear relationships
* [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) - An ensemble model that averages predictions from many decision trees trained on random subsets of data and features

A_ Importing the Models

In [11]:
#Importing RidgeRegression for linear regression with L2 regularization
from sklearn.linear_model import Ridge

#Importing SVR(kernel="linear") for support vector regression with linear kernel
from sklearn.svm import SVR

#Importing the same SVR class under the alias SVR_RBF; the kernel ("rbf") is chosen when the model is instantiated, not at import time
from sklearn.svm import SVR as SVR_RBF

#Importing RandomForestRegressor for ensemble regression using random forests
from sklearn.ensemble import RandomForestRegressor

B_ Creating dictionary of models

In [12]:
#Candidate regressors, keyed by the display name used when reporting scores
regression_models = {
    "Ridge": Ridge(),
    "SVR (linear)": SVR(kernel="linear"),
    "SVR (rbf)": SVR_RBF(kernel="rbf"),  #SVR_RBF is the same SVR class imported under an alias
    #Seeded so the forest (and therefore its score) is repeatable between runs,
    #matching the seed used for the train/test split
    "Random Forest": RandomForestRegressor(random_state=42),
}

#Empty dictionary for results; filled later as {model name: score on the test set}
regression_results = {}

C_ Splitting the data into X (feature variables) and y (target variable)

In [14]:
#Features: every column except the target
car_sales_X = car_sales_data.drop(columns=["Price"])

#Target: the Price column
car_sales_y = car_sales_data["Price"]

D_ Splitting the data into testing and training

In [15]:
#Importing train_test_split for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

#Holding out 20% of the rows for testing; the fixed random_state makes the split repeatable
car_X_train, car_X_test, car_y_train, car_y_test = train_test_split(
    car_sales_X,
    car_sales_y,
    test_size=0.2,
    random_state=42,
)

E_ Getting the results

In [24]:
#Fit every candidate model on the training split and record its score on the test split
for model_name, model in regression_models.items():
    #Preprocessing and model are chained so the raw feature frame can be passed in directly
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    #Training on the training split
    print(f"Fitting {model_name}...")
    pipeline.fit(car_X_train, car_y_train)

    #Evaluating on the held-out test split
    print(f"Scoring {model_name}...")
    score = pipeline.score(car_X_test, car_y_test)

    #Keeping the score under the model's display name
    regression_results[model_name] = score

print("Done!")

Fitting Ridge...
Scoring Ridge...
Fitting SVR (linear)...
Scoring SVR (linear)...
Fitting SVR (rbf)...
Scoring SVR (rbf)...
Fitting Random Forest...
Scoring Random Forest...
Done!


In [25]:
#Printing each model's test score, rounded to two decimal places
for model_name, score in regression_results.items():
    print("{}: {:.2f}".format(model_name, score))

Ridge: 0.25
SVR (linear): -0.49
SVR (rbf): 0.00
Random Forest: 0.22


Note: All models performed poorly on this dataset.

As a beginner, I am choosing to proceed with Ridge regression for simplicity.

In the future, I plan to improve the model with better tuning and feature engineering.


Step 4: Saving the Model

In [26]:
#Importing joblib for saving the model
from joblib import dump


In [27]:
#Final model: the preprocessing step followed by Ridge regression (alpha=1.0)
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge(alpha=1.0)),
])

#Training the final pipeline on the training split
print("Fitting final pipeline...")
final_pipeline.fit(car_X_train, car_y_train)

#Saving the fitted pipeline (preprocessing + model) to a file
dump(final_pipeline, "car_price_predictor.joblib")

Fitting final pipeline...


['car_price_predictor.joblib']