# All steps for fitting best model (Final Working .pkl files)

After the exploratory data analysis (EDA) and several rounds of modeling and feature engineering, this notebook implements the best-performing model and prepares the feature-engineering steps for reuse via joblib. Every scaler, encoder, and transformer applied to the data is serialized alongside the trained model, so that the API can load them and apply exactly the same preprocessing at prediction time.

In [1]:
import os
from pathlib import Path

# Start from the directory the notebook is being executed in.
current_dir = Path().resolve()

# Check the starting directory first and then each ancestor in turn; the
# first one that contains a `.git` entry is taken as the repository root.
for repo_root in (current_dir, *current_dir.parents):
    if (repo_root / '.git').exists():
        break
else:
    # Walked all the way up to the filesystem root without finding `.git`.
    raise FileNotFoundError("Repository root with .git directory not found")

# Run from the repository root so repo-relative paths resolve the same way
# regardless of where the notebook was launched.
os.chdir(repo_root)

# Locations of the training dataset and of the serialized artifacts
data_path = repo_root / 'data/bmw_pricing_challenge.csv'
source_dir = repo_root / 'source'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import joblib

# Read the raw listings from the CSV file located by `data_path`.
training_data = pd.read_csv(data_path)

# Each date column maps to the (year, month) feature columns derived from it.
date_parts = {
    'registration_date': ('registration_year', 'registration_month'),
    'sold_at': ('sold_year', 'sold_month'),
}

# First parse both date columns to datetime ...
for date_col in date_parts:
    training_data[date_col] = pd.to_datetime(training_data[date_col])

# ... then split each of them into separate year and month features.
for date_col, (year_col, month_col) in date_parts.items():
    parsed = training_data[date_col].dt
    training_data[year_col] = parsed.year
    training_data[month_col] = parsed.month

# Prefix table used by `categorize_model`: one (category, prefixes) pair per
# tier, checked in this order. Kept at module level so the tuples are built
# once instead of on every call.
_MODEL_CATEGORY_PREFIXES = (
    ('entry level', (
        '114', '116', '118', '120', '123', '125', '135', '216', '218', '220',
        '225', 'X1', 'X2', 'i3', 'Z4',
    )),
    ('middle level', (
        '316', '318', '320', '325', '328', '330', '335', '418', '420', '425',
        '430', '435', '518', '520', '523', '525', '528', '530', '535', 'X3',
        'X4', 'i4', 'i5',
    )),
    ('high end', (
        '630', '635', '640', '650', '730', '735', '740', '750', '8', 'X5',
        'X6', 'X7', 'M135', 'M235', 'M3', 'M4', 'M5', 'M550', 'i7', 'i8',
    )),
)

# Category returned when no prefix matches (or the key is missing).
_DEFAULT_MODEL_CATEGORY = 'middle level'


def categorize_model(model):
    """Map a model key to a coarse price tier.

    The key is matched by prefix (case-sensitive) against the known
    entry-level, middle-level and high-end model designations.

    Args:
        model: The model key, e.g. ``'320i'`` or ``'X5 M'``.

    Returns:
        ``'entry level'``, ``'middle level'`` or ``'high end'``. Keys that
        match no known prefix fall back to ``'middle level'``; so do
        non-string values (``None``, NaN from a missing cell), which would
        otherwise raise ``AttributeError`` on ``startswith``.
    """
    if not isinstance(model, str):
        return _DEFAULT_MODEL_CATEGORY

    for category, prefixes in _MODEL_CATEGORY_PREFIXES:
        # str.startswith accepts a tuple and tests all prefixes in one call.
        if model.startswith(prefixes):
            return category

    return _DEFAULT_MODEL_CATEGORY

# Derive the coarse price tier from the model key
training_data['model_category'] = training_data['model_key'].apply(categorize_model)

# Feature columns fed to the encoder and to the scaler respectively.
# The inference code must use the same lists in the same order.
categorical_features = ['maker_key', 'model_key', 'fuel', 'paint_color', 'car_type', 'model_category']
numerical_features = ['mileage', 'engine_power', 'registration_year', 'registration_month', 'sold_year', 'sold_month']

# One-hot encode the categorical features; categories unseen during fitting
# are encoded as all zeros at prediction time instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(training_data[categorical_features])

# Standardize the numerical features
scaler = StandardScaler()
scaler.fit(training_data[numerical_features])

# Degree-2 polynomial/interaction terms, fitted on the *scaled* numerical
# features because that is what it will be applied to.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(scaler.transform(training_data[numerical_features]))

# Save the fitted transformers for later use. The path is relative to the
# current working directory; create the folder first so that dumping does not
# fail on a fresh checkout.
source_dir = Path('source')
source_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(ohe, source_dir / 'ohe.pkl')
joblib.dump(scaler, source_dir / 'scaler.pkl')
joblib.dump(poly, source_dir / 'poly.pkl')

# Build the design matrix: one-hot columns followed by the polynomial terms
X_categorical = ohe.transform(training_data[categorical_features])
X_numerical = scaler.transform(training_data[numerical_features])
X_poly = poly.transform(X_numerical)
X = np.hstack((X_categorical, X_poly))
y = training_data['price']  # regression target

# Train the model; the fixed seed makes the serialized model reproducible
# across notebook runs.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Save the model
joblib.dump(model, source_dir / 'model.pkl')


  from pandas.core import (


['source\\model.pkl']

In [3]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

# Deserialize the fitted preprocessing steps and the trained regressor
source_dir = Path('source')
ohe, scaler, poly, model = (
    joblib.load(source_dir / artifact)
    for artifact in ('ohe.pkl', 'scaler.pkl', 'poly.pkl', 'model.pkl')
)

# A single example car to price
example_car = {
    'maker_key': 'BMW',
    'model_key': '320i',
    'mileage': 20000,
    'engine_power': 150,
    'registration_date': '2017-05-05',
    'fuel': 'petrol',
    'paint_color': 'black',
    'car_type': 'sedan',
    'sold_at': '2018-05-05'
}
input_data = pd.DataFrame([example_car])

# Each date column maps to the (year, month) feature columns derived from it.
date_parts = {
    'registration_date': ('registration_year', 'registration_month'),
    'sold_at': ('sold_year', 'sold_month'),
}

# Parse both date columns to datetime, then split them into year and month
for date_col in date_parts:
    input_data[date_col] = pd.to_datetime(input_data[date_col])
for date_col, (year_col, month_col) in date_parts.items():
    parsed = input_data[date_col].dt
    input_data[year_col] = parsed.year
    input_data[month_col] = parsed.month

# Derive the coarse price tier from the model key
input_data['model_category'] = input_data['model_key'].apply(categorize_model)

# Same feature lists, in the same order, as were used to fit the transformers
categorical_features = ['maker_key', 'model_key', 'fuel', 'paint_color', 'car_type', 'model_category']
numerical_features = ['mileage', 'engine_power', 'registration_year', 'registration_month', 'sold_year', 'sold_month']

# Encode the categoricals; scale the numericals and expand them into
# polynomial terms
X_categorical = ohe.transform(input_data[categorical_features])
X_numerical = scaler.transform(input_data[numerical_features])
X_poly = poly.transform(X_numerical)

# Column-wise concatenation: one-hot columns first, polynomial terms after
X = np.concatenate((X_categorical, X_poly), axis=1)

# Predict and show the result
predictions = model.predict(X)
print(predictions)


[23716.]
