In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

In [17]:
# Generate random dataset for charging sessions
def generate_random_data(num_samples=1000, seed=42):
    """Build a synthetic table of charging sessions.

    Parameters
    ----------
    num_samples : int, default 1000
        Number of rows (sessions) to generate.
    seed : int or None, default 42
        Value handed to ``np.random.seed`` before sampling. The default
        reproduces the same table on every call; ``None`` lets NumPy pick
        fresh entropy.

    Returns
    -------
    pandas.DataFrame
        Columns ``energy_consumed`` (kWh, uniform on [1, 10)), ``duration``
        (hours, uniform on [0.5, 5)) and ``cost``.
    """
    # This deliberately seeds NumPy's *global* RNG rather than a local
    # generator: anything executed afterwards that draws from the global
    # state starts from the same point, which keeps a full notebook re-run
    # repeatable.
    np.random.seed(seed)

    energy_consumed = np.random.uniform(1, 10, num_samples)  # kWh
    duration = np.random.uniform(0.5, 5, num_samples)  # hours

    # Per-session random rates; drawn in this order so the default seed yields
    # the same numbers as before.
    energy_rate = np.random.uniform(0.5, 1.5, num_samples)
    time_rate = np.random.uniform(0.1, 0.3, num_samples)
    cost = energy_consumed * energy_rate + duration * time_rate

    return pd.DataFrame({
        'energy_consumed': energy_consumed,
        'duration': duration,
        'cost': cost
    })

In [18]:
# Class to store and save the model details along with evaluation metrics
class ModelObject:
    """Container bundling a fitted model with its search grid, chosen
    hyperparameters, evaluation metrics and version number."""

    def __init__(self, model_name, model, params, best_params, evaluation_metrics, version):
        """Store every argument unchanged on the instance under the same name."""
        self.model_name, self.model = model_name, model
        self.params, self.best_params = params, best_params
        self.evaluation_metrics, self.version = evaluation_metrics, version

    def save(self, save_path):
        """Dump this entire object (not just the estimator) to ``save_path``
        with joblib, then print where it was written."""
        joblib.dump(self, filename=save_path)
        print(f"Model saved at: {save_path}")

In [19]:
# Base Class for Dataset Handling
class Dataset:
    """Hold a charging-session table and the train/test partitions derived from it.

    ``X_train``, ``X_test``, ``y_train`` and ``y_test`` are ``None`` until
    :meth:`preprocess` has been called.
    """

    # Column layout expected by preprocess().
    FEATURE_COLUMNS = ['energy_consumed', 'duration']
    TARGET_COLUMN = 'cost'

    def __init__(self, data):
        """Keep a reference to ``data`` (a DataFrame); nothing is computed yet."""
        self.data = data
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def preprocess(self, test_size=0.2, random_state=42):
        """Drop incomplete rows and split into train/test partitions.

        Parameters
        ----------
        test_size : float, default 0.2
            Forwarded to ``train_test_split``.
        random_state : int, default 42
            Forwarded to ``train_test_split``.

        Raises
        ------
        KeyError
            If any of ``energy_consumed``, ``duration`` or ``cost`` is missing.
        """
        required = [*self.FEATURE_COLUMNS, self.TARGET_COLUMN]
        missing = [col for col in required if col not in self.data.columns]
        if missing:
            raise KeyError(f"Dataset is missing required column(s): {missing}")

        # Rebind to a cleaned copy instead of dropna(inplace=True): the frame
        # the caller passed in is left untouched, and calling preprocess()
        # again gives the same result.
        self.data = self.data.dropna()
        X = self.data[self.FEATURE_COLUMNS]
        y = self.data[self.TARGET_COLUMN]

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

In [20]:
# Base Class for Model Selection and Tuning
class ModelSelector:
    """Tune a fixed set of regressors, keep the one with the lowest test MAE,
    and save it under a versioned file name.

    Attributes are plain instance fields: ``models`` maps a display name to an
    unfitted estimator, ``best_model_object`` holds the winner of the last
    ``select_model`` call (or ``None``), and ``version`` is the number used for
    the next save.
    """

    def __init__(self):
        self.models = {
            'RandomForest': RandomForestRegressor(),
            'SVM': SVR(),
            'LinearRegression': LinearRegression()
        }
        self.best_model_object = None
        self.version = 1  # Versioning starts at 1

    def hyperparameter_tuning(self, model, param_grid, X_train, y_train):
        """Run a 5-fold grid search and return ``(best_estimator, best_params)``."""
        # NOTE(review): no `scoring` is passed, so candidates are ranked by the
        # estimator's own default score (R^2 for scikit-learn regressors) while
        # select_model() compares models by MAE. Left as is to keep results
        # unchanged; consider scoring='neg_mean_absolute_error'.
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_, grid_search.best_params_

    def select_model(self, X_train, y_train, X_test, y_test):
        """Tune every model in ``self.models`` and keep the lowest-MAE one.

        Returns
        -------
        The object stored in ``self.best_model_object`` (a ``ModelObject``).

        Raises
        ------
        ValueError
            If no model could be selected (for example ``self.models`` is empty).
        """
        # Define parameter grids for each model
        param_grids = {
            'RandomForest': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 7]},
            'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
            'LinearRegression': {}  # No hyperparameters to tune for Linear Regression
        }

        # Start clean so a winner from a previous call can never be returned
        # (or later saved) as if it belonged to this run.
        self.best_model_object = None
        best_mae = float('inf')

        for model_name, model in self.models.items():
            print(f"Tuning {model_name}...")
            # Models added to self.models without a grid are fitted with an
            # empty grid (defaults only) instead of raising KeyError.
            param_grid = param_grids.get(model_name, {})
            tuned_model, best_params = self.hyperparameter_tuning(model, param_grid, X_train, y_train)

            # Evaluate on test data
            # NOTE(review): the winner is chosen on the same test split whose
            # metrics are reported, so those metrics are optimistic.
            y_pred = tuned_model.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            evaluation_metrics = {
                "MAE": mae,
                "MSE": mse
            }

            print(f"{model_name} Test MAE: {mae}, MSE: {mse}")

            # Save model object only if it is the best one
            if mae < best_mae:
                best_mae = mae
                self.best_model_object = ModelObject(
                    model_name=model_name,
                    model=tuned_model,
                    params=param_grid,
                    best_params=best_params,
                    evaluation_metrics=evaluation_metrics,
                    version=self.version
                )

        if self.best_model_object is None:
            raise ValueError("No model could be selected: no candidate produced a comparable MAE")

        print(f"Best Model: {self.best_model_object.model_name}")
        return self.best_model_object

    def save_best_model(self):
        """Save the current best model as ``<name>_v<version>.pkl`` and bump the version.

        Does nothing when no best model has been selected yet.
        """
        if not self.best_model_object:
            return

        # Stamp the object with the version being written; without this a
        # second save would produce "<name>_v2.pkl" containing version == 1.
        self.best_model_object.version = self.version

        # Create the model's versioned file name
        save_path = f"{self.best_model_object.model_name}_v{self.version}.pkl"
        self.best_model_object.save(save_path)
        self.version += 1  # Increment the version for the next save

In [21]:
# Main AutoML Pipeline
class AutoMLPipeline:
    """Glue object: wraps a dataset and a model selector and runs them in order."""

    def __init__(self, data=None):
        """Set up the pipeline.

        Parameters
        ----------
        data : pandas.DataFrame, optional
            Table to train on. When omitted, a synthetic dataset from
            ``generate_random_data()`` is used.
        """
        if data is None:
            # Generate random dataset
            data = generate_random_data()
        self.dataset = Dataset(data)
        self.model_selector = ModelSelector()

    def run(self):
        """Preprocess, select the best model, save it, and return it.

        Returns
        -------
        Whatever ``self.model_selector.select_model`` returned (the best
        model object), so callers can inspect it without reloading the file.
        """
        # Preprocess data
        print("Preprocessing Data...")
        self.dataset.preprocess()

        # Model Selection and Evaluation
        print("Selecting the best model...")
        best_model = self.model_selector.select_model(
            self.dataset.X_train, self.dataset.y_train,
            self.dataset.X_test, self.dataset.y_test
        )

        # Save the best model with versioning
        self.model_selector.save_best_model()

        return best_model


# Run the AutoML pipeline end to end: build it, then preprocess, select and
# save the best model. The guard keeps this from executing when the code is
# imported as a module; `pipeline` stays bound at module level afterwards.
if __name__ == "__main__":
    pipeline = AutoMLPipeline()
    pipeline.run()

Preprocessing Data...
Selecting the best model...
Tuning RandomForest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
RandomForest Test MAE: 1.3529706053087514, MSE: 3.001809942235999
Tuning SVM...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
SVM Test MAE: 1.2891903955604826, MSE: 2.789535122599327
Tuning LinearRegression...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
LinearRegression Test MAE: 1.2865600311678138, MSE: 2.7812864585614925
Best Model: LinearRegression
Model saved at: LinearRegression_v1.pkl
