In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the experiment results CSV, relative to the working directory.
path = "Synthetic_data/experiment_results.csv"
# Read the whole results table into a dataframe.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Inspect the column labels of the loaded results table.
df.columns

In [None]:
# Keep only the columns whose name starts with "Iteration", coerce their values
# to numeric (entries that cannot be parsed become NaN), and treat exact zeros
# as missing values.
iteration_columns = (
    df.filter(regex='^Iteration')
    .apply(pd.to_numeric, errors='coerce')
    .replace(0, np.nan)
)

In [None]:
# State of health (SoH): every iteration value divided by the value in the
# first iteration column of the same row.
soh = iteration_columns.div(iteration_columns.iloc[:, 0], axis=0)

# SoH levels for which the first crossing is recorded.
thresholds = [0.98, 0.95, 0.9, 0.85, 0.8]


def _first_position_at_or_below(row, limit):
    """Return the zero-based position of the first value in `row` that is <= `limit`, or NaN if there is none."""
    for position, value in enumerate(row):
        # NaN never satisfies `<=`, so missing entries are skipped.
        if value <= limit:
            return position
    return np.nan


# One column per threshold, named after the threshold formatted as a string.
soh_thresholds = pd.DataFrame()
for threshold in thresholds:
    soh_thresholds[f'{threshold}'] = soh.apply(
        _first_position_at_or_below, axis=1, args=(threshold,)
    )

# Number of rows that never reach each threshold.
soh_thresholds.isna().sum()

In [None]:
# Feature columns used as model inputs. This list is the single source of truth
# for the feature set; it is reused below instead of being repeated.
columns_of_interest = ['charge_c_rate_modulation', 'protocol_choice_prob', 'charge_soc_modulation', 'rest_duration_modulation', 'discharge_factor_modulation', 'discharge_soc_modulation']

# Copy the feature columns and append one target column per SoH threshold,
# each named after the threshold formatted as a string.
df_with_thresholds = df[columns_of_interest].copy()
for threshold in thresholds:
    df_with_thresholds[f'{threshold}'] = soh_thresholds[f'{threshold}']

# One dataframe per threshold holding the features plus that threshold's target,
# with every row that contains a NaN in any of those columns removed.
dfs_by_threshold = {
    f'{threshold}': df_with_thresholds[columns_of_interest + [f'{threshold}']].dropna()
    for threshold in thresholds
}

# Example: Access the dataframe for SoH 0.8
dfs_by_threshold['0.8']

In [None]:
# Show the keys under which the per-threshold dataframes are stored.
dfs_by_threshold.keys()

In [None]:
!pip install xgboost shap

In [None]:
import xgboost as xgb
import shap
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Result containers, each keyed by the threshold formatted as a string.
most_important_features, metrics, shap_values_dict = {}, {}, {}

# Fit one default XGBoost regressor per SoH threshold and explain it with SHAP.
for threshold in thresholds:
    key = f'{threshold}'

    # Features and target for this threshold; the target column is named after the threshold.
    df_threshold = dfs_by_threshold[key]
    X, y = df_threshold[columns_of_interest], df_threshold[key]

    # Default hyperparameters throughout.
    model = xgb.XGBRegressor()
    model.fit(X, y)

    # SHAP explanation of the fitted model on the same rows it was trained on.
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    shap_values_dict[key] = shap_values

    # The most important feature is the one with the largest mean absolute SHAP value.
    shap_importance = np.abs(shap_values.values).mean(axis=0)
    most_important_feature = columns_of_interest[np.argmax(shap_importance)]
    most_important_features[key] = most_important_feature

    # Scores are in-sample: predictions are made on the same X used for fitting.
    y_pred = model.predict(X)
    mae, mse, r2 = (
        score(y, y_pred)
        for score in (mean_absolute_error, mean_squared_error, r2_score)
    )
    metrics[key] = {'MAE': mae, 'MSE': mse, 'R²': r2}

    # Report the result for this threshold.
    print(f"The most important feature for SoH {threshold} is {most_important_feature}")
    print(f"Metrics for SoH {threshold}: MAE = {mae}, MSE = {mse}, R² = {r2}")

    # Summary plot over all features, then a dependence plot for the top feature.
    shap.summary_plot(shap_values, X, feature_names=columns_of_interest)
    shap.dependence_plot(most_important_feature, shap_values.values, X, feature_names=columns_of_interest)

In [None]:
# Load SHAP's JavaScript support so the interactive force plot can render in the notebook.
shap.initjs()
# NOTE(review): `explainer`, `shap_values` and `X` are not defined in this cell; they are
# whatever the last iteration of the training loop in the previous cell left behind, so this
# plot presumably covers only the final entry of `thresholds` — confirm that is intended.
shap.force_plot(explainer.expected_value, shap_values.values, X, feature_names=columns_of_interest)

In [None]:
!pip install mlflow


In [None]:
import mlflow
import mlflow.xgboost
import shap
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings

# Suppress XGBoost model format warning
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')

# Select (or create) the MLflow experiment that receives this run.
mlflow.set_experiment("New SoH Prediction Experiment")

# Everything inside this block is recorded against a single MLflow run.
with mlflow.start_run():
    # Record where the data came from and which SoH thresholds are modelled.
    mlflow.log_param("data_path", path)
    mlflow.log_param("thresholds", thresholds)

    # Per-threshold results, keyed by the threshold formatted as a string.
    most_important_features = {}
    metrics = {}
    shap_values_dict = {}

    # Train, explain, evaluate and log one XGBoost model per threshold.
    for threshold in thresholds:
        # Features and target for this threshold; the target column is named after the threshold.
        df_threshold = dfs_by_threshold[f'{threshold}']
        X = df_threshold[columns_of_interest]
        y = df_threshold[f'{threshold}']

        # Default hyperparameters throughout.
        model = xgb.XGBRegressor()
        model.fit(X, y)

        # SHAP explanation of the fitted model on the training rows.
        explainer = shap.Explainer(model)
        shap_values = explainer(X)
        shap_values_dict[f'{threshold}'] = shap_values

        # The most important feature is the one with the largest mean absolute SHAP value.
        shap_importance = np.abs(shap_values.values).mean(axis=0)
        most_important_feature = columns_of_interest[np.argmax(shap_importance)]
        most_important_features[f'{threshold}'] = most_important_feature

        # Scores are in-sample: predictions are made on the same X used for fitting.
        y_pred = model.predict(X)
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        metrics[f'{threshold}'] = {'MAE': mae, 'MSE': mse, 'R²': r2}

        # Log every computed metric, not only R2.
        mlflow.log_metric(f"MAE_{threshold}", mae)
        mlflow.log_metric(f"MSE_{threshold}", mse)
        mlflow.log_metric(f"R2_{threshold}", r2)

        # Log the model exactly once, together with a small input example.
        input_example = X.iloc[:5]
        mlflow.xgboost.log_model(model, f"model_{threshold}", input_example=input_example)

        # Print the most important feature and metrics for each threshold
        print(f"The most important feature for SoH {threshold} is {most_important_feature}")
        print(f"Metrics for SoH {threshold}: MAE = {mae}, MSE = {mse}, R² = {r2}")

        # Plot the SHAP values for the current threshold
        shap.summary_plot(shap_values, X, feature_names=columns_of_interest)
        shap.dependence_plot(most_important_feature, shap_values.values, X, feature_names=columns_of_interest)

    # Log the most important feature per threshold as a JSON artifact (once).
    mlflow.log_dict(most_important_features, "most_important_features.json")

    # Pickle each SHAP explanation into the working directory and attach it to the run (once).
    for threshold, shap_values in shap_values_dict.items():
        shap_values_file = f"shap_values_{threshold}.pkl"
        with open(shap_values_file, "wb") as f:
            pickle.dump(shap_values, f)
        mlflow.log_artifact(shap_values_file)

    # Log the experimental setup
    mlflow.log_param("experimental_setup", {
        "data_path": path,
        "columns_of_interest": columns_of_interest,
        "thresholds": thresholds
    })

    # Log the preprocessing steps
    mlflow.log_param("preprocessing_steps", {
        "iteration_columns_conversion": "Converted to numeric and replaced zeros with NaN",
        "soh_calculation": "Calculated SoH and created thresholds",
        "dataframe_creation": "Created dataframes for each threshold and removed NaN values"
    })

    # Log the model training and evaluation details
    mlflow.log_param("model_training", {
        "model_type": "XGBoost",
        "evaluation_metrics": ["MAE", "MSE", "R²"]
    })

    # Attach the raw experiment results file to the run as an artifact.
    mlflow.log_artifact(path)


In [None]:
# Explicitly end any MLflow run that is still active in this kernel.
# NOTE(review): the training cell above opens its run with `with mlflow.start_run():`, which
# should already close the run on exit, so this is presumably only a safety net — confirm.
mlflow.end_run()

In [None]:
!pip install -U "ray[data,train,tune,serve]"

In [None]:
import mlflow
import mlflow.xgboost
import shap
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler

# Silence UserWarning messages raised from the xgboost module.
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')

# Runs started below are recorded under this MLflow experiment.
mlflow.set_experiment("New SoH Prediction Experiment")

# Hyperparameter distributions that Ray Tune samples from for each trial.
search_space = dict(
    learning_rate=tune.loguniform(0.01, 0.1),
    max_depth=tune.randint(3, 10),
    min_child_weight=tune.randint(1, 6),
    subsample=tune.uniform(0.5, 1.0),
    colsample_bytree=tune.uniform(0.5, 1.0),
)

# Define the training function
def train_model(config, threshold, X, y):
    """Ray Tune trainable: fit one XGBoost regressor and report its validation MAE.

    Args:
        config: Mapping with the sampled hyperparameters ``learning_rate``,
            ``max_depth``, ``min_child_weight``, ``subsample`` and
            ``colsample_bytree``.
        threshold: SoH threshold the data belongs to. Not used by the function
            itself; kept so existing callers can keep passing it.
        X: Feature table.
        y: Target values aligned with ``X``.

    Reports:
        ``{"mae": <float>}`` through ``train.report`` (the module-level
        ``ray.train`` import). The MAE is measured on a held-out 20% of the
        rows so that tuning does not simply reward the configuration that
        memorises the training data best. With fewer than 5 rows a split is
        not meaningful and the MAE falls back to being measured in-sample.
    """
    # Imported locally so the function stays self-contained when Ray ships it to a worker.
    import xgboost as xgb
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import train_test_split

    model = xgb.XGBRegressor(
        learning_rate=config["learning_rate"],
        max_depth=config["max_depth"],
        min_child_weight=config["min_child_weight"],
        subsample=config["subsample"],
        colsample_bytree=config["colsample_bytree"]
    )

    if len(X) >= 5:
        # Fixed seed: every trial is scored on the same validation rows, so MAEs are comparable.
        X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    else:
        # Too few rows to hold any out; score on the training rows instead.
        X_fit, X_val, y_fit, y_val = X, X, y, y

    model.fit(X_fit, y_fit)
    mae = mean_absolute_error(y_val, model.predict(X_val))
    train.report({"mae": mae})

# Define or load necessary variables before running the script
# For example:
# path = "path_to_your_data.csv"
# thresholds = [0.8, 0.85, 0.9]
# columns_of_interest = ["feature1", "feature2", "feature3"]
# dfs_by_threshold = {"0.8": df1, "0.85": df2, "0.9": df3}

# `plt` is needed below to save and close the SHAP figures. It was never imported
# anywhere in the notebook, so import it here to keep this cell runnable on its own.
import matplotlib.pyplot as plt

# Everything inside this block is recorded against a single MLflow run.
with mlflow.start_run():
    # Record where the data came from and which SoH thresholds are modelled.
    mlflow.log_param("data_path", path)
    mlflow.log_param("thresholds", thresholds)

    # Per-threshold results, keyed by the threshold formatted as a string.
    most_important_features = {}
    metrics = {}
    shap_values_dict = {}

    # Tune, train, explain, evaluate and log one XGBoost model per threshold.
    for threshold in thresholds:
        # Features and target for this threshold; the target column is named after the threshold.
        df_threshold = dfs_by_threshold[str(threshold)]
        X = df_threshold[columns_of_interest]
        y = df_threshold[str(threshold)]

        # A fresh scheduler for every tuning session, minimising the reported "mae".
        scheduler = ASHAScheduler(
            metric="mae",
            mode="min",
            max_t=10,
            grace_period=1,
            reduction_factor=2
        )

        # Sample a subset of the data to reduce memory usage
        X_sample = X.sample(frac=0.5, random_state=42)
        y_sample = y.loc[X_sample.index]

        # Run 10 trials, one at a time, each calling train_model with a sampled config.
        # For newer versions of Ray, limit concurrency using ConcurrencyLimiter if needed
        analysis = tune.run(
            tune.with_parameters(train_model, threshold=threshold, X=X_sample, y=y_sample),
            config=search_space,
            num_samples=10,
            scheduler=scheduler,
            max_concurrent_trials=1
        )

        # Hyperparameters of the trial with the lowest reported MAE.
        best_config = analysis.get_best_config(metric="mae", mode="min")

        # Refit on the full data for this threshold using the best hyperparameters.
        model = xgb.XGBRegressor(
            learning_rate=best_config["learning_rate"],
            max_depth=best_config["max_depth"],
            min_child_weight=best_config["min_child_weight"],
            subsample=best_config["subsample"],
            colsample_bytree=best_config["colsample_bytree"]
        )
        model.fit(X, y)

        # SHAP explanation of the fitted model on the training rows.
        explainer = shap.Explainer(model)
        shap_values = explainer(X)
        shap_values_dict[str(threshold)] = shap_values

        # The most important feature is the one with the largest mean absolute SHAP value.
        shap_importance = np.abs(shap_values.values).mean(axis=0)
        most_important_feature = columns_of_interest[np.argmax(shap_importance)]
        most_important_features[str(threshold)] = most_important_feature

        # Scores are in-sample: predictions are made on the same X used for fitting.
        y_pred = model.predict(X)
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        metrics[str(threshold)] = {'MAE': mae, 'MSE': mse, 'R²': r2}

        # Log every computed metric, not only R2.
        mlflow.log_metric(f"MAE_{threshold}", mae)
        mlflow.log_metric(f"MSE_{threshold}", mse)
        mlflow.log_metric(f"R2_{threshold}", r2)

        # Log the model with input example
        input_example = X.iloc[:5]
        mlflow.xgboost.log_model(model, f"model_{threshold}", input_example=input_example)

        # Print the most important feature and metrics for each threshold
        print(f"The most important feature for SoH {threshold} is {most_important_feature}")
        print(f"Metrics for SoH {threshold}: MAE = {mae}, MSE = {mse}, R² = {r2}")

        # Save the SHAP plots to PNG files and attach them to the run instead of displaying them.
        shap.summary_plot(shap_values, X, feature_names=columns_of_interest, show=False)
        plt.savefig(f"shap_summary_{threshold}.png")
        mlflow.log_artifact(f"shap_summary_{threshold}.png")
        plt.close()

        shap.dependence_plot(most_important_feature, shap_values.values, X, feature_names=columns_of_interest, show=False)
        plt.savefig(f"shap_dependence_{threshold}.png")
        mlflow.log_artifact(f"shap_dependence_{threshold}.png")
        plt.close()

    # Log the most important feature per threshold as a JSON artifact.
    mlflow.log_dict(most_important_features, "most_important_features.json")

    # Pickle each SHAP explanation into the working directory and attach it to the run.
    for threshold, shap_values in shap_values_dict.items():
        shap_values_file = f"shap_values_{threshold}.pkl"
        with open(shap_values_file, "wb") as f:
            pickle.dump(shap_values, f)
        mlflow.log_artifact(shap_values_file)

    # Log the experimental setup
    mlflow.log_param("experimental_setup", {
        "data_path": path,
        "columns_of_interest": columns_of_interest,
        "thresholds": thresholds
    })

    # Log the preprocessing steps
    mlflow.log_param("preprocessing_steps", {
        "iteration_columns_conversion": "Converted to numeric and replaced zeros with NaN",
        "soh_calculation": "Calculated SoH and created thresholds",
        "dataframe_creation": "Created dataframes for each threshold and removed NaN values"
    })

    # Log the model training and evaluation details
    mlflow.log_param("model_training", {
        "model_type": "XGBoost",
        "evaluation_metrics": ["MAE", "MSE", "R²"]
    })

    # Attach the raw experiment results file to the run as an artifact.
    mlflow.log_artifact(path)
