## Random Forest - One Parameter

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import time

# Sweep over dataset sizes: for each size, load the matching CSV, fit a
# Random Forest on an 80/20 split to predict the single target
# 'Max_Proton_Energy_(MeV)', and record the test-set error metrics together
# with the elapsed time of the iteration.
start_numpoints = 5000
end_numpoints = 50000
interval = 1000

results = []

for numpoints in range(start_numpoints, end_numpoints + 1, interval):
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    print(f"Number of Points: {numpoints}")

    data = pd.read_csv(f'datasets/Energy/fuchs_v3_points_{numpoints}_noise_10.csv')

    features = data[['Intensity_(W_cm2)', 'Target_Thickness (um)', 'Focal_Distance_(um)']]
    target = data[['Max_Proton_Energy_(MeV)']]

    # Fixed random_state keeps the split reproducible across runs.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # max_features=1.0 (use all features at every split) is what the former
    # 'auto' setting meant for regressors. 'auto' was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    rf = RandomForestRegressor(
        n_estimators=400,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=1.0,
        random_state=42,
    )

    # The target is a one-column DataFrame; flatten it to the 1-D array that
    # a single-output fit expects.
    rf.fit(features_train, target_train.values.ravel())

    # Predict on the held-out test set
    target_test_pred = rf.predict(features_test)

    # MSE and RMSE on the test set
    mse_error = mean_squared_error(target_test, target_test_pred)
    rmse_error = np.sqrt(mse_error)

    # ARE in percent: mean absolute error divided by the mean of the true
    # test targets, times 100.
    mean_target = target_test['Max_Proton_Energy_(MeV)'].mean()
    are_error_scalar = float(
        mean_absolute_error(target_test, target_test_pred) / mean_target * 100
    )

    # Percentage string with two decimals, used only for console output
    are_error_formatted = f"{are_error_scalar:.2f}%"

    # One row of the results table per dataset size
    result = {
        'Number of Points': numpoints,
        'MSE': mse_error,
        'RMSE': rmse_error,
        'ARE': are_error_scalar,
        'Elapsed Time (seconds)': time.perf_counter() - start_time
    }

    results.append(result)

    print(f'MSE: {mse_error}')
    print(f'RMSE: {rmse_error}')
    print(f'ARE: {are_error_formatted}')

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file.
# NOTE(review): the three-target experiment in this file writes to the same
# 'results.csv' path, so running both overwrites this output - confirm
# whether the two experiments should use distinct file names.
results_df.to_csv('results.csv', index=False)

## Random Forest - Three Parameter

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
import time

# Sweep over dataset sizes: for each size, load the matching CSV, fit a
# multi-output Random Forest (via GridSearchCV) on an 80/20 split to predict
# the three proton-energy targets, and record per-target test-set error
# metrics together with the elapsed time of the iteration.
start_numpoints = 5000
end_numpoints = 50000
interval = 1000

results = []

# Hyperparameter grid to search. It does not depend on the dataset, so it is
# defined once outside the loop. Every list holds a single value, so the
# "search" evaluates exactly one configuration (with 5-fold CV) per dataset.
# max_features=1.0 (use all features at every split) is what the former
# 'auto' setting meant for regressors. 'auto' was deprecated in scikit-learn
# 1.1 and removed in 1.3, where it raises an error.
param_grid = {
    'n_estimators': [400],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': [1.0],
}

for numpoints in range(start_numpoints, end_numpoints + 1, interval):
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    print(f"Number of Points: {numpoints}")

    data = pd.read_csv(f'datasets/Energy/fuchs_v3_points_{numpoints}_noise_10.csv')
    features = data[['Intensity_(W_cm2)', 'Target_Thickness (um)', 'Focal_Distance_(um)']]
    target = data[['Max_Proton_Energy_(MeV)', 'Avg_Proton_Energy_(MeV)', 'Total_Proton_Energy_(MeV)']]

    # Split the dataset into train and test sets (fixed seed for reproducibility)
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Initialize the Random Forest Regressor
    rf = RandomForestRegressor(random_state=42)

    # Cross-validate the grid on the training split
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(features_train, target_train)

    # Get the best model from the grid search
    best_model = grid_search.best_estimator_

    # Predict on the test set; one prediction column per target, in the
    # same order as target.columns
    target_test_pred = best_model.predict(features_test)

    # Calculate the MSE, RMSE, and ARE for each target variable.
    # ARE is the mean absolute error divided by the mean of the true test
    # values, expressed in percent.
    mse_errors = []
    rmse_errors = []
    are_errors = []
    for i, column in enumerate(target.columns):
        actual = target_test[column]
        predicted = target_test_pred[:, i]
        mse_error = mean_squared_error(actual, predicted)
        rmse_error = np.sqrt(mse_error)
        are_error = (mean_absolute_error(actual, predicted) / actual.mean()) * 100
        mse_errors.append(mse_error)
        rmse_errors.append(rmse_error)
        are_errors.append(are_error)

    # One row of the results table per dataset size. The first five keys are
    # the original columns, kept in their original order.
    result = {
        'Number of Points': numpoints,
        'MSE Max_Proton_Energy': mse_errors[0],
        'RMSE Max_Proton_Energy': rmse_errors[0],
        'ARE Max_Proton_Energy': are_errors[0],
        'Elapsed Time (seconds)': time.perf_counter() - start_time
    }

    # Also persist the metrics of the remaining targets, which were computed
    # and printed but previously never written to the CSV. They are appended
    # after the original columns using the same naming scheme.
    for i, column in enumerate(target.columns):
        if i == 0:
            continue  # Max_Proton_Energy is already stored above
        label = column.replace('_(MeV)', '')
        result[f'MSE {label}'] = mse_errors[i]
        result[f'RMSE {label}'] = rmse_errors[i]
        result[f'ARE {label}'] = are_errors[i]

    results.append(result)

    # Print the MSE, RMSE, and ARE for each target variable
    for i, column in enumerate(target.columns):
        print(f'MSE for {column}: {mse_errors[i]}')
        print(f'RMSE for {column}: {rmse_errors[i]}')
        print(f'ARE for {column}: {are_errors[i]}%')

    # Print the best hyperparameters from the grid search
    print("Best Hyperparameters:", grid_search.best_params_)

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file.
# NOTE(review): the single-target experiment in this file writes to the same
# 'results.csv' path, so running both overwrites one of the outputs - confirm
# whether the two experiments should use distinct file names.
results_df.to_csv('results.csv', index=False)
