In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import  header
import random
random.seed(1)
import numpy as np
np.random.seed(1)

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False

import seaborn as sns
import re

import plotly.graph_objects as go

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut

from sklearn.feature_selection import RFECV

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge,Lasso,ElasticNet,SGDRegressor,LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# from base_function.base import *

path = %pwd

def mean_relative_error(y_true, y_pred):
    import numpy as np
    relative_error = np.average(np.abs(y_true - y_pred) / y_true, axis=0)
    return relative_error

def Linear_SVR(C=1.0,gamma=0.1,epsilon=1):
    return Pipeline([
        ("std_scaler",StandardScaler()),
        ("model",SVR(kernel="linear",C=C,gamma=gamma,epsilon=epsilon))
    ])
def RBF_SVR(C=1.0,gamma=1,epsilon=1):
    return Pipeline([
        ("std_scaler",StandardScaler()),
        ("model",SVR(kernel="rbf",C=C,gamma=gamma,epsilon=epsilon))
    ])
def Poly_LinearRegression(degree=2):
    return Pipeline([('poly', PolynomialFeatures(degree=degree)),
                  ('linear', LinearRegression())])
def draw_feature_importance(features,feature_importance):
    """
    features: name
    feature_importance:
    """
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    fig = plt.figure(dpi=400)
    plt.barh(pos, list(feature_importance[sorted_idx]), align='center')
    plt.yticks(pos, list(features[sorted_idx]))
    plt.xlabel('Relative Importance')
    plt.title('Feature Importance')
    plt.show()
def model_fit_evaluation(model, x_train, y_train, x_test, y_test, n_fold=5):
    """clf:
    x_train：
    kf = KFold(n_splits=n_fold,shuffle=True,random_state=0)
    print(model)
    result = pd.DataFrame()
    for i, (train_index, test_index) in enumerate(kf.split(range(len(x_train)))):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_validation = x_train[test_index]  # get validation set
        y_validation = y_train[test_index]
        model.fit(x_tr, y_tr)

        result_subset = pd.DataFrame()  # save the prediction
        result_subset["y_validation"] = y_validation
        result_subset["y_pred"] = model.predict(x_validation)
        result = result.append(result_subset)
    print("cross_validation_error in validation set：")
    c = evaluate_model_plot(result["y_validation"],result["y_pred"],show=False)

    print("error in testing set：")
    model.fit(x_train, y_train)
    y_test_pred = model.predict(x_test)
    error_metric_testing = evaluate_model_plot(y_test,y_test_pred,show=False)
    print("====================================")
    return error_metric_testing
def fatigue_mre_metric(y_true, y_predict):
    """
    calculate the mre of fatigue life
    """
    return mean_relative_error(np.power(10,y_true), np.power(10,y_predict))
# LOGCV for selection features
def get_score_logcv(X_train,Y_train,groups_array_train,model):
    """
    groups_array_train:groups of S-N curve
    return the MRE of LOGCV
    """
    logo = LeaveOneGroupOut()
    y_predict_list = []
    y_true_list = []
    index_list = []

    x = X_train.values
    y = Y_train.values
    for train_index, test_index in logo.split(x, y, groups_array_train):# LOOGV
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train,y_train)
        y_predict_list.extend(model.predict(x_test))
        y_true_list.extend(y_test)
        index_list.extend(index[test_index])

    # calculate metric
    mre = mean_relative_error(np.power(10,y_true_list), np.power(10,y_predict_list))
    return mre

In [None]:
#1 reading data
dataset = pd.read_excel('/content/data (1).xlsx',sheet_name="fatigue_strength")
# extract tempers
condition_list = []
# same alloy with different temps has same name
name_list=[]
for i in dataset["url"]:
    #print(i)
    condition = re.findall("-([OF]|[HT]\d+)-",i)[0]
    condition_list.append(condition)
    name = i.replace(r"https://www.makeitfrom.com/material-properties/","")
    name = name.replace(condition,"")
    name_list.append(name)
dataset["tempers"] = condition_list
dataset = dataset.drop(["url"],axis=1)
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,Al,Fe,Si,Zn,Cu,V,Zr,Ag,Mn,Ni,...,Mg,B,Ga,Cr,Pb,Bi,Li,Co,Fatigue Strength,tempers
0,99.75,0.2,0.12,0.03,0.03,0.03,0.0,0.0,0.03,0.0,...,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31,H112
1,99.8,0.175,0.125,0.025,0.025,0.025,0.0,0.0,0.015,0.0,...,0.015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,H112
2,99.65,0.35,0.35,0.025,0.05,0.0,0.0,0.0,0.025,0.0,...,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47,H112
3,99.525,0.4,0.075,0.05,0.01,0.0,0.0,0.0,0.025,0.0,...,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42,H12
4,99.525,0.4,0.075,0.05,0.01,0.0,0.0,0.0,0.025,0.0,...,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49,H14


In [None]:
#2 data processing
# condition label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset["tempers"]=le.fit_transform(dataset["tempers"])

# save the LabelEncoder
import pickle
with open ("le_tempers.txt", 'wb') as f:
    pickle.dump(le, f)

In [None]:
#3 machine leanrning model
Y_columns = "Fatigue Strength"
X = dataset.drop([Y_columns],axis=1).values
Y = dataset[Y_columns].values
X_columns = dataset.drop([Y_columns],axis=1).columns

In [None]:
X[:10]

array([[9.9750e+01, 2.0000e-01, 1.2000e-01, 3.0000e-02, 3.0000e-02,
        3.0000e-02, 0.0000e+00, 0.0000e+00, 3.0000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e-02, 3.0000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 3.0000e+00],
       [9.9800e+01, 1.7500e-01, 1.2500e-01, 2.5000e-02, 2.5000e-02,
        2.5000e-02, 0.0000e+00, 0.0000e+00, 1.5000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.5000e-02, 1.5000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 3.0000e+00],
       [9.9650e+01, 3.5000e-01, 3.5000e-01, 2.5000e-02, 5.0000e-02,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 2.5000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 2.5000e-02, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 3.0000e+00],
       [9.9525e+01, 4.0000e-01, 7.5000e-02, 5.0000e-02, 1.0000e-02,
        0.0000e+0

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test =train_test_split(X,Y,test_size=0.3,random_state=0)

In [None]:
# hyper-paramerer
def hyper_param_opt_model(X, y,model,param_grid):
    """
    hyper-paramerer optimize by GridSearch
    """
    gsc = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring= 'neg_mean_squared_error',
        verbose=0,
        n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    return best_params

print('The best parameters: ',hyper_param_opt_model(X_train,Y_train,RandomForestRegressor(random_state=1),param_grid={
            'max_depth': range(11,16),
            'n_estimators': (5,50,100,200,1000)}))

The best parameters:  {'max_depth': 15, 'n_estimators': 1000}


In [None]:
from sklearn.metrics import r2_score

from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Define base estimator for Bagging Regressor
base_estimator = DecisionTreeRegressor(max_depth=5)

# Train the Bagging Regressor model
bagging_model = BaggingRegressor(base_estimator=base_estimator, n_estimators=100, random_state=1)
bagging_model.fit(X_train, Y_train)

# Make predictions using Bagging Regressor
Y_pred_bagging = bagging_model.predict(X_test)

# Evaluate the Bagging Regressor model
mse_bagging = mean_squared_error(Y_test, Y_pred_bagging)
r2_bagging = r2_score(Y_test, Y_pred_bagging)

print('Bagging Regressor Model Evaluation:')
print('Mean Squared Error (MSE): ', mse_bagging)
print('R-squared (R2) Score: ', r2_bagging)


Bagging Regressor Model Evaluation:
Mean Squared Error (MSE):  354.4911761434549
R-squared (R2) Score:  0.8159638974892081


In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Train the ExtraTreesRegressor model
extra_trees_model = ExtraTreesRegressor(random_state=1)
extra_trees_model.fit(X_train, Y_train)

# Make predictions using ExtraTreesRegressor
Y_pred_extra_trees = extra_trees_model.predict(X_test)

# Evaluate the ExtraTreesRegressor model
mse_extra_trees = mean_squared_error(Y_test, Y_pred_extra_trees)
r2_extra_trees = r2_score(Y_test, Y_pred_extra_trees)

print('ExtraTreesRegressor Model Evaluation:')
print('Mean Squared Error (MSE): ', mse_extra_trees)
print('R-squared (R2) Score: ', r2_extra_trees)


ExtraTreesRegressor Model Evaluation:
Mean Squared Error (MSE):  331.37771385383803
R-squared (R2) Score:  0.8279633823891929


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Defining hyper-parameter optimization function
def hyper_param_opt_model(X, y, model, param_grid):
    """
    Hyper-parameter optimization by GridSearch
    """
    gsc = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1
    )

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    return best_params

# Defining the parameter grid for RandomForestRegressor
param_grid_rgr = {
    'max_depth': range(11, 16),
    'n_estimators': (5, 50, 100, 200, 1000)
}

# Finding the best parameters for RandomForestRegressor
best_params_rgr = hyper_param_opt_model(X_train, Y_train, RandomForestRegressor(random_state=1), param_grid_rgr)
print('The best parameters for RandomForestRegressor: ', best_params_rgr)


The best parameters for RandomForestRegressor:  {'max_depth': 15, 'n_estimators': 1000}


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Train the model with the best parameters
best_rgr_model = RandomForestRegressor(random_state=1, **best_params_rgr)
best_rgr_model.fit(X_train, Y_train)

# Make predictions
Y_pred = best_rgr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print('Mean Squared Error (MSE): ', mse)
print('R-squared (R2) Score: ', r2)

Mean Squared Error (MSE):  245.4510024706385
R-squared (R2) Score:  0.872572721432753


In [None]:
# pip install catboost

# # from catboost import CatBoostRegressor
# # from sklearn.model_selection import GridSearchCV

# # # Define the parameter grid for CatBoostRegressor
# # param_grid_catboost = {
# #     'learning_rate': [0.01, 0.05, 0.1],
# #     'depth': [3, 5, 7],
# #     'l2_leaf_reg': [1, 3, 5]
# # }

# # # Instantiate the CatBoostRegressor model
# # catboost_model = CatBoostRegressor(iterations=100, verbose=0, random_state=1)

# # # Instantiate GridSearchCV for CatBoostRegressor
# # grid_search_catboost = GridSearchCV(estimator=catboost_model, param_grid=param_grid_catboost, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# # # Fit the model
# # grid_search_catboost.fit(X_train, Y_train)

# # # Get the best parameters and the best estimator for CatBoostRegressor
# # best_params_catboost = grid_search_catboost.best_params_
# # best_catboost_model = grid_search_catboost.best_estimator_

# # # Make predictions using the best CatBoostRegressor model
# # Y_pred_catboost = best_catboost_model.predict(X_test)

# # # Evaluate the best CatBoostRegressor model
# # mse_catboost = mean_squared_error(Y_test, Y_pred_catboost)
# # r2_catboost = r2_score(Y_test, Y_pred_catboost)

# # print('CatBoostRegressor Model Evaluation:')
# # print('Mean Squared Error (MSE): ', mse_catboost)
# # print('R-squared (R2) Score: ', r2_catboost)


In [None]:
import numpy as np
from itertools import product

# Define a grid of feature values for each feature (assuming X has n features)
feature_grid = [np.linspace(min_value, max_value, num_points) for min_value, max_value, num_points in zip(X.min(axis=0), X.max(axis=0), [10]*n_features)]

# Generate all possible combinations of feature values
feature_combinations = product(*feature_grid)

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each combination of feature values
for features in feature_combinations:
    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features

print('Optimal feature values:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


NameError: name 'n_features' is not defined

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for LightGBMRegressor
param_grid_lgbm = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31, 63]
}

# Instantiate the LGBMRegressor model
lgbm_model = LGBMRegressor(random_state=1)

# Instantiate GridSearchCV for LGBMRegressor
grid_search_lgbm = GridSearchCV(estimator=lgbm_model, param_grid=param_grid_lgbm, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model
grid_search_lgbm.fit(X_train, Y_train)

# Get the best parameters and the best estimator for LGBMRegressor
best_params_lgbm = grid_search_lgbm.best_params_
best_lgbm_model = grid_search_lgbm.best_estimator_

# Make predictions using the best LGBMRegressor model
Y_pred_lgbm = best_lgbm_model.predict(X_test)

# Evaluate the best LGBMRegressor model
mse_lgbm = mean_squared_error(Y_test, Y_pred_lgbm)
r2_lgbm = r2_score(Y_test, Y_pred_lgbm)

print('LightGBMRegressor Model Evaluation:')
print('Mean Squared Error (MSE): ', mse_lgbm)
print('R-squared (R2) Score: ', r2_lgbm)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the RandomForestRegressor model
rf_model = RandomForestRegressor(random_state=1)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_train, Y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Make predictions using the best RandomForestRegressor model
Y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the best RandomForestRegressor model
mse_rf = mean_squared_error(Y_test, Y_pred_rf)
r2_rf = r2_score(Y_test, Y_pred_rf)

print('RandomForestRegressor Model Evaluation:')
print('Mean Squared Error (MSE): ', mse_rf)
print('R-squared (R2) Score: ', r2_rf)


In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from catboost import CatBoostRegressor
# Train the CatBoostRegressor model
catboost_model = CatBoostRegressor(random_state=1, verbose=0)
catboost_model.fit(X_train, Y_train)

# Make predictions using CatBoostRegressor
Y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the CatBoostRegressor model
mse_catboost = mean_squared_error(Y_test, Y_pred_catboost)
r2_catboost = r2_score(Y_test, Y_pred_catboost)

print('\nCatBoostRegressor Model Evaluation:')
print('Mean Squared Error (MSE): ', mse_catboost)
print('R-squared (R2) Score: ', r2_catboost)


CatBoostRegressor Model Evaluation:
Mean Squared Error (MSE):  219.40607205927685
R-squared (R2) Score:  0.8860940946167559


In [None]:
import numpy as np
from itertools import product

# Define a grid of feature values for each feature (assuming X has n features)
feature_grid = [np.linspace(min_value, max_value, num_points) for min_value, max_value, num_points in zip(X.min(axis=0), X.max(axis=0), [10]*n_features)]

# Generate all possible combinations of feature values
feature_combinations = product(*feature_grid)

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each combination of feature values
for features in feature_combinations:
    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features

print('Optimal feature values:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


NameError: name 'n_features' is not defined

In [None]:
# Retrieve feature importance scores from the trained CatBoostRegressor model
feature_importance = catboost_model.feature_importances_

# Visualize feature importance scores (optional)
# You can use matplotlib or seaborn for visualization

# Identify the most important features based on feature importance scores
most_important_features = np.argsort(feature_importance)[::-1][:k]  # Choose top 'k' most important features

# Define a grid of feature values for the most important features
feature_grid_most_important = [np.linspace(min_value, max_value, num_points) for min_value, max_value, num_points in zip(X[:, most_important_features].min(axis=0), X[:, most_important_features].max(axis=0), [10]*len(most_important_features))]

# Generate all possible combinations of feature values for the most important features
feature_combinations_most_important = product(*feature_grid_most_important)

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each combination of feature values for the most important features
for features_most_important in feature_combinations_most_important:
    # Create a copy of the original feature values and update the most important feature values
    features = X[0].copy()
    features[most_important_features] = features_most_important

    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
# Define the number of top features to consider
k = 5  # Adjust this value based on your preference

# Identify the most important features based on feature importance scores
most_important_features = np.argsort(feature_importance)[::-1][:k]

# Define a grid of feature values for the most important features
feature_grid_most_important = [np.linspace(min_value, max_value, num_points) for min_value, max_value, num_points in zip(X[:, most_important_features].min(axis=0), X[:, most_important_features].max(axis=0), [10]*len(most_important_features))]

# Generate all possible combinations of feature values for the most important features
feature_combinations_most_important = product(*feature_grid_most_important)

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each combination of feature values for the most important features
for features_most_important in feature_combinations_most_important:
    # Create a copy of the original feature values and update the most important feature values
    features = X[0].copy()
    features[most_important_features] = features_most_important

    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
# Define a grid of feature values for all features
feature_grid_all = [np.linspace(min_value, max_value, num_points) for min_value, max_value, num_points in zip(X.min(axis=0), X.max(axis=0), [10]*X.shape[1])]

# Generate all possible combinations of feature values for all features
feature_combinations_all = product(*feature_grid_all)

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each combination of feature values for all features
for features_all in feature_combinations_all:
    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features_all])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_all

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
import random

# Define the number of random samples to generate
num_samples = 1000

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Iterate through each sample
for _ in range(num_samples):
    # Generate random feature values
    random_features = [random.uniform(min_value, max_value) for min_value, max_value in zip(X.min(axis=0), X.max(axis=0))]

    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([random_features])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = random_features

# Retrieve feature names (column titles) from the dataset
feature_names = dataset.columns.tolist()

# Create a dictionary to store feature names and their corresponding values
feature_names_values = {feature_name: feature_value for feature_name, feature_value in zip(feature_names, optimal_feature_values)}

print('Optimal feature values for highest predicted Y value:')
for feature_name, feature_value in feature_names_values.items():
    print(feature_name + ':', feature_value)
print('Predicted highest Y value:', max_predicted_value)


Optimal feature values for highest predicted Y value:
Al: 78.87271669062264
Fe: 0.06953463149706618
Si: 0.042013689305253246
Zn: 2.0019512497367478
Cu: 1.0203755496209859
V: 0.03143784525357092
Zr: 0.03237143867770236
Ag: 0.26032161889845584
Mn: 0.12508027755874992
Ni: 0.8515624392066721
Sn: 0.947714154980419
Be: 0.11068670960124401
Ti: 0.022880042986721666
Mg: 2.804814277559492
B: 0.018281773050379754
Ga: 0.02078494884777217
Cr: 0.04914379908719274
Pb: 0.06320873820564676
Bi: 0.04982570470181524
Li: 0.07896077934013257
Co: 0.0009600861261754496
Fatigue Strength: 74.59202779998523
Predicted highest Y value: [160.45015977]


In [None]:
Y

In [None]:
# Define the number of random samples to generate
num_samples = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X.min(axis=0), X.max(axis=0))]

    # Predict the target variable using the CatBoostRegressor model
    predicted_value = catboost_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


Optimal feature values for highest predicted Y value: [78.48678080109418, 0.2098362981111231, 0.22412939421904013, 2.6406663443783147, 7.065739859699131, 0.09074208978799701, 0.18612851873782, 0.1424673328154528, 0.42222769240333125, 0.2490183665259446, 1.132033979330438, 0.1532310517527692, 0.010419198552194808, 3.5162881981889016, 0.029677403105913082, 0.019290168513244818, 0.10941844318648863, 0.04531399232938575, 0.6352407333505401, 0.7239714507850129, 0.005628460917412173, 84.07947149156566]
Predicted highest Y value: [157.7080181]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 1000000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rfr_model, param_distributions=param_grid, n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=1, n_jobs=-1)

# Perform hyperparameter tuning
random_search.fit(X_train, Y_train)

# Get the best RandomForestRegressor model
best_rfr_model = random_search.best_estimator_

# Define the number of random samples to generate
num_samples = 1000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the best RandomForestRegressor model
    predicted_value = best_rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of iterations for refinement
num_iterations = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Start with random feature values
current_feature_values = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

# Iterate to refine feature values towards maximum predicted Y value
for _ in range(num_iterations):
    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([current_feature_values])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = current_feature_values.copy()

    # Randomly perturb one feature value
    index_to_perturb = np.random.randint(len(current_feature_values))
    current_feature_values[index_to_perturb] = np.random.uniform(X_train.min(axis=0)[index_to_perturb], X_train.max(axis=0)[index_to_perturb])

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
# Define the learning rate for gradient ascent
learning_rate = 0.01  # Adjust this value based on your preference

# Initialize feature values randomly
current_feature_values = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

# Define the number of iterations for gradient ascent
num_iterations = 1000  # Adjust this value based on your preference

# Iterate to optimize feature values using gradient ascent
for _ in range(num_iterations):
    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([current_feature_values])[0]

    # Compute the gradient of the predicted Y value with respect to the feature values
    gradient = np.zeros(len(current_feature_values))
    for i in range(len(current_feature_values)):
        # Perturb the feature value
        perturbed_feature_values = current_feature_values.copy()
        perturbed_feature_values[i] += 0.0001  # Small perturbation

        # Compute the perturbed predicted Y value
        perturbed_predicted_value = rfr_model.predict([perturbed_feature_values])[0]

        # Compute the gradient using finite differences
        gradient[i] = (perturbed_predicted_value - predicted_value) / 0.0001

    # Update feature values using gradient ascent
    current_feature_values += learning_rate * gradient

# Predict the target variable using the final feature values
predicted_value = rfr_model.predict([current_feature_values])[0]

print('Optimal feature values for highest predicted Y value:', current_feature_values)
print('Predicted highest Y value:', predicted_value)


In [None]:
# Define the learning rate for gradient ascent
learning_rate = 0.01  # Adjust this value based on your preference

# Initialize feature values randomly
current_feature_values = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

# Define the number of iterations for gradient ascent
num_iterations = 1000  # Adjust this value based on your preference

# Iterate to optimize feature values using gradient ascent
for _ in range(num_iterations):
    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([current_feature_values])[0]

    # Compute the gradient of the predicted Y value with respect to the feature values
    gradient = np.zeros(len(current_feature_values))
    for i in range(len(current_feature_values)):
        # Perturb the feature value
        perturbed_feature_values = current_feature_values.copy()
        perturbed_feature_values[i] += 0.0001  # Small perturbation

        # Compute the perturbed predicted Y value
        perturbed_predicted_value = rfr_model.predict([perturbed_feature_values])[0]

        # Compute the gradient using finite differences
        gradient[i] = (perturbed_predicted_value - predicted_value) / 0.0001

    # Update feature values using gradient ascent
    current_feature_values += learning_rate * gradient

# Predict the target variable using the final feature values
predicted_value = rfr_model.predict([current_feature_values])[0]

print('Optimal feature values for highest predicted Y value:', current_feature_values)
print('Predicted highest Y value:', predicted_value)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [89.43060679934018, 0.7494843837198794, 0.1397695419783953, 1.5896496186600395, 0.6507715210363575, 0.08856358739566314, 0.18238332190705897, 0.2869716435308236, 1.085815575555214, 1.2162250863494404, 3.588205567218291, 0.11119293395955179, 0.027503815937186593, 5.326782513089788, 0.008531116367407665, 0.015741001083844824, 0.1644734618515073, 0.3472483482772754, 0.7673350141417493, 0.4355806556130649, 0.005609150657893917, 18.048065179993266]
Predicted highest Y value: [200.03]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=2)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 10000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [84.93667436550683, 0.30938171127030684, 0.052972264832087335, 7.731520131841515, 5.010745072099786, 0.10560368766869152, 0.01016799124692025, 0.40272757869259157, 0.6234480476445756, 2.2359369541048837, 2.842993778560405, 0.15509229104851385, 0.012837240601386773, 6.161256155170042, 0.005963500917715316, 0.0009158201218520817, 0.20335205870639428, 0.9677134496114168, 0.4941540169122949, 0.07214211858149744, 0.009933516638317031, 54.84176531505142]
Predicted highest Y value: [199.06]


In [None]:
# Define the target Y value
target_y = 215

# Define the learning rate for gradient ascent
learning_rate = 0.01  # Adjust this value based on your preference

# Initialize feature values randomly
current_feature_values = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

# Define the maximum number of iterations
max_iterations = 10000

# Define a tolerance level for the target Y value
tolerance = 0.1  # Adjust this value based on your preference

# Iterate to find feature values for the target Y value
for iteration in range(max_iterations):
    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([current_feature_values])[0]

    # Check if the predicted Y value is close to the target Y value
    if abs(predicted_value - target_y) < tolerance:
        break  # Stop iteration if the predicted Y value is within the tolerance level of the target Y value

    # Compute the gradient of the predicted Y value with respect to the feature values
    gradient = np.zeros(len(current_feature_values))
    for i in range(len(current_feature_values)):
        # Perturb the feature value
        perturbed_feature_values = current_feature_values.copy()
        perturbed_feature_values[i] += 0.0001  # Small perturbation

        # Compute the perturbed predicted Y value
        perturbed_predicted_value = rfr_model.predict([perturbed_feature_values])[0]

        # Compute the gradient using finite differences
        gradient[i] = (perturbed_predicted_value - predicted_value) / 0.0001

    # Update feature values using gradient ascent
    current_feature_values += learning_rate * gradient

# Predict the target variable using the final feature values
predicted_value = rfr_model.predict([current_feature_values])[0]

print('Optimal feature values for Y = 215:', current_feature_values)
print('Predicted Y value:', predicted_value)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 1000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

NameError: name 'X' is not defined

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 10000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [93.06532219319618, 1.1358368423191556, 0.07166277990321451, 0.5787168512552684, 2.137470874650111, 0.07782550963323397, 0.15447277996536837, 0.6245316194705949, 1.1647191965482446, 0.7553097117160251, 4.430129884786545, 0.11436197038202107, 0.07946567894255721, 9.126822201587755, 0.004091834339674928, 0.015305181554451902, 0.3996683787950368, 1.115531755345818, 0.11815303919348974, 0.8679860803506013, 0.002620628075767753, 19.737372631195527]
Predicted highest Y value: [198.1]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [84.83658367694764, 0.09953181783766127, 0.20243126142980003, 6.893031583059888, 4.5386687335783975, 0.0383451060751409, 0.04342996991034294, 0.09016071539810683, 0.3440765489926781, 0.030727959099520752, 0.9910473697604547, 0.09214902281492886, 0.20482838220776278, 8.250944681546214, 0.00620355571535783, 0.0011432501212209456, 0.20129243522381646, 0.07932038440415345, 0.31642409574498964, 0.9441230739835622, 0.02419591915815572, 78.34412194948513]
Predicted highest Y value: [199.3]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=2)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 1000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [85.28911401241461, 0.0903886293183573, 0.3739976247269026, 6.819302365786251, 2.801894822796146, 0.12293001322965053, 0.17724724334754094, 0.27244926312475215, 0.05485976649426516, 0.5713177346710943, 2.4335831653937996, 0.1927233149532187, 0.23622742738346802, 3.7111338187579195, 0.025587508179156965, 0.013876755355855322, 0.24065631339378613, 0.1948810165209074, 0.5033552618134874, 0.05900556665122125, 0.014991268112334494, 55.05115380289267]
Predicted highest Y value: [197.4]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=2)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 10000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)

Optimal feature values for highest predicted Y value: [83.0076262519024, 0.10706105737198342, 0.07750317720678176, 6.461781227874871, 7.394061973310915, 0.09250475032599242, 0.16513273021186267, 0.5003780750696473, 0.36812103599743656, 0.5612214766371684, 0.854494579242529, 0.15157192343155457, 0.06049037314113048, 9.203830205647506, 0.001133201494140691, 0.021656362307989386, 0.34887257955831097, 0.2509865121584474, 0.9688372667081347, 0.2666711793276705, 0.0007572729715136562, 55.51304604471094]
Predicted highest Y value: [197.8]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)


In [None]:
pip install bayesian-optimization


Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization

# Define the objective function to maximize (in this case, the negative mean squared error)
def rf_objective_function(learning_rate, n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        max_features=max_features,
        random_state=1
    )
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    mse = -1 * np.mean((Y_test - y_pred) ** 2)  # Negative mean squared error (to maximize)
    return mse

# Define the parameter space for Bayesian optimization
rf_parameter_space = {
    'learning_rate': (0.001, 0.1),
    'n_estimators': (10, 200),
    'max_depth': (3, 20),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': (0.1, 1.0)
}

# Perform Bayesian optimization
rf_optimizer = BayesianOptimization(f=rf_objective_function, pbounds=rf_parameter_space, random_state=1)
rf_optimizer.maximize(init_points=5, n_iter=50)

# Get the best hyperparameters
best_rf_params = rf_optimizer.max['params']

print('Best hyperparameters for RandomForestRegressor:')
print(best_rf_params)


|   iter    |  target   | learni... | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-467.2   [0m | [0m0.04229  [0m | [0m15.25    [0m | [0m0.1001   [0m | [0m3.721    [0m | [0m4.642    [0m | [0m27.54    [0m |
| [95m2        [0m | [95m-343.9   [0m | [95m0.01944  [0m | [95m8.875    [0m | [95m0.4571   [0m | [95m5.849    [0m | [95m9.546    [0m | [95m140.2    [0m |
| [0m3        [0m | [0m-584.3   [0m | [0m0.02124  [0m | [0m17.93    [0m | [0m0.1246   [0m | [0m7.034    [0m | [0m9.511    [0m | [0m116.2    [0m |
| [0m4        [0m | [0m-382.1   [0m | [0m0.0149   [0m | [0m6.368    [0m | [0m0.8207   [0m | [0m9.714    [0m | [0m7.642    [0m | [0m141.5    [0m |
| [95m5        [0m | [95m-321.5   [0m | [95m0.08776  [0m | [95m18.21    [0m | [95m0.1765   [0m | [95m1.351    [0m | [95m5.057    [0m |

In [None]:
from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization

# Define the objective function to maximize (in this case, the negative mean squared error)
def catboost_objective_function(learning_rate, depth, bagging_temperature, l2_leaf_reg):
    model = CatBoostRegressor(
        learning_rate=learning_rate,
        depth=int(depth),
        bagging_temperature=bagging_temperature,
        l2_leaf_reg=l2_leaf_reg,
        verbose=0,
        random_seed=1
    )
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    mse = -1 * np.mean((Y_test - y_pred) ** 2)  # Negative mean squared error (to maximize)
    return mse

# Define the parameter space for Bayesian optimization
catboost_parameter_space = {
    'learning_rate': (0.001, 0.1),
    'depth': (3, 10),
    'bagging_temperature': (0.0, 10.0),
    'l2_leaf_reg': (0.0, 5.0)
}

# Perform Bayesian optimization
catboost_optimizer = BayesianOptimization(f=catboost_objective_function, pbounds=catboost_parameter_space, random_state=1)
catboost_optimizer.maximize(init_points=5, n_iter=50)

# Get the best hyperparameters
best_catboost_params = catboost_optimizer.max['params']

print('Best hyperparameters for CatBoostRegressor:')
print(best_catboost_params)


|   iter    |  target   | baggin... |   depth   | l2_lea... | learni... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-216.4   [0m | [0m4.17     [0m | [0m8.042    [0m | [0m0.0005719[0m | [0m0.03093  [0m |
| [0m2        [0m | [0m-249.3   [0m | [0m1.468    [0m | [0m3.646    [0m | [0m0.9313   [0m | [0m0.03521  [0m |
| [95m3        [0m | [95m-215.8   [0m | [95m3.968    [0m | [95m6.772    [0m | [95m2.096    [0m | [95m0.06884  [0m |
| [0m4        [0m | [0m-226.7   [0m | [0m2.045    [0m | [0m9.147    [0m | [0m0.1369   [0m | [0m0.06738  [0m |
| [0m5        [0m | [0m-219.4   [0m | [0m4.173    [0m | [0m6.911    [0m | [0m0.7019   [0m | [0m0.02061  [0m |
| [0m6        [0m | [0m-216.1   [0m | [0m6.381    [0m | [0m9.011    [0m | [0m2.8      [0m | [0m0.1      [0m |
| [0m7        [0m | [0m-221.8   [0m | [0m6.833    [0m | [0m5.571    [0m | [0m5.0      [0m | [0m0.1     

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=1)

# Train the model
rfr_model.fit(X_train, Y_train)

# Feature selection based on feature importance
feature_selector = SelectFromModel(rfr_model, threshold='median')
feature_selector.fit(X_train, Y_train)
X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

# Fit the model on selected features
rfr_model_selected = RandomForestRegressor(n_estimators=1000, random_state=1)
rfr_model_selected.fit(X_train_selected, Y_train)

# Predict the target variable
optimal_rfr_feature_values = X_test_selected[0]  # Use any data point from test set for prediction
optimal_rfr_y_value = rfr_model_selected.predict([optimal_rfr_feature_values])[0]

print('Optimal feature values for RandomForestRegressor:', optimal_rfr_feature_values)
print('Predicted optimal Y value for RandomForestRegressor:', optimal_rfr_y_value)


Optimal feature values for RandomForestRegressor: [9.645e+01 2.500e-01 2.000e-01 7.500e-02 7.500e-02 0.000e+00 3.000e-01
 0.000e+00 2.500e+00 7.500e-02 1.100e+01]
Predicted optimal Y value for RandomForestRegressor: 91.616


In [None]:
from catboost import CatBoostRegressor

# Initialize the CatBoostRegressor model
catboost_model = CatBoostRegressor(learning_rate=0.05, depth=6, iterations=100, random_state=1, verbose=0)

# Train the model
catboost_model.fit(X_train, Y_train)

# Get feature importance
feature_importance = catboost_model.get_feature_importance()

# Select top features based on importance
top_features_indices = feature_importance.argsort()[-5:][::-1]  # Select top 5 features
X_train_top_features = X_train[:, top_features_indices]
X_test_top_features = X_test[:, top_features_indices]

# Fit the model on selected features
catboost_model_top_features = CatBoostRegressor(learning_rate=0.05, depth=6, iterations=100, random_state=1, verbose=0)
catboost_model_top_features.fit(X_train_top_features, Y_train)

# Predict the target variable
optimal_catboost_feature_values = X_test_top_features[0]  # Use any data point from test set for prediction
optimal_catboost_y_value = catboost_model_top_features.predict([optimal_catboost_feature_values])[0]

print('Optimal feature values for CatBoostRegressor:', optimal_catboost_feature_values)
print('Predicted optimal Y value for CatBoostRegressor:', optimal_catboost_y_value)


Optimal feature values for CatBoostRegressor: [9.645e+01 2.500e+00 1.100e+01 7.500e-02 7.500e-02]
Predicted optimal Y value for CatBoostRegressor: 105.02546351595443


In [None]:
pip install evolutionary_search

[31mERROR: Could not find a version that satisfies the requirement evolutionary_search (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for evolutionary_search[0m[31m
[0m

In [None]:
from sklearn.ensemble import RandomForestRegressor
from evolutionary_search import EvolutionaryAlgorithmSearchCV

# Define the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Define the parameter grid for feature selection using genetic algorithms
param_grid = {
    'n_estimators': [100],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize and fit EvolutionaryAlgorithmSearchCV for feature selection
evolutionary_search = EvolutionaryAlgorithmSearchCV(estimator=rfr_model,
                                                    params=param_grid,
                                                    scoring='neg_mean_squared_error',
                                                    cv=5,
                                                    verbose=1,
                                                    population_size=50,
                                                    gene_mutation_prob=0.10,
                                                    gene_crossover_prob=0.5,
                                                    tournament_size=3,
                                                    generations_number=10)
evolutionary_search.fit(X_train, Y_train)

# Get the best estimator after feature selection
best_rfr_model = evolutionary_search.best_estimator_

# Predict the target variable
optimal_rfr_feature_values = X_test[0]  # Use any data point from test set for prediction
optimal_rfr_y_value = best_rfr_model.predict([optimal_rfr_feature_values])[0]

print('Optimal feature values for RandomForestRegressor:', optimal_rfr_feature_values)
print('Predicted optimal Y value for RandomForestRegressor:', optimal_rfr_y_value)


ModuleNotFoundError: No module named 'evolutionary_search'

In [None]:
from catboost import CatBoostRegressor

# Define the CatBoostRegressor model
catboost_model = CatBoostRegressor(learning_rate=0.05, depth=6, iterations=100, random_state=1, verbose=0)

# Fit the CatBoostRegressor model
catboost_model.fit(X_train, Y_train)

# Get feature importance
feature_importance = catboost_model.get_feature_importance()

# Sort features based on importance
sorted_features_indices = feature_importance.argsort()[::-1]

# Select top features based on importance
top_features_indices = sorted_features_indices[:5]  # Select top 5 features
X_train_top_features = X_train[:, top_features_indices]
X_test_top_features = X_test[:, top_features_indices]

# Fit the model on selected features
catboost_model_top_features = CatBoostRegressor(learning_rate=0.05, depth=6, iterations=100, random_state=1, verbose=0)
catboost_model_top_features.fit(X_train_top_features, Y_train)

# Predict the target variable
optimal_catboost_feature_values = X_test_top_features[0]  # Use any data point from test set for prediction
optimal_catboost_y_value = catboost_model_top_features.predict([optimal_catboost_feature_values])[0]

print('Optimal feature values for CatBoostRegressor:', optimal_catboost_feature_values)
print('Predicted optimal Y value for CatBoostRegressor:', optimal_catboost_y_value)


In [None]:
pip install tpot


Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m81.9/87.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=1.4.1 (from tpot)
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m


In [None]:


from tpot import TPOTRegressor

# Define the TPOTRegressor model
tpot_model = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=1)

# Fit the TPOTRegressor model
tpot_model.fit(X_train, Y_train)

# Predict the target variable
optimal_feature_values = X_test[0]  # Use any data point from test set for prediction
optimal_y_value = tpot_model.predict([optimal_feature_values])[0]

print('Optimal feature values:', optimal_feature_values)
print('Predicted optimal Y value:', optimal_y_value)



ImportError: cannot import name '_fit_context' from 'sklearn.base' (/usr/local/lib/python3.10/dist-packages/sklearn/base.py)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
rfr_model = RandomForestRegressor(random_state=1)

# Fit the RandomForestRegressor model with the training data
rfr_model.fit(X, Y)

# Define the number of random samples to generate
num_samples = 100000  # Adjust this value based on your preference

# Initialize variables to store the maximum predicted value and its corresponding feature values
max_predicted_value = float('-inf')
optimal_feature_values = None

# Generate random samples of feature values
for _ in range(num_samples):
    # Randomly sample feature values from the feature space
    features_random = [np.random.uniform(min_value, max_value) for min_value, max_value in zip(X_train.min(axis=0), X_train.max(axis=0))]

    # Predict the target variable using the RandomForestRegressor model
    predicted_value = rfr_model.predict([features_random])

    # Check if the predicted value is higher than the current maximum
    if predicted_value > max_predicted_value:
        max_predicted_value = predicted_value
        optimal_feature_values = features_random

print('Optimal feature values for highest predicted Y value:', optimal_feature_values)
print('Predicted highest Y value:', max_predicted_value)