In [3]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

In [4]:
# Load the dataset with all basins
data = pd.read_csv("../../data/df_output/df_cleaned.csv")

In [6]:
# including all elements 

# Filter the dataset for only Gulf Coast Basin data
gulf_coast_data = data[data['BASIN_CATEGORY'] == 'Gulf Coast']

# Initial features and targets
features = ['TDS', 'Cl', 'Na', 'K', 'CHARGEBAL']
targets = ['B', 'Ba', 'Br', 'HCO3', 'FeTot', 'SO4', 'Mg', 'Ca', 'Sr', 'Zn', 'DEPTHUPPER']

# Sort targets based on the number of missing values
targets_sorted = gulf_coast_data [targets].isnull().sum().sort_values().index.tolist()


# Preprocess features
numeric_features = features  # Only numeric features are used

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])

# Initialize the full feature set with existing features
full_features = features.copy() 

#Define a dictionary to hold different models and their grid search parameters
model_params = {
    'RF': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'regressor__n_estimators': [100, 200, 300],
            'regressor__max_depth': [None, 10, 20, 30],
            'regressor__min_samples_split': [2, 5, 10],
            'regressor__min_samples_leaf': [1, 2, 4]
        }
    },
    'GB': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'regressor__n_estimators': [100, 200, 300],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 10]
        }
    },
    'MLP': {
        'model': MLPRegressor(random_state=42),
        'params': {
            'regressor__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
            'regressor__activation': ['relu', 'tanh'],
            'regressor__solver': ['adam', 'sgd'],
            'regressor__alpha': [0.0001, 0.001, 0.01],
            'regressor__learning_rate_init': [0.001, 0.01]
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'regressor__C': [1, 10, 100],
            'regressor__gamma': ['scale', 'auto'],
            'regressor__kernel': ['rbf', 'linear', 'poly'],
            'regressor__epsilon': [0.01, 0.1, 0.2]
        }
    },
    'XGB': {
        'model': XGBRegressor(random_state=42),
        'params': {
            'regressor__n_estimators': [100, 200, 300],
            'regressor__max_depth': [3, 5, 7],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__subsample': [0.7, 0.8, 0.9],
            'regressor__colsample_bytree': [0.7, 0.8, 0.9]
        }
    }
}


# Store the best model and performance metrics for each target
best_models = {}
performance_data = []

for target in targets_sorted:
    best_score = float('inf')
    best_model = None
    best_model_name = ""

    current_data = gulf_coast_data.dropna(subset=[target])
    X = current_data[full_features]
    y = current_data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Loop over each model type
    for model_name, mp in model_params.items():
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', mp['model'])
        ])
        grid_search = GridSearchCV(model, mp['params'], cv=5, scoring='neg_mean_squared_error', verbose=0)
        grid_search.fit(X_train, y_train)
        
        score = -grid_search.best_score_
        if score < best_score:
            best_score = score
            best_model = grid_search.best_estimator_
            best_model_name = model_name

    # Store the best model and its name
    best_models[target] = best_model
    y_pred = best_models[target].predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    # Collect performance data
    performance_data.append({
        'Element': target,
        'Model': best_model_name,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Explained Variance': explained_var
    })

    print(f"Best Model for {target}: {best_model_name} with MSE = {mse:.3f}, MAE = {mae:.3f}, R2 = {r2:.3f}, Explained Variance = {explained_var:.3f}")

    # Update the dataset with predicted values for the current target
    gulf_coast_data.loc[gulf_coast_data[target].isnull(), target] = best_models[target].predict(gulf_coast_data[full_features][gulf_coast_data[target].isnull()])

    # Add the current target to the full feature list for subsequent iterations
    if target not in full_features:
        full_features.append(target)

# Create a DataFrame from the collected performance data
performance_df = pd.DataFrame(performance_data)

# Save the DataFrame to a CSV file
performance_df.to_csv('../../data/performance_comparison/MTSC_all_targets_best_model_performance.csv', index=False)
print("Model performance metrics saved to 'model_performance.csv'.")


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for Ca: MLPRegressor with MSE = 2586448.771, MAE = 469.937, R2 = 0.976, Explained Variance = 0.976


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  activations[i + 1] += self.intercepts_[i]
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\da

Best Model for Mg: RandomForestRegressor with MSE = 986185.921, MAE = 275.055, R2 = 0.502, Explained Variance = 0.502


  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for HCO3: MLPRegressor with MSE = 150744.819, MAE = 219.998, R2 = 0.659, Explained Variance = 0.672


  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for DEPTHUPPER: RandomForestRegressor with MSE = 6742096.478, MAE = 1972.610, R2 = 0.295, Explained Variance = 0.296


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for SO4: RandomForestRegressor with MSE = 316532.605, MAE = 207.383, R2 = 0.605, Explained Variance = 0.605


  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for FeTot: MLPRegressor with MSE = 59121.151, MAE = 82.585, R2 = 0.264, Explained Variance = 0.265


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for Ba: RandomForestRegressor with MSE = 25933.843, MAE = 75.734, R2 = 0.211, Explained Variance = 0.213


  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  activations[i + 1] += self.intercepts_[i]
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\da

Best Model for Sr: MLPRegressor with MSE = 368694.572, MAE = 232.289, R2 = 0.642, Explained Variance = 0.642


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for Br: RandomForestRegressor with MSE = 655329.380, MAE = 383.492, R2 = 0.776, Explained Variance = 0.777


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for B: MLPRegressor with MSE = 1927.771, MAE = 27.913, R2 = 0.627, Explained Variance = 0.627


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

Best Model for Zn: RandomForestRegressor with MSE = 2687.866, MAE = 27.192, R2 = 0.593, Explained Variance = 0.593
Model performance metrics saved to 'model_performance.csv'.


In [14]:
# Prepare data for Lithium prediction
X = gulf_coast_data[full_features]
y = gulf_coast_data['Li']

# Remove NA values from our features and target
X = X[y.notna()]
y = y.dropna()

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def model_params_to_string(model):
    try:
        params = model.get_params()
        return "; ".join(f"{k}={v}" for k, v in params.items())
    except AttributeError:
        return "No parameters available"

# Define a dictionary to hold performances for Lithium predictions
li_model_performances = []

# Iterate over each model type for Lithium prediction
for model_name, mp in model_params.items():
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', mp['model'])
    ])
    grid_search = GridSearchCV(model, mp['params'], cv=5, scoring='neg_mean_squared_error', verbose=0)
    grid_search.fit(X_train, y_train)

    # Select the best model from the grid search
    best_li_model = grid_search.best_estimator_
    
    # Predict Lithium on the test set
    y_pred_li = best_li_model.predict(X_test)

    # Calculate performance metrics
    li_mse = mean_squared_error(y_test, y_pred_li)
    li_mae = mean_absolute_error(y_test, y_pred_li)
    li_r2 = r2_score(y_test, y_pred_li)
    li_explained_variance = explained_variance_score(y_test, y_pred_li)

     # Append model details and performance metrics to the list
    li_model_performances.append({
        'Model': model_name,
        'Parameters': model_params_to_string(best_li_model),
        'MSE': li_mse,
        'MAE': li_mae,
        'R2': li_r2,
        'Explained Variance': li_explained_variance
    })

    
    # Output the performance metrics for each model
    print(f'{model_name} Performance for Li Prediction: MSE = {li_mse:.3f}, MAE = {li_mae:.3f}, R2 = {li_r2:.3f}, Explained Variance = {li_explained_variance:.3f}')

    gulf_coast_data_updated = gulf_coast_data.copy()
    # Predict Lithium where it's currently NaN using the current model
    X_unknown = gulf_coast_data_updated[full_features][gulf_coast_data_updated['Li'].isnull()]
    predicted_Li_values = best_li_model.predict(X_unknown)

    # Fill in the missing values with the predictions
    
    gulf_coast_data_updated.loc[gulf_coast_data['Li'].isnull(), 'Li_predicted'] = predicted_Li_values

    # Save the updated DataFrame to a new CSV file
    csv_file_path = f'../../data/predicted_data/multi_target_sequential_chain/{model_name}_gulf_coast_data_with_predicted_Li.csv'
    gulf_coast_data_updated.to_csv(csv_file_path, index=False)
    print(f"Predicted Lithium values saved to '{csv_file_path}'.")

# Convert the performance data into a DataFrame
performance_df = pd.DataFrame(li_model_performances)

# Save the DataFrame to a CSV file
performance_df.to_csv('../../data/performance_comparison/MTSC_Li_model_performance_and_architecture.csv', index=False)
print("Lithium model performance metrics and details saved to 'Li_model_performance_and_architecture.csv'.")



RandomForestRegressor Performance for Li Prediction: MSE = 2087.347, MAE = 18.863, R2 = 0.631, Explained Variance = 0.645
Predicted Lithium values saved to '../../data/predicted_data/multi_target_sequential_chain/RandomForestRegressor_gulf_coast_data_with_predicted_Li.csv'.
GradientBoostingRegressor Performance for Li Prediction: MSE = 1460.208, MAE = 23.090, R2 = 0.742, Explained Variance = 0.749
Predicted Lithium values saved to '../../data/predicted_data/multi_target_sequential_chain/GradientBoostingRegressor_gulf_coast_data_with_predicted_Li.csv'.


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

MLPRegressor Performance for Li Prediction: MSE = 1420.581, MAE = 18.671, R2 = 0.749, Explained Variance = 0.763
Predicted Lithium values saved to '../../data/predicted_data/multi_target_sequential_chain/MLPRegressor_gulf_coast_data_with_predicted_Li.csv'.
SVR Performance for Li Prediction: MSE = 1143.157, MAE = 14.472, R2 = 0.798, Explained Variance = 0.798
Predicted Lithium values saved to '../../data/predicted_data/multi_target_sequential_chain/SVR_gulf_coast_data_with_predicted_Li.csv'.
XGBRegressor Performance for Li Prediction: MSE = 1177.481, MAE = 21.198, R2 = 0.792, Explained Variance = 0.796
Predicted Lithium values saved to '../../data/predicted_data/multi_target_sequential_chain/XGBRegressor_gulf_coast_data_with_predicted_Li.csv'.
Lithium model performance metrics and details saved to 'Li_model_performance_and_architecture.csv'.


In [8]:
# All targets MODEL ARCHITECTURE and performance saved separately in txt files


def save_model_details(model, model_name, filename):
    with open(filename, 'w') as file:
        # Write the type of model
        file.write(f'Model Type: {model_name}\n')
        # Depending on the model type, write different details
        if hasattr(model, 'get_params'):
            # For most sklearn models
            params = model.get_params()
            file.write(f'Parameters: \n{params}\n')
        if model_name == 'MLPRegressor':
            # Additional details for neural networks if needed
            file.write(f'Layers: {model.hidden_layer_sizes}\n')
            file.write(f'Activation: {model.activation}\n')

# Loop through all models to save their details
for target, model in best_models.items():
    model_name = type(model).__name__
    filename = f'model_details_{model_name}_{target}.txt'
    save_model_details(model, model_name, filename)

print("Model details saved.")

Model details saved.


In [15]:
# All targets Model Architecture and Performance saved in one CSV


# Function to convert model configuration to string
def model_config_to_string(model):
    param_details = ""
    if hasattr(model, 'get_params'):
        params = model.get_params()
        for key, value in params.items():
            param_details += f"{key}: {value}, "
    return param_details.strip(', ')

# Initialize a list to store performance data
performance_data = []

for target in targets_sorted:
    best_score = float('inf')
    best_model = None
    best_model_name = ""

    current_data = gulf_coast_data.dropna(subset=[target])
    X = current_data[full_features]
    y = current_data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Loop over each model type
    for model_name, mp in model_params.items():
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', mp['model'])
        ])
        grid_search = GridSearchCV(model, mp['params'], cv=5, scoring='neg_mean_squared_error', verbose=0)
        grid_search.fit(X_train, y_train)
        
        score = -grid_search.best_score_
        if score < best_score:
            best_score = score
            best_model = grid_search.best_estimator_
            best_model_name = model_name

    # Predict and evaluate
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    # Collect performance data with model configurations
    performance_data.append({
        'Element': target,
        'Model': best_model_name,
        'Model Parameters': model_config_to_string(best_model),
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Explained Variance': explained_var
    })

# Create a DataFrame from the collected performance data
performance_df = pd.DataFrame(performance_data)

# Save the DataFrame to a CSV file
performance_df.to_csv('../../data/performance_comparison/all_targets_model_performance_details.csv', index=False)
print("Model performance metrics and details saved to 'model_performance_details.csv'.")

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  activations[i + 1] += self.intercepts_[i]
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_a

In [17]:
#split dataset into two parts (Li<200 and Li>100)

import joblib
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from math import sqrt


# Set 'IDUSGS' as the index if it's not already
gulf_coast_data.set_index('IDUSGS', inplace=True)

# Create two overlapping datasets:
X1 = gulf_coast_data[gulf_coast_data['Li'] < 200]
y1 = X1['Li']

X2 = gulf_coast_data[gulf_coast_data['Li'] > 100]
y2 = X2['Li']

# Define dictionaries to hold performances for Lithium predictions in both datasets
li_model_performances_1 = {}
li_model_performances_2 = {}

# Split the data for training and testing
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)


# Function to train models and evaluate performance
def train_evaluate_save(X_train, y_train, X_test, y_test, model_params, segment_label):
    best_score = float('inf')
    best_model = None
    best_model_name = ""
    
    for model_name, mp in model_params.items():
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', mp['model'])
        ])
        grid_search = GridSearchCV(model, mp['params'], cv=5, scoring='neg_mean_squared_error', verbose=0)
        grid_search.fit(X_train, y_train)

        # Select the best model from the grid search based on MSE
        if -grid_search.best_score_ < best_score:
            best_score = -grid_search.best_score_
            best_model = grid_search.best_estimator_
            best_model_name = model_name

        print(f'{model_name} {segment_label} - MSE: {-grid_search.best_score_:.3f}')

    # Predict Lithium on the test set using the best model
    y_pred_li = best_model.predict(X_test)
    
    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred_li)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred_li)
    explained_variance = explained_variance_score(y_test, y_pred_li)
    
    # Print performance metrics
    print(f'Performance for {segment_label} using {best_model_name}: MSE = {mse:.3f}, RMSE = {rmse:.3f}, R2 = {r2:.3f}, Explained Variance = {explained_variance:.3f}')

    # Save the best model
    safe_segment_label = segment_label.replace(" ", "_").replace("<", "lt_").replace(">", "gt_")  
    model_filename = f'best_model_{safe_segment_label}.joblib'
    joblib.dump(best_model, model_filename)
    print(f"Saved {best_model_name} as the best model for {segment_label} to {model_filename}")
    
    return y_pred_li, best_model, best_model_name

# Train, evaluate, and save for the first dataset (Li < 200)
y_pred_1, best_model_1, best_model_name_1 = train_evaluate_save(X1_train, y1_train, X1_test, y1_test, model_params, "Li < 200")
gulf_coast_data['Li_predicted'] = None  # Initialize column
gulf_coast_data.loc[X1_test.index, 'Li_predicted'] = y_pred_1
gulf_coast_data.to_csv('../../data/predicted_data/gulf_coast_data_Li_under_200.csv', index=True)

# Train, evaluate, and save for the second dataset (Li > 100)
y_pred_2, best_model_2, best_model_name_2 = train_evaluate_save(X2_train, y2_train, X2_test, y2_test, model_params, "Li > 100")
gulf_coast_data.loc[X2_test.index, 'Li_predicted'] = y_pred_2
gulf_coast_data.to_csv('../../data/predicted_data/gulf_coast_data_Li_over_100.csv', index=True)

print("Models trained and results saved.")


RandomForestRegressor Li < 200 - MSE: 493.439
GradientBoostingRegressor Li < 200 - MSE: 494.449


  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

MLPRegressor Li < 200 - MSE: 435.696
SVR Li < 200 - MSE: 515.857
XGBRegressor Li < 200 - MSE: 475.793
Performance for Li < 200 using MLPRegressor: MSE = 452.371, RMSE = 21.269, R2 = 0.657, Explained Variance = 0.657
Saved MLPRegressor as the best model for Li < 200 to best_model_Li_lt__200.joblib


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gulf_coast_data['Li_predicted'] = None  # Initialize column


RandomForestRegressor Li > 100 - MSE: 18699.684
GradientBoostingRegressor Li > 100 - MSE: 19248.775


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "c:\Users\darvarir\Anaconda3\lib\site-packages\sklearn\u

MLPRegressor Li > 100 - MSE: 23138.694
SVR Li > 100 - MSE: 31903.817
XGBRegressor Li > 100 - MSE: 24423.981
Performance for Li > 100 using RandomForestRegressor: MSE = 71682.005, RMSE = 267.735, R2 = -0.030, Explained Variance = -0.013
Saved RandomForestRegressor as the best model for Li > 100 to best_model_Li_gt__100.joblib
Models trained and results saved.
