In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import os


In [24]:
# Directory where PCA results are saved
pca_dir = "../../data/pca_data"

# output directories for plots and predictions exist
plot_dir = "../../images/gradient_boosting_plots"
predicted_data_dir = "../../data/predicted_data"
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(predicted_data_dir, exist_ok=True)

# Get a list of all PCA result files
pca_files = [f for f in os.listdir(pca_dir) if f.endswith('_pca_with_target.csv')]

# Dictionary to store results for each basin
results = {}

# Process each PCA result file
for file in pca_files:
    basin_name = file.replace('_pca_with_target.csv', '')  
    pca_df = pd.read_csv(f"{pca_dir}/{file}")

    # Separate data with known and unknown 'Li' values
    known_data = pca_df.dropna(subset=['Li'])
    unknown_data = pca_df[pca_df['Li'].isnull()]

    # Prepare training and testing data from known data
    X_known = known_data[['PC1', 'PC2', 'PC3']]
    y_known = known_data['Li']
    X_train, X_test, y_train, y_test = train_test_split(X_known, y_known, test_size=0.2, random_state=42)

    # Initialize and train Gradient Boosting Regressor
    gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbr.fit(X_train, y_train)

    # Predict and evaluate on known test data
    y_pred = gbr.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    explained_variance = explained_variance_score(y_test, y_pred)

    # Store results for known data evaluation
    results[basin_name] = {'MSE': mse, 'R2': r2, 'MAE': mae, 'RMSE': rmse, 'Explained Variance': explained_variance}
    print(f"Basin: {basin_name} - MSE: {mse}, R2: {r2}, MAE: {mae}, RMSE: {rmse}, Explained Variance: {explained_variance}")

    # Plot actual vs predicted and save the plot
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
    plt.xlabel('Actual Li concentrations')
    plt.ylabel('Predicted Li concentrations')
    plt.title(f'Actual vs. Predicted for {basin_name}')
    plot_path = os.path.join(plot_dir, f"{basin_name}_actual_vs_predicted.png")
    plt.savefig(plot_path)
    plt.close()

    # Predict unknown 'Li' values if there are any
    if not unknown_data.empty:
        X_unknown = unknown_data[['PC1', 'PC2', 'PC3']]
        y_unknown_pred = gbr.predict(X_unknown)
        unknown_data['Predicted_Li'] = y_unknown_pred

        # Save predicted data for unknown values
        predicted_data_path = os.path.join(predicted_data_dir, f"{basin_name}_predicted.csv")
        unknown_data.to_csv(predicted_data_path, index=False)

        # Plot predictions for unknown data and save the plot
        plt.figure(figsize=(10, 6))
        plt.hist(y_unknown_pred, bins=30, alpha=0.75)
        plt.xlabel('Predicted Lithium Concentration')
        plt.ylabel('Frequency')
        plt.title(f'Distribution of Predicted Lithium Concentrations for New Samples in {basin_name}')
        plot_hist_path = os.path.join(plot_dir, f"{basin_name}_predicted_distribution.png")
        plt.savefig(plot_hist_path)
        plt.close()

# Convert results to a DataFrame for better visualization or further analysis
results_df = pd.DataFrame(results).T
print(results_df)

Basin: Anadarko - MSE: 1309.9538903044497, R2: 0.8684136406905676, MAE: 11.248022127519866, RMSE: 36.19328515490753, Explained Variance: 0.8684251960665268


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Appalachian - MSE: 2354.656045247471, R2: 0.35552918999390193, MAE: 28.666017077120767, RMSE: 48.524798250456136, Explained Variance: 0.3979191774918227


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Fort Worth - MSE: 554.614619373972, R2: 0.5474510250133635, MAE: 14.012876361695188, RMSE: 23.5502573101436, Explained Variance: 0.6198417930021616


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Great Plains - MSE: 40677.970868721124, R2: -0.09035102540826134, MAE: 58.26975982781972, RMSE: 201.68780545367915, Explained Variance: -0.0024855926081646107


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Gulf Coast - MSE: 848.4971364549327, R2: 0.849991513834392, MAE: 16.556366763816918, RMSE: 29.12897417443554, Explained Variance: 0.850755859192776


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Illinois - MSE: 4.865393634864008, R2: 0.9745533784413706, MAE: 1.769394736003657, RMSE: 2.20576373051694, Explained Variance: 0.9746497037410601


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Michigan - MSE: 108.1643574624111, R2: 0.7294033583611144, MAE: 7.115011225390476, RMSE: 10.400209491275216, Explained Variance: 0.7295658494683772


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Oklahoma Platform - MSE: 59.58170086012559, R2: 0.8103263498715882, MAE: 6.425042122393865, RMSE: 7.718918373718276, Explained Variance: 0.8108629230184812


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Pacific - MSE: 0.8560837999990436, R2: 0.9173286518580159, MAE: 0.6181349160417617, RMSE: 0.9252479667629881, Explained Variance: 0.9174416853529569


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Permian - MSE: 2273.0706274329623, R2: 0.5530002591334026, MAE: 12.479729764348848, RMSE: 47.67673046081246, Explained Variance: 0.560220710682347


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Rocky Mountain - MSE: 59.1781272204214, R2: 0.724132696846849, MAE: 3.8300650029023933, RMSE: 7.692732103773106, Explained Variance: 0.724799283167036


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


Basin: Williston - MSE: 173.07186889810745, R2: 0.9249108807464392, MAE: 9.403714781806439, RMSE: 13.15567819985376, Explained Variance: 0.9276588754374996


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_data['Predicted_Li'] = y_unknown_pred


                            MSE        R2        MAE        RMSE  \
Anadarko            1309.953890  0.868414  11.248022   36.193285   
Appalachian         2354.656045  0.355529  28.666017   48.524798   
Fort Worth           554.614619  0.547451  14.012876   23.550257   
Great Plains       40677.970869 -0.090351  58.269760  201.687805   
Gulf Coast           848.497136  0.849992  16.556367   29.128974   
Illinois               4.865394  0.974553   1.769395    2.205764   
Michigan             108.164357  0.729403   7.115011   10.400209   
Oklahoma Platform     59.581701  0.810326   6.425042    7.718918   
Pacific                0.856084  0.917329   0.618135    0.925248   
Permian             2273.070627  0.553000  12.479730   47.676730   
Rocky Mountain        59.178127  0.724133   3.830065    7.692732   
Williston            173.071869  0.924911   9.403715   13.155678   

                   Explained Variance  
Anadarko                     0.868425  
Appalachian                  0.3979