# Import Dependencies

In [3]:
import pandas as pd
import numpy as np
import sklearn
import requests
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
import scipy.stats as st
from sklearn.linear_model import LinearRegression
import seaborn as sns
import hvplot.pandas
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from io import BytesIO
import os

# Pull Excel file from the URL and write it into a DataFrame

In [4]:
# URL for the Excel file
file_url = 'https://www.sciencebase.gov/catalog/file/get/64fa1e71d34ed30c2054ea11?f=__disk__e7%2Fef%2F17%2Fe7ef17fcb71c49e2241da4139ed775f8e328bdab'

try:
    # Download the file with a timeout of 10 seconds
    response = requests.get(file_url, timeout=10)
    
    # Raise an error if the status code is not 200 (OK)
    response.raise_for_status()

    # Read Excel file into a DataFrame
    with BytesIO(response.content) as bio:
        df = pd.read_excel(bio)

    # Display the DataFrame
    print(df.head())

except requests.exceptions.RequestException as e:
    print("Error downloading the file:", e)

   IDUSGS  IDORIG PLAYTYPE   WELLTYPE        BASIN FORMSIMPLE      TDS  \
0       1   WE-B3    Shale  Shale Gas  Appalachian  Marcellus  96570.0   
1       2   WE-B5    Shale  Shale Gas  Appalachian  Marcellus  96570.0   
2       3   WE-B7    Shale  Shale Gas  Appalachian  Marcellus  96570.0   
3       4   WE-B9    Shale  Shale Gas  Appalachian  Marcellus  96570.0   
4       5  WE-B13    Shale  Shale Gas  Appalachian  Marcellus  96570.0   

   LATITUDE  LONGITUDE  FIPCODE  ... I129 Rn222 Ra226  Ra228 MICROBES  \
0   40.2354   -79.4704  42129.0  ...  NaN   NaN   NaN    NaN      NaN   
1   40.2354   -79.4704  42129.0  ...  NaN   NaN   NaN    NaN      NaN   
2   40.2354   -79.4704  42129.0  ...  NaN   NaN   NaN    NaN      NaN   
3   40.2354   -79.4704  42129.0  ...  NaN   NaN   NaN    NaN      NaN   
4   40.2354   -79.4704  42129.0  ...  NaN   NaN   NaN    NaN      NaN   

   CHARGEBAL                        REMARKS           IDDB SOURCE  \
0      100.0  Sr from Chapman et al. (2012)  PH

In [5]:
# Display the DataFrame
df.head()

Unnamed: 0,IDUSGS,IDORIG,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,FIPCODE,...,I129,Rn222,Ra226,Ra228,MICROBES,CHARGEBAL,REMARKS,IDDB,SOURCE,REFERENCE
0,1,WE-B3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,42129.0,...,,,,,,100.0,Sr from Chapman et al. (2012),PHANMARCELLUS,,"(Phan and others, 2016)"
1,2,WE-B5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,42129.0,...,,,,,,100.0,Sr from Chapman et al. (2012),PHANMARCELLUS,,"(Phan and others, 2016)"
2,3,WE-B7,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,42129.0,...,,,,,,100.0,Sr from Chapman et al. (2012),PHANMARCELLUS,,"(Phan and others, 2016)"
3,4,WE-B9,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,42129.0,...,,,,,,100.0,Sr from Chapman et al. (2012),PHANMARCELLUS,,"(Phan and others, 2016)"
4,5,WE-B13,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,42129.0,...,,,,,,100.0,Sr from Chapman et al. (2012),PHANMARCELLUS,,"(Phan and others, 2016)"


# General Cleaning

## Adjust Columns

In [6]:
# List of columns to be removed
columns_to_remove = [
    "IDORIG","FIPCODE","COORDAPX","COORDNEW","STATEFIP","COUNTYFIP","COUNTYORIG","FIELD","FIELDCODE","TOWNRANGE",
    "LOC","OPERATOR","WELLCODE","PERMIT","DATECOMP","ELEVATION","NWIS","GROUP","MEMBER","ERA",
    "EPOCH","LITHOLOGY","POROSITY","TIMESERIES","DAY","DATEANALYS","METHOD","LAB","TEMP_R","PRESSURE","SG",
    "SPGRAV","SPGRAVT","RESIS","RESIST","PHT","EHORP","COND","CONDT","TEMP","TURBIDITY","HEM","MBAS","UNITSORIG",
    "TDSLAB","TDSCALC","TDSDESC","TSS","Ag", "Al", "As", "Au","BO3", "Be", "Bi","Cd", "Co","CO3", "Cr", "Cs", "Cu", "F", 
    "FeIII","FeII","FeS","FeAl","FeAl2O3","Ga","Ge","Hf","Hg","I","Mn","Mo", "N", "NO2", "NO3", "NO3NO2", "NH4", 
    "TKN", "Ni", "OH", "P", "PO4", "Pb", "Rh", "Rb", "S", "SO3", "HS", "Sb","Sc","Se","Si","Sn","Th","Ti","Tl","U","V","W",
    "Y","Zr","La","Ce","Pr","Nd","Sm","Eu","Gd","Tb","Dy","Ho","Er","Tm","Yb","Lu","ACIDITY","ALKALINITY","DIC","DOC","TOC","CYANIDE","BOD",
    "COD","BENZENE","ETHYLBENZ","NAPHTH","PERC","TOLUENE","XYLENE","PHENOLS","ACETATE","BUTYRATE","FORMATE","LACTATE",
    "PROPIONATE","PYRUVATE","VALERATE","ORGACIDS","Ar","CH4","C2H6","CO2","H2","H2S","He","N2","NH3","O2","ALPHA","BETA",
    "dD","H3","d7Li","d11B","d13C","C14","d18O","d34S","d37Cl","K40","d81Br","Sr87Sr86","I129","Rn222","Ra226","Ra228",
    "MICROBES","REMARKS","IDDB","SOURCE","REFERENCE"]

# Remove the specified columns
df_limited_column = df.drop(columns=columns_to_remove, errors='ignore')

# Export the DataFrame as a CSV
df_limited_column.to_csv('../../data/df_output/df_limited_column.csv', index=False)

# Display the DataFrame
df_limited_column.head()

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,FeTot,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL
0,1,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,21.6,,9900.0,,,,100.0
1,2,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,37.0,,16700.0,,,,100.0
2,3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,45.4,,18300.0,,,,100.0
3,4,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,52.0,,18500.0,,,,100.0
4,5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,62.1,,22400.0,,,,100.0


### Remove Coal, Injection, and Geothermal Play type
### Remove Injection Well Type

In [7]:
df_limited_column=df_limited_column[~df_limited_column['PLAYTYPE'].isin(['Coal', 'Injection','Geothermal'])]
df_limited_column=df_limited_column[~df_limited_column['WELLTYPE'].isin(['Injection'])]

# Save the DataFrame to a CSV
df_limited_column.to_csv('../../data/df_output/df_limited_column_playtype_welltype.csv',index=False)

# Display the updated merged dataframe
df_limited_column.head()

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,FeTot,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL
0,1,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,21.6,,9900.0,,,,100.0
1,2,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,37.0,,16700.0,,,,100.0
2,3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,45.4,,18300.0,,,,100.0
3,4,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,52.0,,18500.0,,,,100.0
4,5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,,62.1,,22400.0,,,,100.0


### Categorize Basins

In [8]:
# Add column "BASIN_CATEGORY" to df_filtered to aid in later filtering

# Define the basin categories
basin_categories = {
    'Anadarko': ['Amarillo Arch', 'Anadarko', 'Anadarko - Chautauqua Platform', 'Anadarko - Southern Oklahoma'],
    'Appalachian': ['Appalachian', 'Black Warrior'],
    'Great Plains':['Wasatch Plateau','Hannah','Wyoming Thrust Belt','Laramie','North Park','Las Animas Arch','Snake River','Sweetgrass Arch','Central Montana Uplift','Great Basin','Raton'],
    'Gulf Coast': ['Arkla', 'Gulf Coast'],
    'Oklahoma Platform': ['Arkoma', 'Arkoma - Chautauqua Platform', 'Central Kansas Uplift', 
                                                'Chautauqua Platform', 'Cherokee', 'Kansas Basins', 'Nemaha Uplift', 
                                                'Sedgwick', 'Southern Oklahoma'],
    'Fort Worth': ['Bend Arch', 'Fort Worth'],
    'Rocky Mountain': ['Big Horn', 'Black Mesa', 'Denver', 'Green River', 'Paradox', 'Piceance', 
                              'Powder River', 'San Juan', 'Uinta', 'Wind River'],
    'Illinois': ['Illinois'],
    'Michigan': ['Michigan'],
    'Permian': ['Palo Duro', 'Permian'],
    'Pacific':['San Joaquin','Sacramento','Los Angeles','Ventura','Cuyama','Santa Maria'],
    'Williston': ['Williston']
}

# Create a reverse mapping from specific basin to category
basin_to_category = {specific: category for category, specifics in basin_categories.items() for specific in specifics}

df_basins_categorized = df_limited_column.copy()

# Map the 'BASIN' column to a new 'BASIN_CATEGORY' column
df_basins_categorized['BASIN_CATEGORY'] = df_basins_categorized['BASIN'].map(basin_to_category)

# Drop rows that don't have a basin category (i.e., not in your list)
df_basins_categorized = df_basins_categorized.dropna(subset=['BASIN_CATEGORY'])

# Save the filtered dataframe to a new CSV file
df_basins_categorized.to_csv('../../data/df_output/df_basins_categorized.csv', index=False)

# Display the df
df_basins_categorized.head()

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
0,1,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,21.6,,9900.0,,,,100.0,Appalachian
1,2,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,37.0,,16700.0,,,,100.0,Appalachian
2,3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,45.4,,18300.0,,,,100.0,Appalachian
3,4,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,52.0,,18500.0,,,,100.0,Appalachian
4,5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,62.1,,22400.0,,,,100.0,Appalachian


## Technical Cleaning

In [9]:
# Remove rows where TDSUSGS <= 10,000 (to eliminate all produced water with TDS less than brackish water and also the failing analyses)
df_filtered = df_basins_categorized[df_basins_categorized['TDS'] > 10000]

# Save the filtered dataframe to a new CSV file
df_filtered.to_csv('../../data/df_output/df_filtered_TDS.csv', index=False)

# Display the DataFrame
df_filtered.head()

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
0,1,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,21.6,,9900.0,,,,100.0,Appalachian
1,2,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,37.0,,16700.0,,,,100.0,Appalachian
2,3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,45.4,,18300.0,,,,100.0,Appalachian
3,4,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,52.0,,18500.0,,,,100.0,Appalachian
4,5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,,,62.1,,22400.0,,,,100.0,Appalachian


In [10]:
# Fill NaN values in 'KNa', 'K', 'Na' with zeros for calculation
for col in ['KNa', 'K', 'Na']:
    df_filtered[col].fillna(0, inplace=True)

# First, we'll fill NaN values in 'KNa' and 'K' with zeros for the calculation.
df_filtered['KNa'].fillna(0, inplace=True)
df_filtered['K'].fillna(0, inplace=True)

# Apply conditions to calculate 'Na'
# If 'Na' is missing and both 'KNa' and 'K' are present, populate 'Na' with 'KNa' - 'K'
# If 'Na' is missing and 'KNa' is present but 'K' is not, populate 'Na' with 'KNa'

na_mask = df_filtered['Na'].isna()
na_present = df_filtered['Na'] > 0
kna_present = df_filtered['KNa'] > 0
k_present = df_filtered['K'] > 0
k_missing = df_filtered['K'] == 0

df_filtered.loc[k_missing & na_present & kna_present, 'K'] = df_filtered['KNa'] - df_filtered['Na']
df_filtered.loc[na_mask & kna_present & k_present, 'Na'] = df_filtered['KNa'] - df_filtered['K']
df_filtered.loc[na_mask & kna_present & ~k_present, 'Na'] = df_filtered['KNa']

# Remove rows where 'Na' is still missing
df_filtered.dropna(subset=['Na'], inplace=True)

# Save the updated dataframe 
df_filtered.to_csv('../../data/df_output/df_filtered_Na.csv', index=False)

# Display the DataFrame
df_filtered.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered[col].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to 

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
0,1,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,0.0,0.0,21.6,,9900.0,,,,100.0,Appalachian
1,2,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,0.0,0.0,37.0,,16700.0,,,,100.0,Appalachian
2,3,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,0.0,0.0,45.4,,18300.0,,,,100.0,Appalachian
3,4,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,0.0,0.0,52.0,,18500.0,,,,100.0,Appalachian
4,5,Shale,Shale Gas,Appalachian,Marcellus,96570.0,40.2354,-79.4704,Pennsylvania,Westmoreland,...,0.0,0.0,62.1,,22400.0,,,,100.0,Appalachian


In [11]:
# Filter out rows where USGS charge balance is not between -10 and +10
df_filtered = df_filtered[df_filtered['CHARGEBAL'].between(-15, 15)]

# Save the updated DataFrame to a CSV
df_filtered.to_csv('../../data/df_output/df_filtered_chargebalance.csv', index=False)

# Display the DataFrame 
df_filtered

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,BASIN,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,...,K,KNa,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
62,63,Shale,Shale Gas,Appalachian,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,...,0.0000,0.0,21.500,295.000,12000.0000,105.0,736.5,0.0839,-10.5,Appalachian
63,64,Shale,Shale Gas,Appalachian,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,...,0.0000,0.0,12.500,188.000,6920.0000,116.0,215.0,0.0321,-3.0,Appalachian
64,65,Shale,Shale Gas,Appalachian,Marcellus,41700.0,40.441658,-79.986932,Pennsylvania,Allegheny,...,0.0000,0.0,19.800,278.000,12700.0000,69.3,67.0,0.0657,-4.8,Appalachian
66,67,Shale,Shale Gas,Appalachian,Marcellus,38200.0,41.270892,-76.659691,Pennsylvania,Lycoming,...,53.4000,0.0,,164.000,11100.0000,,417.0,0.0570,-1.4,Appalachian
67,68,Shale,Shale Gas,Appalachian,Marcellus,82600.0,41.270892,-76.659691,Pennsylvania,Lycoming,...,59.6000,0.0,,367.000,19800.0000,,1.0,0.0100,-13.8,Appalachian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113127,115711,Shale,Tight Oil,Williston,Three Forks,328018.0,47.740170,-103.395280,North Dakota,McKenzie,...,8210.6430,0.0,76.351,1385.385,91108.1811,,,,-2.1,Williston
113128,115712,Shale,Tight Oil,Williston,Bakken,283801.0,47.740170,-103.395280,North Dakota,McKenzie,...,7350.4804,0.0,62.469,1263.860,77498.2787,,,,-3.1,Williston
113129,115713,Shale,Tight Oil,Williston,Three Forks,311627.0,47.740170,-103.395280,North Dakota,McKenzie,...,8718.9209,0.0,76.351,1288.165,84992.9209,,,,-2.3,Williston
113130,115714,Shale,Tight Oil,Williston,Three Forks,324452.0,47.740170,-103.395280,North Dakota,McKenzie,...,8914.4124,0.0,76.351,1312.470,88809.2111,,,,-2.4,Williston


### Finalize DataFrame by removing columns not needed for analysis

In [12]:
columns_to_drop = ['BASIN','KNa','FORMATION']
df_cleaned = df_filtered.drop(columns=[col for col in columns_to_drop if col in df_filtered.columns])

# Save the updated dataframe
df_cleaned.to_csv('../../data/df_output/df_cleaned.csv', index=False) 

# Display the DataFrame
df_cleaned.head()

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,...,FeTot,K,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
62,63,Shale,Shale Gas,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,27.4,0.0,21.5,295.0,12000.0,105.0,736.5,0.0839,-10.5,Appalachian
63,64,Shale,Shale Gas,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,6.47,0.0,12.5,188.0,6920.0,116.0,215.0,0.0321,-3.0,Appalachian
64,65,Shale,Shale Gas,Marcellus,41700.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,43.5,0.0,19.8,278.0,12700.0,69.3,67.0,0.0657,-4.8,Appalachian
66,67,Shale,Shale Gas,Marcellus,38200.0,41.270892,-76.659691,Pennsylvania,Lycoming,Appalachian Basin,...,15.7,53.4,,164.0,11100.0,,417.0,0.057,-1.4,Appalachian
67,68,Shale,Shale Gas,Marcellus,82600.0,41.270892,-76.659691,Pennsylvania,Lycoming,Appalachian Basin,...,35.2,59.6,,367.0,19800.0,,1.0,0.01,-13.8,Appalachian


# PCA

In [13]:
# Create a directory for saving PCA results if it doesn't exist
output_dir = "../../data/pca_data"
os.makedirs(output_dir, exist_ok=True)

# Define columns to exclude from the PCA
exclude_columns = ['IDUSGS', 'PROVINCE', 'WELLNAME', 'CHARGEBAL', 'DEPTHLOWER', 'Distance_Cluster', 'API', 'DATESAMPLE']

# Loop through each basin
basins = df_cleaned['BASIN_CATEGORY'].unique()
pca_with_target_dfs = {}  # Dictionary to store DataFrames
for basin in basins:
    print(f"Processing PCA for {basin}")
    df_basin = df_cleaned[df_cleaned['BASIN_CATEGORY'] == basin]

    # Select numerical columns not in exclude_columns
    numerical_columns = [col for col in df_basin.select_dtypes(include=['float64', 'int64']).columns if col not in exclude_columns]
    df_numeric = df_basin[numerical_columns]

    # Impute missing values and standardize the data
    imputer = IterativeImputer(RandomForestRegressor(n_estimators=10), max_iter=10, random_state=42)
    df_imputed = imputer.fit_transform(df_numeric)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_imputed)

    # Perform PCA on the scaled data
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(scaled_data)

    # Prepare PCA DataFrame
    pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3'])
    pca_df['Li'] = df_basin['Li'].values 

    # Save PCA results with target variable as CSV files
    pca_filename = f"{output_dir}/{basin}_pca_with_target.csv"
    pca_df.to_csv(pca_filename, index=False)
    print(f"Saved PCA results with target for {basin} to {pca_filename}")

    # Save PCA results with target variable into a dictionary
    pca_with_target_df = pca_df.copy()  # Create a copy of pca_df
    pca_with_target_dfs[f"{basin}_pca_with_target"] = pca_with_target_df

Processing PCA for Appalachian




Saved PCA results with target for Appalachian to ../../data/pca_data/Appalachian_pca_with_target.csv
Processing PCA for Permian




Saved PCA results with target for Permian to ../../data/pca_data/Permian_pca_with_target.csv
Processing PCA for Oklahoma Platform




Saved PCA results with target for Oklahoma Platform to ../../data/pca_data/Oklahoma Platform_pca_with_target.csv
Processing PCA for Gulf Coast




Saved PCA results with target for Gulf Coast to ../../data/pca_data/Gulf Coast_pca_with_target.csv
Processing PCA for Williston




Saved PCA results with target for Williston to ../../data/pca_data/Williston_pca_with_target.csv
Processing PCA for Michigan




Saved PCA results with target for Michigan to ../../data/pca_data/Michigan_pca_with_target.csv
Processing PCA for Pacific




Saved PCA results with target for Pacific to ../../data/pca_data/Pacific_pca_with_target.csv
Processing PCA for Illinois




Saved PCA results with target for Illinois to ../../data/pca_data/Illinois_pca_with_target.csv
Processing PCA for Great Plains




Saved PCA results with target for Great Plains to ../../data/pca_data/Great Plains_pca_with_target.csv
Processing PCA for Anadarko




Saved PCA results with target for Anadarko to ../../data/pca_data/Anadarko_pca_with_target.csv
Processing PCA for Rocky Mountain




Saved PCA results with target for Rocky Mountain to ../../data/pca_data/Rocky Mountain_pca_with_target.csv
Processing PCA for Fort Worth
Saved PCA results with target for Fort Worth to ../../data/pca_data/Fort Worth_pca_with_target.csv




## Create DataFrames for each Basin's PCA data

In [33]:
# Display key names in the dictionary
key_names = list(pca_with_target_dfs.keys())
print(key_names)

['Appalachian_pca_with_target', 'Permian_pca_with_target', 'Oklahoma Platform_pca_with_target', 'Gulf Coast_pca_with_target', 'Williston_pca_with_target', 'Michigan_pca_with_target', 'Pacific_pca_with_target', 'Illinois_pca_with_target', 'Great Plains_pca_with_target', 'Anadarko_pca_with_target', 'Rocky Mountain_pca_with_target', 'Fort Worth_pca_with_target']


### Anadarko Basin

In [22]:
# Create a DataFrame for the Anadarko Basin PCA data
df_anadarko_pca = pd.DataFrame(pca_with_target_dfs['Anadarko_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_anadarko_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-3.031073,-0.919242,0.857407,
1,-2.784233,-1.01985,1.111889,
2,-3.024201,-1.183467,1.119996,
3,-2.607732,-1.085549,0.850049,
4,-2.927332,-1.110891,0.961687,


### Appalachian Basin

In [21]:
# Create a DataFrame for the Appalachian Basin PCA data
df_appalachian_pca = pd.DataFrame(pca_with_target_dfs['Appalachian_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_appalachian_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-3.130888,0.997418,-1.223275,21.5
1,-3.704683,0.72371,-1.31607,12.5
2,-3.298817,0.790451,-1.211501,19.8
3,-3.302563,2.065821,-1.488175,
4,-2.471762,2.316622,-1.582861,


### Fort Worth Basin

In [23]:
# Create a DataFrame for the Fort Worth Basin PCA data
df_fort_worth_pca = pd.DataFrame(pca_with_target_dfs['Fort Worth_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_fort_worth_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-3.494042,1.603082,-1.430565,
1,-2.971542,1.597787,-1.357049,
2,-3.65106,1.985281,-1.297116,
3,2.620299,-0.815989,0.43869,
4,3.188287,1.373141,2.149963,


### Great Plains Basin

In [24]:
# Create a DataFrame for the Great Plains Basin PCA data
df_great_plains_pca = pd.DataFrame(pca_with_target_dfs['Great Plains_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_great_plains_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-2.68124,1.932289,-0.273222,0.61
1,-2.623732,2.096298,-0.145451,0.69
2,-2.730942,1.777417,-0.53575,0.77
3,-2.674913,2.180635,-0.280931,0.7
4,-2.377429,1.5495,-0.857681,1.4


### Gulf Coast Basin

In [25]:
# Create a DataFrame for the Gulf Coast Basin PCA data
df_gulf_coast_pca = pd.DataFrame(pca_with_target_dfs['Gulf Coast_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_gulf_coast_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,2.683493,-1.277373,1.874346,124.0
1,1.332771,2.135511,2.38279,77.3
2,1.226876,0.457951,4.198607,
3,0.246275,-0.579993,1.767838,83.9
4,-0.493563,-0.537014,0.82143,


### Illinois Basin

In [26]:
# Create a DataFrame for the Illinois Basin PCA data
df_illinois_pca = pd.DataFrame(pca_with_target_dfs['Illinois_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_illinois_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-2.331741,-2.043458,-0.98294,
1,-3.903004,-0.893277,-0.244022,
2,-1.797172,-1.271039,-0.859839,
3,1.847093,-1.279558,0.592363,
4,2.444752,-1.363185,1.905534,


### Michigan Basin

In [27]:
# Create a DataFrame for the Michigan Basin PCA data
df_michigan_pca = pd.DataFrame(pca_with_target_dfs['Michigan_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_michigan_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,1.517517,-0.286431,0.075222,
1,1.785966,0.047047,0.788821,
2,0.824402,-0.892443,-0.305805,
3,-0.375777,-0.501321,-0.74333,
4,1.231979,-1.046587,-0.104611,


### Oklahoma Basin

In [28]:
# Create a DataFrame for the Oklahoma Basin PCA data
df_oklahoma_pca = pd.DataFrame(pca_with_target_dfs['Oklahoma Platform_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_oklahoma_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-3.117648,-3.706517,1.272853,4.731
1,-3.563654,-3.886214,1.332343,3.631
2,-0.056279,0.072014,-0.50919,8.0
3,1.550137,1.603319,1.616064,38.0
4,0.140688,0.570963,-0.422121,11.5


### Pacific Basin

In [29]:
# Create a DataFrame for the Pacific Basin PCA data
df_pacific_pca = pd.DataFrame(pca_with_target_dfs['Pacific_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_pacific_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-3.092727,-0.885066,-0.398682,0.7
1,-1.6866,-0.728733,0.940444,3.0
2,-2.728433,0.39063,-0.462391,1.9
3,-3.453219,0.246508,-1.389128,1.95
4,-2.415525,-1.073596,0.297915,1.3


### Permian Basin

In [30]:
# Create a DataFrame for the Permian Basin PCA data
df_permian_pca = pd.DataFrame(pca_with_target_dfs['Permian_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_permian_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-1.740555,-1.079363,0.168372,
1,0.487695,-0.998863,2.919911,
2,2.396892,-0.644012,3.193567,
3,-1.637282,-1.108659,0.043443,
4,-1.101983,-1.239238,1.963376,


### Rocky Mountain Basin

In [31]:
# Create a DataFrame for the Rocky Mountain Basin PCA data
df_rocky_mountain_pca = pd.DataFrame(pca_with_target_dfs['Rocky Mountain_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_rocky_mountain_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-0.159241,-0.245073,-0.496266,
1,1.155189,2.645887,1.051576,
2,1.255299,0.187423,-1.131378,
3,1.410585,0.601636,-1.125373,
4,2.105354,2.183059,0.209839,


### Williston Basin

In [32]:
# Create a DataFrame for the Williston Basin PCA data
df_williston_pca = pd.DataFrame(pca_with_target_dfs['Williston_pca_with_target'], columns=['PC1', 'PC2', 'PC3', 'Li'])

# Display the DataFrame
df_williston_pca.head()

Unnamed: 0,PC1,PC2,PC3,Li
0,-4.008167,0.828639,-0.589294,18.9
1,-3.810774,0.093926,-0.568553,4.66
2,-4.162017,0.571612,-0.875441,4.42
3,-1.142,-1.129218,-0.691561,17.0
4,-2.464991,-0.459396,-0.482297,15.6
