## Import dependencies

In [16]:
import pandas as pd
import os
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

## Read in the data from the preprocessed csv

In [17]:
# Get the current directory of the notebook
current_dir = os.path.dirname(os.path.abspath(""))

# Navigate up two levels to reach the Project4 directory
project_dir = os.path.dirname(current_dir)

# Construct the absolute path to the CSV file
file_path = os.path.join(project_dir, "data/df_output/df_cleaned.csv")

# Load the CSV file
df_data = pd.read_csv(file_path)

# Show the df
df_data.head(2)

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,...,FeTot,K,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
0,63,Shale,Shale Gas,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,27.4,0.0,21.5,295.0,12000.0,105.0,736.5,0.0839,-10.5,Appalachian
1,64,Shale,Shale Gas,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,6.47,0.0,12.5,188.0,6920.0,116.0,215.0,0.0321,-3.0,Appalachian


In [18]:
df_data.columns

Index(['IDUSGS', 'PLAYTYPE', 'WELLTYPE', 'FORMSIMPLE', 'TDS', 'LATITUDE',
       'LONGITUDE', 'STATE', 'COUNTY', 'PROVINCE', 'REGION', 'WELLNAME', 'API',
       'DEPTHUPPER', 'DEPTHLOWER', 'PERIOD', 'DATESAMPLE', 'PH', 'B', 'Ba',
       'Br', 'HCO3', 'Ca', 'Cl', 'FeTot', 'K', 'Li', 'Mg', 'Na', 'SO4', 'Sr',
       'Zn', 'CHARGEBAL', 'BASIN_CATEGORY'],
      dtype='object')

In [19]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60658 entries, 0 to 60657
Data columns (total 34 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   IDUSGS          60658 non-null  int64  
 1   PLAYTYPE        60658 non-null  object 
 2   WELLTYPE        60658 non-null  object 
 3   FORMSIMPLE      60658 non-null  object 
 4   TDS             60658 non-null  float64
 5   LATITUDE        60658 non-null  float64
 6   LONGITUDE       60658 non-null  float64
 7   STATE           60658 non-null  object 
 8   COUNTY          60026 non-null  object 
 9   PROVINCE        60658 non-null  object 
 10  REGION          60658 non-null  object 
 11  WELLNAME        54693 non-null  object 
 12  API             34447 non-null  float64
 13  DEPTHUPPER      47354 non-null  float64
 14  DEPTHLOWER      40757 non-null  float64
 15  PERIOD          48402 non-null  object 
 16  DATESAMPLE      46382 non-null  object 
 17  PH              45174 non-null 

## Generate KNN regression models for each basin for consideration

### ROUND 0 | Running on full data ILO by basin first to establish some basic logic as a starting point

In [20]:
# Drop unnecessary columns
# columns_to_drop = ['IDUSGS', 'PERIOD', 'DATESAMPLE', 'API']
columns_to_drop = ['IDUSGS', 'PERIOD', 'DATESAMPLE', 'API', 'Ba', 'Zn', 'B', 'FeTot', 'DEPTHLOWER', 'Sr', 'Br', 'HCO3']
df = df_data.drop(columns_to_drop, axis=1)
df.head(2)

Unnamed: 0,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,REGION,...,PH,Ca,Cl,K,Li,Mg,Na,SO4,CHARGEBAL,BASIN_CATEGORY
0,Shale,Shale Gas,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,Eastern,...,,3140.0,31300.0,0.0,21.5,295.0,12000.0,105.0,-10.5,Appalachian
1,Shale,Shale Gas,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,Eastern,...,,1790.0,15300.0,0.0,12.5,188.0,6920.0,116.0,-3.0,Appalachian


In [21]:
df.dropna(subset=['Li'], inplace=True)
df.head(2)

Unnamed: 0,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,REGION,...,PH,Ca,Cl,K,Li,Mg,Na,SO4,CHARGEBAL,BASIN_CATEGORY
0,Shale,Shale Gas,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,Eastern,...,,3140.0,31300.0,0.0,21.5,295.0,12000.0,105.0,-10.5,Appalachian
1,Shale,Shale Gas,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,Eastern,...,,1790.0,15300.0,0.0,12.5,188.0,6920.0,116.0,-3.0,Appalachian


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4131 entries, 0 to 60657
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PLAYTYPE        4131 non-null   object 
 1   WELLTYPE        4131 non-null   object 
 2   FORMSIMPLE      4131 non-null   object 
 3   TDS             4131 non-null   float64
 4   LATITUDE        4131 non-null   float64
 5   LONGITUDE       4131 non-null   float64
 6   STATE           4131 non-null   object 
 7   COUNTY          4011 non-null   object 
 8   PROVINCE        4131 non-null   object 
 9   REGION          4131 non-null   object 
 10  WELLNAME        3270 non-null   object 
 11  DEPTHUPPER      3293 non-null   float64
 12  PH              3123 non-null   float64
 13  Ca              4118 non-null   float64
 14  Cl              4131 non-null   float64
 15  K               4131 non-null   float64
 16  Li              4131 non-null   float64
 17  Mg              4093 non-null   float

In [23]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2142 entries, 285 to 59982
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PLAYTYPE        2142 non-null   object 
 1   WELLTYPE        2142 non-null   object 
 2   FORMSIMPLE      2142 non-null   object 
 3   TDS             2142 non-null   float64
 4   LATITUDE        2142 non-null   float64
 5   LONGITUDE       2142 non-null   float64
 6   STATE           2142 non-null   object 
 7   COUNTY          2142 non-null   object 
 8   PROVINCE        2142 non-null   object 
 9   REGION          2142 non-null   object 
 10  WELLNAME        2142 non-null   object 
 11  DEPTHUPPER      2142 non-null   float64
 12  PH              2142 non-null   float64
 13  Ca              2142 non-null   float64
 14  Cl              2142 non-null   float64
 15  K               2142 non-null   float64
 16  Li              2142 non-null   float64
 17  Mg              2142 non-null   flo

In [24]:
# One-hot encode categorical variables
categorical_columns = ['PLAYTYPE', 'WELLTYPE', 'FORMSIMPLE', 'STATE', 'COUNTY', 'PROVINCE', 'REGION', 'WELLNAME', 'BASIN_CATEGORY']
df = pd.get_dummies(df, columns=categorical_columns, dtype=int)
df.head(2)

Unnamed: 0,TDS,LATITUDE,LONGITUDE,DEPTHUPPER,PH,Ca,Cl,K,Li,Mg,...,BASIN_CATEGORY_Fort Worth,BASIN_CATEGORY_Great Plains,BASIN_CATEGORY_Gulf Coast,BASIN_CATEGORY_Illinois,BASIN_CATEGORY_Michigan,BASIN_CATEGORY_Oklahoma Platform,BASIN_CATEGORY_Pacific,BASIN_CATEGORY_Permian,BASIN_CATEGORY_Rocky Mountain,BASIN_CATEGORY_Williston
285,25200.0,29.55707,-97.7893,2582.0,6.5,1900.0,14000.0,130.0,7.7,580.0,...,0,0,1,0,0,0,0,0,0,0
286,127000.0,29.17289,-97.54256,11933.0,6.1,13000.0,76000.0,1100.0,120.0,1100.0,...,0,0,1,0,0,0,0,0,0,0


In [25]:
# Splitting the data into features (X) and the target variable (y)
y = df['Li'].copy()                   # Target variable
X = df.drop(columns=['Li']).copy()    # Features

In [26]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Define a range of k values to try
k_values = list(range(1, 21))

# Create a parameter grid
param_grid = {'n_neighbors': k_values}

# Perform grid search with cross-validation
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best k value
best_k = grid_search.best_params_['n_neighbors']
print("Best k value:", best_k)

Best k value: 2


In [29]:
# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("MSE of the best KNN model:", mse)

MSE of the best KNN model: 14952.269930870916


In [30]:
# Calculate the variance of the target variable
target_variance = np.var(y_test)

# Print the variance and MSE
print("Variance of the target variable:", target_variance)
print("MSE of the model:", mse)

# Compare the MSE to the variance
if mse < target_variance:
    print("The model's MSE is smaller than the variance of the target variable.")
    print("The model is capturing a significant portion of the variability in the target variable.")
else:
    print("The model's MSE is close to or larger than the variance of the target variable.")
    print("The model's performance may be limited compared to the variability of the target variable.")


Variance of the target variable: 14142.737286612884
MSE of the model: 14952.269930870916
The model's MSE is close to or larger than the variance of the target variable.
The model's performance may be limited compared to the variability of the target variable.


### Establishing function for running models for each Basin given certain variable inputs

In [None]:
def generate_knn_models_by_basin(df, round, basins, columns_to_drop, categorical_columns,test_size, random_state k_values):
    
    for basin in basins:
        
        '''Finalize data preprocessing per input variables / requirements of KNN regression model'''

        #...

        '''--------------------------------------------------------------------------------------------------------------'''
        '''............'''

        #...

        '''--------------------------------------------------------------------------------------------------------------'''
        '''............'''

        #...
        
        '''--------------------------------------------------------------------------------------------------------------'''
        '''Train model'''

        #...

        '''--------------------------------------------------------------------------------------------------------------'''
        '''Test model'''

        #...

        '''--------------------------------------------------------------------------------------------------------------'''
        '''Add results to a dataframe'''

        columns = ['round', 'basin', 'test_train_data_row_count','feature_quantity', 'feature_list', 'target_variable_variance', 'model_mse', 'delta_mse_vs_variance']

        df_results = pd.DataFrame(
            #...
        ).sort_values(by=['delta_mse_vs_variance'], ascending=True, inplace=True)

    return df_results

### ROUND 1 | Let the iterating begin...

In [87]:
# Define all the variables...
df = df_data.copy()
round = '01'
basins = df_data['BASIN_CATEGORY'].unique()
columns_to_drop = ['IDUSGS', 'PERIOD', 'DATESAMPLE', 'API', 'Ba', 'Zn', 'B', 'FeTot', 'DEPTHLOWER', 'Sr', 'Br', 'HCO3']
categorical_columns = ['PLAYTYPE', 'WELLTYPE', 'FORMSIMPLE', 'STATE', 'COUNTY', 'PROVINCE', 'REGION', 'WELLNAME', 'BASIN_CATEGORY']
test_size = 0.3
random_state = 42
k_values = list(range(1, 21))

# Run the function...
df_round01 = generate_knn_models_by_basin(df=df, round=round, basins=basins, columns_to_drop=columns_to_drop, categorical_columns=categorical_columns, 
                                          test_size=test_size, random_state=random_state, k_values=k_values)
df_round01

array(['Appalachian', 'Permian', 'Oklahoma Platform', 'Gulf Coast',
       'Williston', 'Michigan', 'Pacific', 'Illinois', 'Great Plains',
       'Anadarko', 'Rocky Mountain', 'Fort Worth'], dtype=object)

### Merge all results dfs into 1 large df for evaluation

In [None]:
rounds = [df_round01, '''...''']
columns = ['round', 'basin', 'test_train_data_row_count','feature_quantity', 'feature_list', 'target_variable_variance', 'model_mse', 'delta_mse_vs_variance']

for round in rounds:
    df_results = #...
    

In [None]:
# Identify best round # for each basin
#...