In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Show TensorFlow version
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.16.1


In [3]:
# Connect notebook to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read CSV file from google drive into a DataFrame and display it
df=pd.read_csv("/content/drive/MyDrive/Project_4/df_cleaned.csv")
df

Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,...,FeTot,K,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
0,63,Shale,Shale Gas,Marcellus,54800.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,27.40,0.0000,21.500,295.000,12000.0000,105.0,736.5,0.0839,-10.5,Appalachian
1,64,Shale,Shale Gas,Marcellus,26100.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,6.47,0.0000,12.500,188.000,6920.0000,116.0,215.0,0.0321,-3.0,Appalachian
2,65,Shale,Shale Gas,Marcellus,41700.0,40.441658,-79.986932,Pennsylvania,Allegheny,Appalachian Basin,...,43.50,0.0000,19.800,278.000,12700.0000,69.3,67.0,0.0657,-4.8,Appalachian
3,67,Shale,Shale Gas,Marcellus,38200.0,41.270892,-76.659691,Pennsylvania,Lycoming,Appalachian Basin,...,15.70,53.4000,,164.000,11100.0000,,417.0,0.0570,-1.4,Appalachian
4,68,Shale,Shale Gas,Marcellus,82600.0,41.270892,-76.659691,Pennsylvania,Lycoming,Appalachian Basin,...,35.20,59.6000,,367.000,19800.0000,,1.0,0.0100,-13.8,Appalachian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60653,115711,Shale,Tight Oil,Three Forks,328018.0,47.740170,-103.395280,North Dakota,McKenzie,Williston Basin,...,,8210.6430,76.351,1385.385,91108.1811,,,,-2.1,Williston
60654,115712,Shale,Tight Oil,Bakken,283801.0,47.740170,-103.395280,North Dakota,McKenzie,Williston Basin,...,,7350.4804,62.469,1263.860,77498.2787,,,,-3.1,Williston
60655,115713,Shale,Tight Oil,Three Forks,311627.0,47.740170,-103.395280,North Dakota,McKenzie,Williston Basin,...,,8718.9209,76.351,1288.165,84992.9209,,,,-2.3,Williston
60656,115714,Shale,Tight Oil,Three Forks,324452.0,47.740170,-103.395280,North Dakota,McKenzie,Williston Basin,...,,8914.4124,76.351,1312.470,88809.2111,,,,-2.4,Williston


In [5]:
# Give me a list of every unique value in the Basin_category column

unique_values = df['BASIN_CATEGORY'].unique().tolist()
print(unique_values)


['Appalachian', 'Permian', 'Oklahoma Platform', 'Gulf Coast', 'Williston', 'Michigan', 'Pacific', 'Illinois', 'Great Plains', 'Anadarko', 'Rocky Mountain', 'Fort Worth']


In [6]:
# Create a new dataframe named df_gulf that has only rows from df that have the value 'Gulf Coast' in BASIN_CATEGORY

df_gulf = df[df["BASIN_CATEGORY"] == "Gulf Coast"]
df_gulf


Unnamed: 0,IDUSGS,PLAYTYPE,WELLTYPE,FORMSIMPLE,TDS,LATITUDE,LONGITUDE,STATE,COUNTY,PROVINCE,...,FeTot,K,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL,BASIN_CATEGORY
224,1013,Sedimentary,Conventional Hydrocarbon,Edwards,171306.0,29.03000,-97.85000,Texas,Karnes,Western Gulf,...,,2410.00,124.0,1170.00,46800.00,,1802.000,,-4.1,Gulf Coast
225,1014,Sedimentary,Conventional Hydrocarbon,Edwards,109938.0,29.01000,-97.90000,Texas,Karnes,Western Gulf,...,,1510.00,77.3,805.00,29700.00,,1105.000,,-2.7,Gulf Coast
226,1016,Sedimentary,Conventional Hydrocarbon,Edwards,108400.0,28.63367,-97.98417,Texas,Bee,Western Gulf,...,,1271.00,,325.00,34000.00,,1552.000,,2.1,Gulf Coast
227,1017,Sedimentary,Conventional Hydrocarbon,Edwards,71265.0,28.59220,-98.03431,Texas,Bee,Western Gulf,...,,494.00,83.9,202.00,22900.00,,854.000,,0.4,Gulf Coast
228,1018,Sedimentary,Conventional Hydrocarbon,Edwards,71138.0,28.74400,-97.90900,Texas,Karnes,Western Gulf,...,,340.00,,360.00,19300.00,,868.000,,0.3,Gulf Coast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60342,115346,Sedimentary,Conventional Hydrocarbon,Wilcox,89456.0,31.59986,-92.15279,Louisiana,LaSalle,Louisiana-Mississippi Salt Basins,...,44.672,187.68,,422.82,29772.05,,245.336,,-7.3,Gulf Coast
60343,115347,Sedimentary,Conventional Hydrocarbon,Wilcox,108691.0,31.48436,-92.00514,Louisiana,Catahoula,Louisiana-Mississippi Salt Basins,...,50.256,218.96,,478.71,37128.85,,245.336,,-5.3,Gulf Coast
60344,115348,Sedimentary,Conventional Hydrocarbon,Wilcox,95434.0,31.49222,-92.00539,Louisiana,Catahoula,Louisiana-Mississippi Salt Basins,...,396.464,191.59,,391.23,30024.94,,192.764,,-11.4,Gulf Coast
60345,115350,Sedimentary,Conventional Hydrocarbon,Wilcox,108019.0,31.47478,-91.87311,Louisiana,Catahoula,Louisiana-Mississippi Salt Basins,...,5.584,512.21,,571.05,40002.60,,219.050,,1.2,Gulf Coast


In [7]:
# Show how many null values each feature has
df_gulf.isnull().sum()


IDUSGS                0
PLAYTYPE              0
WELLTYPE              0
FORMSIMPLE            0
TDS                   0
LATITUDE              0
LONGITUDE             0
STATE                 0
COUNTY              414
PROVINCE              0
REGION                0
WELLNAME            189
API                7326
DEPTHUPPER         2869
DEPTHLOWER         3728
PERIOD             5571
DATESAMPLE         1485
PH                 2051
B                 13698
Ba                11119
Br                13469
HCO3               1981
Ca                   32
Cl                    0
FeTot              9391
K                     0
Li                13657
Mg                  144
Na                    0
SO4                4096
Sr                12869
Zn                14228
CHARGEBAL             0
BASIN_CATEGORY        0
dtype: int64

In [8]:
# Drop all non numeric columns

df_gulf = df_gulf.select_dtypes(include=["number"])

# Print the updated DataFrame
df_gulf


Unnamed: 0,IDUSGS,TDS,LATITUDE,LONGITUDE,API,DEPTHUPPER,DEPTHLOWER,PH,B,Ba,...,Cl,FeTot,K,Li,Mg,Na,SO4,Sr,Zn,CHARGEBAL
224,1013,171306.0,29.03000,-97.85000,4.225500e+13,10906.0,10920.000000,,,25.200,...,107200.00,,2410.00,124.0,1170.00,46800.00,,1802.000,,-4.1
225,1014,109938.0,29.01000,-97.90000,4.225500e+13,10853.0,10853.000000,,128.000,25.200,...,68100.00,,1510.00,77.3,805.00,29700.00,,1105.000,,-2.7
226,1016,108400.0,28.63367,-97.98417,4.202530e+13,13624.0,14002.000000,,,,...,64400.00,,1271.00,,325.00,34000.00,,1552.000,,2.1
227,1017,71265.0,28.59220,-98.03431,4.202530e+13,13690.0,13778.000000,,,66.800,...,42800.00,,494.00,83.9,202.00,22900.00,,854.000,,0.4
228,1018,71138.0,28.74400,-97.90900,4.225530e+13,13033.0,13288.000000,,,,...,43500.00,,340.00,,360.00,19300.00,,868.000,,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60342,115346,89456.0,31.59986,-92.15279,1.705925e+13,,3976.049996,6.8,15.134,219.728,...,57358.10,44.672,187.68,,422.82,29772.05,,245.336,,-7.3
60343,115347,108691.0,31.48436,-92.00514,1.702521e+13,,4832.021152,6.7,16.215,164.796,...,68737.55,50.256,218.96,,478.71,37128.85,,245.336,,-5.3
60344,115348,95434.0,31.49222,-92.00539,1.702521e+13,,4795.931912,6.0,12.972,123.597,...,62817.40,396.464,191.59,,391.23,30024.94,,192.764,,-11.4
60345,115350,108019.0,31.47478,-91.87311,1.702522e+13,,4817.913540,5.2,16.215,123.597,...,65015.30,5.584,512.21,,571.05,40002.60,,219.050,,1.2


In [9]:
# Show how many null values each feature has
df_gulf.isnull().sum()

IDUSGS            0
TDS               0
LATITUDE          0
LONGITUDE         0
API            7326
DEPTHUPPER     2869
DEPTHLOWER     3728
PH             2051
B             13698
Ba            11119
Br            13469
HCO3           1981
Ca               32
Cl                0
FeTot          9391
K                 0
Li            13657
Mg              144
Na                0
SO4            4096
Sr            12869
Zn            14228
CHARGEBAL         0
dtype: int64

In [10]:
# Create a new dataframe named df_gulf_clean with only columns (PH, HC03, Ca, Cl, K, KNa, Li, Mh, Na, So4, CHARGEBAL) from df_gulf

df_gulf_clean = df_gulf[["PH", "HCO3", "Ca", "Cl", "K", "Li", "Mg", "Na", "SO4", "CHARGEBAL"]]
df_gulf_clean


Unnamed: 0,PH,HCO3,Ca,Cl,K,Li,Mg,Na,SO4,CHARGEBAL
224,,,11156.000,107200.00,2410.00,124.0,1170.00,46800.00,,-4.1
225,,,8080.000,68100.00,1510.00,77.3,805.00,29700.00,,-2.7
226,,,6510.000,64400.00,1271.00,,325.00,34000.00,,2.1
227,,,3479.000,42800.00,494.00,83.9,202.00,22900.00,,0.4
228,,,6770.000,43500.00,340.00,,360.00,19300.00,,0.3
...,...,...,...,...,...,...,...,...,...,...
60342,6.8,70.438,1110.216,57358.10,187.68,,422.82,29772.05,,-7.3
60343,6.7,33.080,1567.128,68737.55,218.96,,478.71,37128.85,,-5.3
60344,6.0,104.393,1258.512,62817.40,191.59,,391.23,30024.94,,-11.4
60345,5.2,,1474.944,65015.30,512.21,,571.05,40002.60,,1.2


In [11]:
# Show how many null values each feature has
df_gulf_clean.isnull().sum()

PH            2051
HCO3          1981
Ca              32
Cl               0
K                0
Li           13657
Mg             144
Na               0
SO4           4096
CHARGEBAL        0
dtype: int64

In [12]:
# Drop columns with any null values unless they are in column "Li"

df_gulf_clean.dropna(axis=0, how='any', subset=df_gulf_clean.columns.difference(['Li']), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gulf_clean.dropna(axis=0, how='any', subset=df_gulf_clean.columns.difference(['Li']), inplace=True)


In [13]:
# Check data types
df_gulf_clean.dtypes

PH           float64
HCO3         float64
Ca           float64
Cl           float64
K            float64
Li           float64
Mg           float64
Na           float64
SO4          float64
CHARGEBAL    float64
dtype: object

In [14]:
# Fill in remaining null values with 0's
df_gulf_clean.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gulf_clean.fillna(0, inplace=True)


In [15]:
# Separate faetures and variable
X = df_gulf_clean.drop(columns=['Li'])
y = df_gulf_clean['Li']

In [16]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [17]:
# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Building the neural network model
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation= 'relu')  # Output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
# Compiling the model
nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=["accuracy"] )

In [20]:
# Check the structure of the Sequential model
nn_model.summary()

In [21]:
# Run 50 epochs on the model
history = nn_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9744 - loss: 300.8589 - val_accuracy: 0.9718 - val_loss: 2935.9033
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9694 - loss: 517.7240 - val_accuracy: 0.9704 - val_loss: 2935.9038
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9702 - loss: 739.0480 - val_accuracy: 0.8577 - val_loss: 2907.8005
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8369 - loss: 656.8790 - val_accuracy: 0.9394 - val_loss: 2732.8071
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8988 - loss: 796.3411 - val_accuracy: 0.9056 - val_loss: 2673.2610
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9288 - loss: 458.3424 - val_accuracy: 0.9493 - val_loss: 2677.726

In [22]:
# Evaluating the model
loss = nn_model.evaluate(X_test_scaled, y_test)
print(f'Test Loss: {loss}')

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9678 - loss: 57.4454
Test Loss: [49.077640533447266, 0.9695431590080261]
