# Oil Extraction Production Forecasting
<br/>
<img src="https://www.nsenergybusiness.com/wp-content/uploads/sites/4/2022/07/refinery-ga56d4972f_640.jpg" />

In [0]:
# IMPORTANT! DO NOT CHANGE THESE VALUES!
catalog = "workshop"
db = "default"

# Look up the current user from the "user" tag of the notebook context
_notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
current_user = _notebook_context.tags().get("user").get()

# IMPORTANT! THIS NEEDS TO BE UNIQUE FOR EVERY PARTICIPANT!
# IMPORTANT! THIS NEEDS TO BE THE NAME OF THE TABLE YOU CREATED FOR THIS LAB!
src_table = "ademianczuk_oil_yield"

In [0]:
import mlflow

# Point MLflow at a named experiment: the same one where we logged our feature artifacts
experiment_path = f"/Users/{current_user}/Oil Extraction Production Forecasting"
mlflow.set_experiment(experiment_path)

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient

fe = FeatureEngineeringClient()

# Read in our feature table with normalized & transformed features for model training
features_table = f"{catalog}.{db}.{src_table}_features_transformed"
df = fe.read_table(name=features_table)

Check the skewness and kurtosis of each feature to see the effect of our transformations.

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Load transformed data: convert the feature table read above into a pandas
# DataFrame so the scipy statistics and seaborn plots below can work on it
df_transformed = df.toPandas()

# Helper that reports skewness & kurtosis for a single column
def check_distribution(df, feature):
    """Print the skewness and kurtosis of ``df[feature]``, each to two decimals.

    Args:
        df: Object indexable by column name (a pandas DataFrame in this notebook).
        feature: Name of the column to summarise.

    Returns:
        None. The three report lines are written to stdout.
    """
    print(f"\nFeature: {feature}")
    column = df[feature]
    for label, statistic in (("Skewness", skew), ("Kurtosis", kurtosis)):
        print(f"  {label}: {statistic(column):.2f}")

# Features whose raw and transformed columns are compared side by side
compared_features = ("yield_bbl", "precipitation", "temperature")

# Compare original vs. transformed features
for feature in compared_features:
    for heading, column in (
        ("\n🔹 BEFORE Transformation:", feature),
        ("\n✅ AFTER Transformation:", f"{feature}_transformed"),
    ):
        print(heading)
        check_distribution(df_transformed, column)

# Plot distributions: top row = original columns (red), bottom row = transformed columns (blue)
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# axes.T yields one (top, bottom) pair of subplots per feature column
for feature, (ax_before, ax_after) in zip(compared_features, axes.T):
    sns.histplot(df_transformed[feature], bins=30, kde=True, ax=ax_before, color="red")
    ax_before.set_title(f"Before Box-Cox: {feature}")

    sns.histplot(df_transformed[f"{feature}_transformed"], bins=30, kde=True, ax=ax_after, color="blue")
    ax_after.set_title(f"After Box-Cox: {feature}")

plt.tight_layout()
plt.show()

Next, we'll do a trial training run. The only goal here is to see how the Box-Cox or Yeo-Johnson transforms affect the reliability of the trained model.

In [0]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Select features & target
features_original = ["temperature", "precipitation"]
features_transformed = ["temperature_transformed", "precipitation_transformed"]
target = "yield_bbl"

# Load datasets
df_transformed = df.toPandas()

# Train-test split: both splits use the same target, test_size and random_state,
# differing only in which feature columns are selected
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    df_transformed[features_original],
    df_transformed[target],
    test_size=0.2,
    random_state=42,
)
X_train_trans, X_test_trans, y_train_trans, y_test_trans = train_test_split(
    df_transformed[features_transformed],
    df_transformed[target],
    test_size=0.2,
    random_state=42,
)

# Train XGBoost models: identical hyperparameters so only the input features differ
xgb_params = {"objective": "reg:squarederror", "n_estimators": 100, "learning_rate": 0.1}
model_orig = xgb.XGBRegressor(**xgb_params)
model_trans = xgb.XGBRegressor(**xgb_params)

model_orig.fit(X_train_orig, y_train_orig)
model_trans.fit(X_train_trans, y_train_trans)

# Predictions
y_pred_orig = model_orig.predict(X_test_orig)
y_pred_trans = model_trans.predict(X_test_trans)

# Compute Errors
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)
rmse_orig = mean_squared_error(y_test_orig, y_pred_orig) ** 0.5

mae_trans = mean_absolute_error(y_test_trans, y_pred_trans)
rmse_trans = mean_squared_error(y_test_trans, y_pred_trans) ** 0.5

# Print Results
print("\n🔹 Model Performance (Without Box-Cox):")
print(f"  MAE: {mae_orig:.2f}, RMSE: {rmse_orig:.2f}")

print("\n✅ Model Performance (With Box-Cox):")
print(f"  MAE: {mae_trans:.2f}, RMSE: {rmse_trans:.2f}")

Lab Challenge: How could we further improve MAE and RMSE?
- Further adjustments to features?
- Hyperparameter (HP) tuning?
- A Yeo-Johnson transform?
- What other algorithms might be better? An LSTM for deep neural network (DNN) processing?
- What's causing noise?

## Tuning and managing our experiment
MLflow is key, and Hyperopt or Optuna are good options for distributed hyperparameter tuning. In the next notebook, we'll set up an MLflow experiment for training and tuning our model.