# Oil Extraction Production Forecasting
<br/>
<img src="https://www.nsenergybusiness.com/wp-content/uploads/sites/4/2022/07/refinery-ga56d4972f_640.jpg" />

In [0]:
%pip install prophet
# Restart the Python process after the install so the new package can be imported.
# NOTE(review): a restart presumably discards variables defined earlier, so keep this
# cell ahead of every cell that defines notebook state — confirm.
dbutils.library.restartPython()

In [0]:
import hashlib, base64

#IMPORTANT! DO NOT CHANGE THESE VALUES!!!!
# Catalog and schema used as the prefix of every table/model name built below.
catalog = "workshop"
db = "default"
# Identity of the user running the notebook, read from the notebook context tags.
# NOTE(review): presumably an e-mail style login ("first.last@domain"), given the
# "@" / "." splitting below — confirm.
current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().get("user").get()
# SHA-256 digest of the user name, base32-encoded, "=" padding stripped, cut to 12 chars.
# NOTE(review): hash_object / hash_user_id are not referenced again in this notebook.
hash_object = hashlib.sha256(current_user.encode())
hash_user_id = base64.b32encode(hash_object.digest()).decode("utf-8").rstrip("=")[:12]  #Trim to 12 chars for readability
# First character of each "."-separated piece of the text before "@".
# An empty piece (e.g. two consecutive dots) would raise IndexError on x[0].
initials = "".join([x[0] for x in current_user.split("@")[0].split(".")])
# MD5 is used here only to derive a short name suffix from the user name.
short_hash = hashlib.md5(current_user.encode()).hexdigest()[:8]  #Short 8-char hash
# Per-user prefix: upper-cased initials, "_", then the 8-char hash.
safe_user_id = f"{initials.upper()}_{short_hash}"
# Base names of this user's source table and forecast model.
src_table = f"{safe_user_id}_oil_yield"
model_name = f"{safe_user_id}_oil_yield_forecast"
# Three-level name catalog.schema.model; note that the model-loading cell further
# down reassigns model_uri to a "models:/...@alias" URI.
model_uri = f"{catalog}.{db}.{model_name}"

In [0]:
import mlflow

# Track runs under a named, per-user experiment path.
experiment_path = f"/Users/{current_user}/Oil Extraction Production Forecasting"
mlflow.set_experiment(experiment_path)

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient

# Client used to read the feature tables.
fe = FeatureEngineeringClient()

# Read this user's transformed feature table.
transformed_features_table = f'{catalog}.{db}.{src_table}_features_transformed'
df = fe.read_table(name=transformed_features_table)

In [0]:
# Use the Unity Catalog model registry instead of the local MLflow registry
# by pointing the registry URI at "databricks-uc".
uc_registry_uri = "databricks-uc"
mlflow.set_registry_uri(uc_registry_uri)

In [0]:
import mlflow
from mlflow import MlflowClient

# Build the Unity Catalog model URI: "models:/<catalog>.<schema>.<model>@<alias>".
model_alias = "Champion"
registered_model_name = f"{catalog}.{db}.{model_name}"
model_uri = f"models:/{registered_model_name}@{model_alias}"

# Fetch the model currently carrying that alias via the XGBoost flavor.
loaded_model = mlflow.xgboost.load_model(model_uri)

print(f"✅ Model Loaded from Unity Catalog. Loaded {model_name}")

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql.functions import col, date_add
import pandas as pd

fe = FeatureEngineeringClient()

# Pull the 30 newest rows (by date, newest first) of the untransformed
# feature table and bring them into pandas.
raw_features_spark = fe.read_table(name=f'{catalog}.{db}.{src_table}_features')
df = raw_features_spark.orderBy(col("date").desc()).limit(30).toPandas()

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql.functions import col, date_add
import pandas as pd

fe = FeatureEngineeringClient()

# The 30 newest rows (by date, newest first) of the transformed feature table, as pandas.
df_latest_features = fe.read_table(
  name=f'{catalog}.{db}.{src_table}_features_transformed'
).orderBy(col("date").desc()).limit(30).toPandas()

# Fail with a clear message instead of an obscure date_range / length error further down.
if df_latest_features.empty:
    raise ValueError("Transformed feature table returned no rows; cannot build future features")

#Generate future dates
# Start on the day AFTER the last observed date: starting at the last observed date
# itself would label an already-observed day as the first "future" day.
# One date is generated per template row, so the column assignment below cannot fail
# with a length mismatch when the table holds fewer than 30 rows.
last_observed_date = pd.Timestamp(df_latest_features["date"].max())
future_dates = pd.date_range(
    start=last_observed_date + pd.Timedelta(days=1),
    periods=len(df_latest_features),
    freq="D",
)

#Estimate future temperature & precipitation based on past seasonality
# The historical rows are used as a template: only date, temperature and
# precipitation_transformed are overwritten; every other copied column keeps the
# value of the historical row it came from.
df_future_features = df_latest_features.copy()
df_future_features["date"] = future_dates
df_future_features["temperature"] = df_latest_features["temperature"].mean()  #Replace with seasonal estimate
df_future_features["precipitation_transformed"] = df_latest_features["precipitation_transformed"].mean()  #Replace with seasonal estimate

print("✅ Generated Future Feature Data")
print(df_future_features.head())

In [0]:
# Show the generated future feature frame as the cell output.
df_future_features

In [0]:
import matplotlib.pyplot as plt

# The two columns the loaded model is scored on.
feature_columns = ["temperature", "precipitation_transformed"]
X_future = df_future_features[feature_columns]

# Score the future rows and keep the result next to the features.
df_future_features["predicted_yield"] = loaded_model.predict(X_future)

# Plot predicted yield against date.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(
    df_future_features["date"],
    df_future_features["predicted_yield"],
    marker="o",
    linestyle="dashed",
    color="red",
)
ax.set_xlabel("Date")
ax.set_ylabel("Predicted Yield (BBL)")
ax.set_title("Predicted Oil Yield for Next 30 Days")
plt.xticks(rotation=45)
ax.grid()
plt.show()

print("✅ Predictions Complete")
print(df_future_features.head())

In [0]:
# Turn the pandas frame into a Spark DataFrame so it can be written as a table.
df_future_spark = spark.createDataFrame(df_future_features)

# Overwrite this user's predictions table, stored in Delta format.
predictions_table = f"{catalog}.{db}.{src_table}_predictions"
(
    df_future_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(predictions_table)
)

print("✅ Saved Predictions to Unity Catalog Feature Store")

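Meh. We can do better: the forecast above feeds the model the same average temperature and precipitation for every future day, so the predicted yield comes out flat. Let's forecast those input features with Prophet first and predict from them instead.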

In [0]:
from prophet import Prophet
import pandas as pd


def _to_prophet_frame(history, value_col):
    """Return the ``date`` and ``value_col`` columns of ``history`` renamed to ``ds`` and ``y``."""
    return history[["date", value_col]].rename(columns={"date": "ds", value_col: "y"})


# Historical feature data (the same data transformed in notebook 02_Advanced_Feature_Engineering).
df_features = spark.read.table(f"{catalog}.{db}.{src_table}_features_transformed").toPandas()

# Dates aren't always parsed properly, so coerce the column to datetimes.
df_features["date"] = pd.to_datetime(df_features["date"])

# One training frame per feature: time series in ds, target in y.
temp_df = _to_prophet_frame(df_features, "temperature")
precip_df = _to_prophet_frame(df_features, "precipitation_transformed")

# Fit an independent Prophet model for each feature.
temp_model = Prophet()
temp_model.fit(temp_df)

precip_model = Prophet()
precip_model.fit(precip_df)

# Extend the date axis by 30 periods and predict both features on it.
# NOTE(review): the frame presumably holds the historical dates followed by the 30 new
# ones, so head() below would show the oldest rows rather than the forecast — confirm.
future_dates = temp_model.make_future_dataframe(periods=30)
temp_forecast = temp_model.predict(future_dates)
precip_forecast = precip_model.predict(future_dates)

# Attach each model's point forecast (yhat) to a copy of the date frame.
df_predicted_env = future_dates.assign(
    temperature=temp_forecast["yhat"],
    precipitation_transformed=precip_forecast["yhat"],
)

print("✅ Forecasted Temperature & Precipitation for Next 30 Days")
print(df_predicted_env.head())

In [0]:
import mlflow

# The two columns the loaded model is scored on.
feature_columns = ["temperature", "precipitation_transformed"]
X_future = df_predicted_env[feature_columns]

# Score every row and store the result alongside the forecasted features.
df_predicted_env["predicted_yield"] = loaded_model.predict(X_future)

print("✅ Oil Yield Predictions for Next 30 Days")
print(df_predicted_env.head())

In [0]:
# Keep the 30 rows with the latest ds values, newest first.
predict_df = df_predicted_env.sort_values("ds", ascending=False).iloc[:30]

In [0]:
# Show the 30 selected prediction rows as the cell output.
predict_df

In [0]:
import matplotlib.pyplot as plt

# Predicted yield over the selected dates, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(
    predict_df["ds"],
    predict_df["predicted_yield"],
    marker="o",
    linestyle="dashed",
    color="red",
)
ax.set_xlabel("Date")
ax.set_ylabel("Predicted Yield (BBL)")
ax.set_title("Predicted Oil Yield for Next 30 Days (Using Forecasted Features)")
plt.xticks(rotation=45)
ax.grid()
plt.show()

In [0]:
# Rename Prophet's ds column back to date, then hand the frame to Spark.
predictions_pdf = df_predicted_env.rename(columns={"ds": "date"})
df_future_spark = spark.createDataFrame(predictions_pdf)

# Overwrite this user's predictions table in Unity Catalog, stored in Delta format.
predictions_table = f"{catalog}.{db}.{src_table}_predictions"
(
    df_future_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(predictions_table)
)

print("✅ Saved Forecasted Oil Yield to Unity Catalog Feature Store")

Lab Challenge: What would be the best way to carry these predictions going forward?
- How often should we re-run the prediction?
- How should we treat and update the predicted vs. actual data?
- Would we want online or offline inference?
- How would model serving benefit us?