# Oil Extraction Production Forecasting
<br/>
<img src="https://www.nsenergybusiness.com/wp-content/uploads/sites/4/2022/07/refinery-ga56d4972f_640.jpg" />

In [0]:
%pip install hyperopt
dbutils.library.restartPython()

In [0]:
# IMPORTANT! DO NOT CHANGE THESE VALUES!
# `catalog` and `db` form the first two parts of the fully qualified feature
# table name read later in this notebook (<catalog>.<db>.<table>).
catalog = "workshop"
db = "default"
# Value of the "user" tag from the current notebook context; used below to
# build the per-user MLflow experiment path.
current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().get("user").get()

# IMPORTANT! THIS NEEDS TO BE UNIQUE FOR EVERY PARTICIPANT!
# IMPORTANT! THIS NEEDS TO BE THE NAME OF THE TABLE YOU CREATED FOR THIS LAB!
# The notebook reads "<src_table>_features_transformed", so that derived table
# must already exist.
src_table = "ademianczuk_oil_yield"

In [0]:
import mlflow

# Point MLflow at a per-user experiment so runs from different participants
# do not end up in the same experiment.
mlflow.set_experiment(
    experiment_name=f"/Users/{current_user}/Oil Extraction Production Forecasting",
)

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient

# Client used to talk to the Databricks Feature Engineering API.
fe = FeatureEngineeringClient()

# Read the transformed feature table derived from this participant's source table.
df = fe.read_table(name=f"{catalog}.{db}.{src_table}_features_transformed")

In [0]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load data: pull the feature table into pandas so it can be fed to
# scikit-learn / XGBoost as NumPy arrays.
pdf = df.toPandas()
features = ["temperature", "precipitation_transformed"]
target = "yield_bbl"

# Train-test split. The test set is reserved for the final evaluation and is
# deliberately NOT used during hyperparameter tuning.
X_train, X_test, y_train, y_test = train_test_split(
    pdf[features].values, pdf[target].values, test_size=0.2, random_state=42
)

# Carve a validation set out of the training data for tuning. Scoring trials on
# the test set would leak it into model selection and make the final test
# metric optimistically biased.
X_tune, X_val, y_tune, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)


# Define Optuna objective function
def objective(trial):
    """Train one XGBoost regressor for an Optuna trial and score it.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean absolute error of the fitted model on the validation split
        (lower is better).
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    # Fixed seed: row/column subsampling is random, so without it the same
    # parameters could score differently between runs.
    model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **params)
    model.fit(X_tune, y_tune)
    y_pred = model.predict(X_val)
    return mean_absolute_error(y_val, y_pred)


# Run optimization with a seeded sampler so the search is reproducible.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Get best params
best_params = study.best_params
print("\n✅ Best Parameters Found:", best_params)

In [0]:
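# NOTE: This cell is a disabled variant of the tuning cell above. It differs by
# using the "precipitation" column and by splitting `df` directly instead of a
# pandas copy. It appears to be kept for reference only.
# import optuna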
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error

# # Load data
# features = ["temperature", "precipitation"]
# target = "yield_bbl"

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# # Define Optuna objective function
# def objective(trial):
#     params = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
#     }
#     model = xgb.XGBRegressor(objective="reg:squarederror", **params)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return mean_absolute_error(y_test, y_pred)

# # Run optimization
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# # Get best params
# best_params = study.best_params
# print("\n✅ Best Parameters Found:", best_params)