In [0]:
# Databricks Setup
import os
from datetime import datetime, timedelta

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sin, randn, expr
from pyspark.sql.utils import AnalysisException

# Storage Location in DBFS
DELTA_TABLE_PATH = "/mnt/oil_data/oil_yield_delta"

# Batch sizing: number of wells to simulate and days generated per run
NUM_WELLS = 10
DAYS_TO_ADD = 30

# Obtain the Spark session used by everything below
spark = (
    SparkSession.builder
    .appName("OilYieldIngestion")
    .getOrCreate()
)

# Generate Synthetic Data
def generate_synthetic_data(num_wells, start_date, days):
    """Build a Spark DataFrame of synthetic daily oil yields.

    Each well gets a random baseline drawn uniformly from [100, 500). Every
    daily value is that baseline plus a yearly sinusoidal term (amplitude 20,
    driven by day-of-year) plus Gaussian noise (mean 0, std 10), floored at
    50 and rounded to two decimals.

    Args:
        num_wells: Number of wells to simulate; IDs are ``WELL_001``,
            ``WELL_002``, ... in order.
        start_date: First date to generate. Must support ``+ timedelta``
            and ``.timetuple()`` (e.g. ``datetime`` or ``date``).
        days: Number of consecutive days to generate per well.

    Returns:
        A DataFrame with columns ``well_id``, ``date`` and ``yield_bbl``,
        created through the module-level ``spark`` session.
    """
    wells = [f"WELL_{i + 1:03d}" for i in range(num_wells)]
    dates = [start_date + timedelta(days=i) for i in range(days)]

    data = []
    for well_id in wells:
        # One baseline per well, so each well has its own production level.
        base_yield = np.random.uniform(100, 500)
        for date in dates:
            day_of_year = date.timetuple().tm_yday
            seasonal_variation = 20 * np.sin(2 * np.pi * day_of_year / 365)
            noise = np.random.normal(0, 10)
            # Floor with a float and convert the NumPy scalar to a built-in
            # float so every yield_bbl value has the same plain Python type
            # when the DataFrame schema is inferred.
            daily_yield = max(50.0, float(base_yield + seasonal_variation + noise))
            data.append((well_id, date, round(daily_yield, 2)))

    return spark.createDataFrame(data, ["well_id", "date", "yield_bbl"])

# Load Existing Data from Delta
# Detect prior data by loading the Delta path this script actually writes to,
# rather than asking the metastore about a table this script never registers.
try:
    existing_df = spark.read.format("delta").load(DELTA_TABLE_PATH)
except AnalysisException:
    # NOTE(review): expected when the path is missing or not yet a Delta
    # table (first run) - confirm against the cluster's Delta version.
    last_date = None
else:
    # max(date) is None when the table exists but holds no rows.
    last_date = existing_df.selectExpr("max(date)").collect()[0][0]

if last_date is not None:
    new_start_date = last_date + timedelta(days=1)
    print(f"Appending data from {new_start_date} onward...")
else:
    new_start_date = datetime(2023, 1, 1)

# Generate & Append New Data
new_df = generate_synthetic_data(NUM_WELLS, new_start_date, DAYS_TO_ADD)

# Configure an append-mode Delta writer, then persist the batch to the table path
delta_writer = new_df.write.format("delta").mode("append")
delta_writer.save(DELTA_TABLE_PATH)

# Report how many rows this run contributed
print(f"Added {new_df.count()} new records to Delta Lake.")