# Oil Extraction Production Forecasting
<br/>
<img src="https://www.nsenergybusiness.com/wp-content/uploads/sites/4/2022/07/refinery-ga56d4972f_640.jpg" />

## Synthetic Data Generation

This script generates a synthetic dataset and writes the result as Parquet files in a single directory. This gives us the flexibility to take a variety of approaches to ingesting the data. Two options are:
1. Reading the newly landed Parquet data using Auto Loader (`cloudFiles`).
1. Converting the directory to Delta format for easy Delta table registration.

Working with CSV data is very similar in terms of updating and appending data.

In [0]:
import pandas as pd
import numpy as np
import datetime
import os
import glob
import pyarrow.parquet as pq
import pyarrow as pa

# Configuration
NUM_WELLS = 100  # Number of well sites to simulate
DAYS_TO_ADD = 3650  # Number of new days to simulate per well on each run
OUTPUT_DIR = "../data/synthetic_oil_yield_parquet_files"  # Directory that accumulates one Parquet file per run (relative to the working directory)

# Ensure output directory exists (exist_ok=True makes this a no-op on re-runs)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_latest_end_date():
    """Return the last recorded date per well from the newest Parquet file.

    Only the single newest ``synthetic_oil_yield_*.parquet`` file in
    ``OUTPUT_DIR`` is read, so wells that do not appear in that file have
    no entry in the result.

    Returns:
        A dict mapping ``well_id`` to the maximum ``date`` value found for
        that well, or ``None`` when no matching file exists.
    """
    pattern = os.path.join(OUTPUT_DIR, "synthetic_oil_yield_*.parquet")
    candidates = glob.glob(pattern)

    if not candidates:
        return None  # No previous data found

    # The lexicographically greatest path is the one a descending sort puts first
    newest_path = max(candidates)
    frame = pq.read_table(newest_path).to_pandas()

    # Collapse to one row per well holding its latest date
    newest_per_well = frame.groupby("well_id")["date"].max()
    result = newest_per_well.to_dict()
    print(f"Loaded last recorded dates from {newest_path}")

    return result

def generate_synthetic_data(num_wells, last_dates, days, seed=None):
    """Generate synthetic daily production records for a set of wells.

    Each well continues from the day after its last recorded date. Wells
    with no entry in ``last_dates`` are treated as if their last record was
    2015-01-01, so their first generated day is 2015-01-02.

    Args:
        num_wells: Number of wells to simulate; ids are ``WELL_001`` ...
        last_dates: Mapping of ``well_id`` to the last recorded date
            (``datetime`` or pandas ``Timestamp``). ``None`` is treated as
            an empty mapping.
        days: Number of consecutive days to generate per well.
        seed: Optional seed for NumPy's global random state. ``None``
            (the default) reseeds from OS entropy so every run differs;
            pass an int for reproducible output.

    Returns:
        A ``pandas.DataFrame`` with one row per well per day and columns
        ``well_id``, ``date``, ``yield_bbl``, ``temperature``,
        ``precipitation``, ``humidity``, ``wind_speed``, ``well_pressure``,
        ``downtime``, ``sand_quality``, ``drilling_efficiency``,
        ``oil_price`` and ``regulatory_impact``.
    """
    np.random.seed(seed)  # seed=None ensures randomness per execution

    last_dates = last_dates or {}
    default_last_date = datetime.datetime(2015, 1, 1)
    one_day = datetime.timedelta(days=1)
    columns = [
        "well_id", "date", "yield_bbl", "temperature", "precipitation",
        "humidity", "wind_speed", "well_pressure", "downtime",
        "sand_quality", "drilling_efficiency", "oil_price",
        "regulatory_impact",
    ]

    records = []
    for well_index in range(num_wells):
        well_id = f"WELL_{well_index + 1:03d}"
        start_date = last_dates.get(well_id, default_last_date) + one_day

        # Per-well baselines, drawn once so each well keeps its own character
        base_yield = np.random.uniform(100, 500)  # Base yield per well
        base_pressure = np.random.uniform(2500, 3500)  # Well pressure in psi

        for day in range(days):
            date = start_date + datetime.timedelta(days=day)

            # Time-based seasonality: position within an annual sine cycle
            day_of_year = date.timetuple().tm_yday
            annual_phase = 2 * np.pi * day_of_year / 365

            # Environmental factors
            temperature = round(10 + 15 * np.sin(annual_phase) + np.random.normal(0, 3), 1)
            # Days 90-179 of the year draw from a wetter distribution
            if 90 <= day_of_year < 180:
                precipitation = max(0, np.random.normal(5, 10))
            else:
                precipitation = max(0, np.random.normal(2, 5))
            humidity = round(np.random.uniform(30, 90), 1)
            wind_speed = round(np.random.uniform(5, 25), 1)

            # Operational factors
            well_pressure = max(2000, base_pressure + np.random.normal(0, 50))
            # The three zero entries together give a 90% chance of no downtime
            downtime = np.random.choice(
                [0, 0, 0, 2, 4, 8, 12],
                p=[0.8, 0.05, 0.05, 0.02, 0.03, 0.025, 0.025],
            )
            sand_quality = round(np.random.uniform(75, 95), 1)
            drilling_efficiency = round(np.random.uniform(80, 98), 1)

            # External factors
            oil_price = round(np.random.uniform(50, 120), 2)
            regulatory_impact = np.random.choice([0, 1], p=[0.95, 0.05])

            # Yield components
            seasonal_variation = 20 * np.sin(annual_phase)  # additive swing of +/-20
            noise = np.random.normal(0, 10)
            well_downtime_impact = 1 - (downtime / 24)  # downtime is hours out of a 24-hour day
            quality_impact = sand_quality / 100
            efficiency_impact = drilling_efficiency / 100

            # The seasonal term is an offset on the base yield, not a multiplier:
            # multiplying by a value in [-20, 20] would make yields negative for
            # half of the year and inflate them up to 20x for the other half.
            expected_yield = (
                (base_yield + seasonal_variation)
                * well_downtime_impact
                * quality_impact
                * efficiency_impact
            )
            daily_yield = max(50, expected_yield + noise)  # floor of 50

            records.append((
                well_id, date, round(daily_yield, 2), temperature,
                precipitation, humidity, wind_speed, well_pressure, downtime,
                sand_quality, drilling_efficiency, oil_price,
                regulatory_impact,
            ))

    return pd.DataFrame(records, columns=columns)

# Resume from earlier output when there is any; otherwise use an empty
# mapping so every well starts from the generator's default date
last_dates = get_latest_end_date()
if not last_dates:
    last_dates = {}

# Simulate the next batch of daily records for every well
new_data = generate_synthetic_data(NUM_WELLS, last_dates, DAYS_TO_ADD)

# Embed the run time in the file name so each run lands in its own file
run_started_at = datetime.datetime.now()
current_run_timestamp = run_started_at.strftime("%Y%m%d_%H%M%S")
output_file = os.path.join(
    OUTPUT_DIR,
    f"synthetic_oil_yield_{current_run_timestamp}.parquet",
)

# Convert to an Arrow table and persist it as a new Parquet file
table = pa.Table.from_pandas(new_data)
pq.write_table(table, output_file)

print(f"Generated {len(new_data)} new records and saved to {output_file}")

In [0]:
# Show the freshly generated DataFrame as this cell's output for a quick visual check
new_data

In [0]:
# Look up the user running this notebook via the notebook context exposed by dbutils
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
current_user = notebook_context.userName().get()
print(current_user)