Run this script first to generate the `.csv` files required for the repository. This creates realistic "noisy" data where the sum of wells doesn't perfectly match the fiscal meter.

#### 1. Generate Well Test Data (Sparse: Monthly)
Logic: Each well has a base decline curve + random noise

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Configuration
start_date = datetime(2014, 1, 1)
days = 365 * 10  # 10 Years
dates = [start_date + timedelta(days=x) for x in range(days)]
n_wells = 10
zones = ['Zone_A', 'Zone_B', 'Zone_C', 'Zone_D']

# 1. Generate Well Test Data (Sparse: Monthly)
# Logic: Each well has a base decline curve + random noise
test_records = []
well_ids = [f'Well_{i+1:02d}' for i in range(n_wells)]

for well in well_ids:
    # Initial rate between 100 and 1000 bopd
    initial_rate = np.random.uniform(100, 1000)
    decline_rate = np.random.uniform(0.0001, 0.0005) # Daily decline
    
    current_date = start_date
    while current_date < dates[-1]:
        # Random test date offset (tests don't happen on the 1st of month)
        test_date = current_date + timedelta(days=np.random.randint(0, 25))
        if test_date > dates[-1]: break
        
        # Exponential decline
        days_passed = (test_date - start_date).days
        rate = initial_rate * np.exp(-decline_rate * days_passed)
        # Add noise/fluctuation
        measured_rate = rate * np.random.normal(1.0, 0.05) 
        
        test_records.append({
            'Date': test_date,
            'Well_ID': well,
            'Test_Rate_BOPD': round(max(0, measured_rate), 2)
        })
        # Move to next month
        current_date += timedelta(days=30)

df_tests = pd.DataFrame(test_records)
df_tests.to_csv('well_tests.csv', index=False)

#### 2. Generate Fiscal Meter Data (Daily Commingled)
Logic: Sum of interpolated wells * random "System Correction Factor" (0.9 to 1.1)
First, let's create a temporary dense daily frame to sum up theoreticals

In [3]:
# 2. Generate Fiscal Meter Data (Daily Commingled)
# Logic: Sum of interpolated wells * random "System Correction Factor" (0.9 to 1.1)
# First, let's create a temporary dense daily frame to sum up theoreticals
df_daily_temp = pd.DataFrame({'Date': dates})
for well in well_ids:
    # Get tests for this well
    w_tests = df_tests[df_tests['Well_ID'] == well].sort_values('Date')
    # Merge and interpolate
    df_temp = pd.merge(df_daily_temp, w_tests, on='Date', how='left')
    df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
    df_daily_temp[well] = df_temp['Test_Rate_BOPD']

# Calculate theoretical sum
df_daily_temp['Theoretical_Total'] = df_daily_temp[well_ids].sum(axis=1)

# Generate Fiscal Data (with noise/shrinkage/meter error)
df_daily_temp['Fiscal_Meter_BOPD'] = df_daily_temp['Theoretical_Total'] * np.random.normal(0.98, 0.04, len(df_daily_temp))
df_fiscal = df_daily_temp[['Date', 'Fiscal_Meter_BOPD']].round(2)
df_fiscal.to_csv('production_daily.csv', index=False)

  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] = df_temp['Test_Rate_BOPD'].interpolate(method='linear').fillna(method='bfill')
  df_temp['Test_Rate_BOPD'] 

#### 3. Generate Zone Splits (Static for simplicity, could be time-variant)

In [4]:
# 3. Generate Zone Splits (Static for simplicity, could be time-variant)
zone_data = []
for well in well_ids:
    # Random split summing to 1.0
    splits = np.random.dirichlet(np.ones(4), size=1)[0]
    zone_data.append({
        'Well_ID': well,
        'Zone_A': round(splits[0], 4),
        'Zone_B': round(splits[1], 4),
        'Zone_C': round(splits[2], 4),
        'Zone_D': round(splits[3], 4)
    })
df_zones = pd.DataFrame(zone_data)
df_zones.to_csv('well_zone_splits.csv', index=False)

print("Data generation complete.")

Data generation complete.
