In [1]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy RNG so every run draws the same numbers.
np.random.seed(42)

# --- 1. Dataset size -------------------------------------------------------
n_samples = 500

# --- 2. Input features (independent uniform draws) -------------------------
# NOTE: the order of these three draws determines the random stream; keep it.
temps = np.random.uniform(low=100, high=250, size=n_samples)        # temperature, 100-250 F
pressures = np.random.uniform(low=2000, high=4500, size=n_samples)  # pressure, 2000-4500 psi
apis = np.random.uniform(low=15, high=45, size=n_samples)           # API gravity, 15 (heavy) - 45 (light)

# --- 3. Noise-free target: viscosity from a Beggs-Robinson style relation ---
# Dead-oil form:  mu = 10**x - 1,  with  x = 10**(A - B*API) * T**(-1.163)
#   * a higher API gravity shrinks the first factor  -> lower viscosity
#   * a higher temperature shrinks the second factor -> lower viscosity
api_term = 10**(3.0324 - 0.02023 * apis)
temp_term = temps ** -1.163
x = api_term * temp_term
base_viscosity = 10**x - 1

# Simplified linear pressure correction: the multiplier is 1.0 at 2000 psi and
# grows by 0.00001 per psi above that (i.e. 1.025 at the 4500 psi upper bound).
pressure_factor = 1 + (0.00001 * (pressures - 2000))
true_viscosity = base_viscosity * pressure_factor

# --- 4. Measurement noise ---------------------------------------------------
# Zero-mean Gaussian noise whose standard deviation is 10% of each sample's
# own noise-free viscosity (so the scale is per-sample, not a single constant).
noise = np.random.normal(loc=0, scale=0.1 * true_viscosity, size=n_samples)

# Floor the noisy values at 0.5 so no sample can end up at or below zero.
final_viscosity = np.maximum(true_viscosity + noise, 0.5)

# --- 5. Assemble the table --------------------------------------------------
# Well IDs are 1-based and zero-padded to three digits: W-001 ... W-500.
well_ids = [f'W-{i:03d}' for i in range(1, n_samples + 1)]
columns = {
    'Well_ID': well_ids,
    'Temperature_F': temps.round(1),
    'Pressure_Psi': pressures.round(0).astype(int),
    'API_Gravity': apis.round(1),
    'Viscosity_cP': final_viscosity.round(2),
}
df_synthetic = pd.DataFrame(columns)

# --- 6. Persist and report --------------------------------------------------
# Written to the current working directory, without the index column.
df_synthetic.to_csv('viscosity_data.csv', index=False)

print(f"Successfully generated 'viscosity_data.csv' with {len(df_synthetic)} rows.")
print(df_synthetic.head())

Successfully generated 'viscosity_data.csv' with 500 rows.
  Well_ID  Temperature_F  Pressure_Psi  API_Gravity  Viscosity_cP
0   W-001          156.2          3745         20.6         10.33
1   W-002          242.6          3340         31.3          2.05
2   W-003          209.8          2774         41.2          0.93
3   W-004          189.8          4034         37.0          1.45
4   W-005          123.4          3712         39.2          3.79
