In [1]:
import pandas as pd
import numpy as np

def generate_synthetic_logs(n_rows=25000, seed=42):
    """Generate a reproducible synthetic well-log table.

    The curves are built in sequence, each depending on the previous ones:
    a depth axis, a noisy sinusoidal lithology trend, gamma ray (GR) drawn
    from a shale or sand distribution according to that trend, a log-normal
    resistivity (ILD) with randomly boosted "pay zone" sands, and a sonic
    curve (DT) derived from GR, depth and ILD plus measurement noise.

    Args:
        n_rows: Number of samples (rows) to generate. Must be non-negative;
            0 yields an empty table.
        seed: Seed for the private random generator. The default (42)
            reproduces the values of the historical implementation that
            called ``np.random.seed(42)``. Pass ``None`` for a
            non-deterministic table.

    Returns:
        pandas.DataFrame with ``n_rows`` rows and the float columns
        ``DEPTH``, ``GR``, ``ILD`` and ``DT``.

    Raises:
        ValueError: If ``n_rows`` is negative.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    # A private legacy generator yields the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but does not
    # reseed the process-wide RNG that other code may be relying on.
    rng = np.random.RandomState(seed)

    top_depth = 1000   # first sample depth (ft)
    sample_step = 0.5  # sampling interval (ft)

    # 1. Depth (ft). Built from an integer index so the length is exactly
    # n_rows; a float-step arange(start, stop, step) can be off by one.
    depth = top_depth + sample_step * np.arange(n_rows)

    # 2. Lithology simulation (alternating sand/shale sequences):
    # a sine wave for the bedding trend plus Gaussian noise.
    lith_trend = np.sin(depth / 100) + rng.normal(0, 0.2, n_rows)

    # 3. Gamma ray (GR): positive trend -> shale (high GR), else sand (low GR).
    # Both candidate curves are always drawn, shale first, to keep the
    # random stream in the same order as the original implementation.
    shale_gr = rng.normal(100, 15, n_rows)
    sand_gr = rng.normal(30, 10, n_rows)
    gr = np.clip(np.where(lith_trend > 0, shale_gr, sand_gr), 10, 200)

    # 4. Resistivity (ILD): log-normal base, multiplied by 20 in a random
    # ~20% of the sand samples (GR < 60) to mimic hydrocarbon pay zones.
    ild = np.exp(rng.normal(1, 0.5, n_rows))
    mask_sand = gr < 60
    pay_zone_boost = np.where(mask_sand & (rng.rand(n_rows) > 0.8), 20, 1)
    ild = np.clip(ild * pay_zone_boost, 0.2, 2000)

    # 5. Sonic (DT), the target variable.
    # Base value from lithology: higher GR (shale) -> slower rock (higher DT).
    dt = 50 + (0.5 * gr)

    # Compaction trend: DT decreases as depth below the top of the log grows.
    compaction_factor = (depth - top_depth) / 20000 * 20
    dt = dt - compaction_factor

    # High resistivity (tight rock) lowers DT slightly.
    dt = dt - (np.log10(ild) * 2)

    # Random measurement error.
    dt += rng.normal(0, 2, n_rows)

    return pd.DataFrame({
        'DEPTH': depth,
        'GR': gr,
        'ILD': ild,
        'DT': dt,
    })

# Build the default-sized synthetic log table, persist it as CSV (without the
# index column), then report how many rows were written.
df = generate_synthetic_logs()
df.to_csv(path_or_buf='well_logs.csv', index=False)
_summary = f"Dataset generated: {df.shape[0]} rows. Saved as 'well_logs.csv'."
print(_summary)

Dataset generated: 25000 rows. Saved as 'well_logs.csv'.
