# In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score

# segyio is an optional dependency: SEG_AVAILABLE records whether it could be
# imported so the rest of the script can fall back to synthetic attributes.
try:
    import segyio
except ImportError:
    # Only a missing/unimportable package should trigger the fallback. A bare
    # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
    SEG_AVAILABLE = False
else:
    SEG_AVAILABLE = True

print("Environment ready")


# Build the modelling table ``df`` (feature columns plus a binary
# "hydrocarbon" target) either from a real SEG-Y volume or, when that is not
# possible, from randomly generated attributes.
#
# The real-data branch runs only when segyio imported successfully AND the
# file exists at ``segy_path``.
segy_path = "/content/seismic.sgy"
use_real_seismic = SEG_AVAILABLE and os.path.exists(segy_path)


if use_real_seismic:
    print("SEG-Y detected. Loading seismic volume...")

    # segyio.tools.cube() reshapes the traces using the inline/crossline
    # geometry, so the file has to be opened in structured mode. Opening it
    # with ignore_geometry=True leaves f.ilines / f.xlines unset and makes
    # cube() fail before any data is returned.
    with segyio.open(segy_path, "r") as f:
        f.mmap()
        cube = segyio.tools.cube(f)

    # Display the section at the middle index of the cube's first axis.
    inline_index = cube.shape[0] // 2

    plt.figure(figsize=(10, 6))
    plt.imshow(
        cube[inline_index, :, :].T,
        cmap="seismic",
        aspect="auto"
    )
    # NOTE(review): with imshow's default origin the first sample is already
    # at the top; inverting moves it to the bottom -- confirm this is intended.
    plt.gca().invert_yaxis()
    plt.colorbar(label="Amplitude")
    plt.title("Inline seismic section")
    plt.tight_layout()
    plt.show()

    # One RMS value per trace position: the mean square is taken along the
    # last axis of the cube, then the 2-D map is flattened to one row each.
    rms_amplitude = np.sqrt(np.mean(cube ** 2, axis=2))
    flat_rms = rms_amplitude.reshape(-1, 1)
    rms_values = flat_rms.ravel()

    # Rows whose RMS exceeds the 85th percentile are labelled 1.
    # NOTE: the label is a threshold on the only feature column, so the
    # classifier trained on this table merely has to recover that threshold.
    threshold = np.percentile(flat_rms, 85)
    hydrocarbon_flag = (rms_values > threshold).astype(int)

    df = pd.DataFrame({
        "rms_amplitude": rms_values,
        "hydrocarbon": hydrocarbon_flag
    })

else:
    print("SEG-Y not found. Using physics-inspired seismic attributes.")

    # Fixed seed so the synthetic table is reproducible between runs.
    np.random.seed(42)
    n_samples = 1200

    # Five independent uniform attributes. The draw order matters for
    # reproducibility under the fixed seed, so it must not be rearranged.
    amplitude = np.random.uniform(0.1, 1.0, n_samples)
    impedance = np.random.uniform(3000, 8000, n_samples)
    frequency = np.random.uniform(10, 80, n_samples)
    velocity = np.random.uniform(1500, 6000, n_samples)
    density = np.random.uniform(2.0, 3.0, n_samples)

    # The label is a deterministic rule on three of the five attributes;
    # velocity and density do not influence it.
    hydrocarbon_flag = (
        (impedance > 6200) &
        (amplitude > 0.65) &
        (frequency < 35)
    ).astype(int)

    df = pd.DataFrame({
        "amplitude": amplitude,
        "impedance": impedance,
        "frequency": frequency,
        "velocity": velocity,
        "density": density,
        "hydrocarbon": hydrocarbon_flag
    })


print("Dataset size:", df.shape)
print(df.head())


# Train and evaluate a random forest on ``df``: every column except
# "hydrocarbon" is a feature, "hydrocarbon" is the binary target.
X = df.drop("hydrocarbon", axis=1)
y = df["hydrocarbon"]

# class_weight="balanced" reweights the classes by inverse frequency.
model = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42
)

# Stratified folds keep the class ratio of ``y`` in every split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

# Out-of-fold predictions: each row is predicted by a model that did not see
# it during fitting. The test folds partition the rows, so every slot is
# filled exactly once by the end of the loop.
oof_preds = np.empty(len(y), dtype=y.dtype)

for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    oof_preds[test_idx] = preds
    scores.append(accuracy_score(y_test, preds))

print("Cross-validation accuracy:", scores)
print("Mean accuracy:", np.mean(scores))


# Refit on all rows so the model used further down (feature importances) has
# seen the full dataset.
model.fit(X, y)

# Report on the out-of-fold predictions rather than on predictions for the
# rows the model was just fitted on, which would only measure training fit.
print(classification_report(y, oof_preds))


# Horizontal bar chart of the fitted model's importance score for each
# feature column of X.
features, importances = X.columns, model.feature_importances_

fig, ax = plt.subplots(figsize=(7, 5))
ax.barh(features, importances)
ax.set_xlabel("Importance")
ax.set_title("Seismic attribute importance")
fig.tight_layout()
plt.show()


# The impedance/amplitude columns exist only in the synthetic table, so this
# scatter plot (coloured by the hydrocarbon label) is skipped for real data.
if not use_real_seismic:
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(
        df["impedance"],
        df["amplitude"],
        c=df["hydrocarbon"],
        cmap="coolwarm",
        alpha=0.6,
    )
    ax.set_xlabel("Impedance")
    ax.set_ylabel("Amplitude")
    ax.set_title("Hydrocarbon prospectivity space")
    fig.tight_layout()
    plt.show()


print("Hydrocarbon prospectivity modeling workflow completed.")