# Spacing Statistics

## 1. Importing / Installing Packages

In [1]:
import os # Importing os module for operating system dependent functionality

import pandas as pd # Importing pandas package

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from typing import Dict, Tuple, List, Union, Optional # Importing specific types from typing module

from src.utils import DatabricksOdbcConnector # Importing DatabricksOdbcConnector class from database_manager module

from pyproj import Geod # Importing Geod from pyproj package

from tqdm import tqdm # Importing tqdm for progress bar functionality

from joblib import Parallel, delayed # Importing Parallel and delayed for parallel processing

from matplotlib import pyplot as plt # Importing pyplot from matplotlib for plotting

# Setting matplotlib to inline mode for Jupyter notebooks
%matplotlib inline

%config InlineBackend.figure_format = 'svg' # Configuring inline backend to use SVG format for figures

from src.well_data import WellDataLoader, GeoSurveyProcessor # Importing custom classes for well data management

from src.utils import reorder_columns # Importing utility function to reorder DataFrame columns

from sklearn.neighbors import radius_neighbors_graph # Importing radius_neighbors_graph from sklearn for nearest neighbors graph construction
from sklearn.cluster import AgglomerativeClustering # Importing AgglomerativeClustering for hierarchical clustering

## 2. Defining Functions

### 2.1. Defining the functions used to build the i-k pair DataFrame and compute spacing statistics

In [2]:
class WellSpacingCalculator:
    """
    Pairwise well-spacing calculator built on one representative "midpoint" per well.

    Each well is reduced to a single point (x, y, tvd, latitude, longitude) taken
    along its trajectory. Every ordered pair of wells (i, k) is then described by a
    horizontal, vertical and 3D distance plus the cardinal direction from i to k.

    The trajectory table must provide the columns read by the methods below:
    'uwi', 'md', 'x', 'y', 'tvd', 'latitude', 'longitude' and 'azimuth'.
    """

    def __init__(self, trajectories: Union[Dict[str, pd.DataFrame], pd.DataFrame]):
        """
        Parameters
        ----------
        trajectories : pd.DataFrame or Dict[str, pd.DataFrame]
            Either one long DataFrame with a 'uwi' column, or a mapping from well id
            to that well's trajectory DataFrame.

        Raises
        ------
        ValueError
            If a DataFrame without a 'uwi' column, or an unsupported type, is given.
        """
        if isinstance(trajectories, pd.DataFrame):
            if "uwi" not in trajectories.columns:
                raise ValueError("Trajectory DataFrame must contain 'uwi' column.")
            self._trajectory_df = trajectories.reset_index(drop=True)
            self.trajectories = {
                cid: group for cid, group in self._trajectory_df.groupby("uwi")
            }
        elif isinstance(trajectories, dict):
            self.trajectories = trajectories
            # Every later step groups by 'uwi'. Frames that do not carry that column
            # get it from their dictionary key instead of silently losing the well id.
            frames = [
                frame if "uwi" in frame.columns else frame.assign(uwi=well_id)
                for well_id, frame in trajectories.items()
            ]
            self._trajectory_df = pd.concat(frames, ignore_index=True)
        else:
            raise ValueError("Invalid type for trajectories. Must be DataFrame or Dict.")

    def _apply_vectorized_geod(
        self,
        lat1: np.ndarray,
        lon1: np.ndarray,
        lat2: np.ndarray,
        lon2: np.ndarray
    ) -> np.ndarray:
        """
        Vectorized geodetic distance calculation between two arrays of (lat1, lon1) and (lat2, lon2).
        Uses the WGS84 ellipsoid and returns the distance in feet.
        """
        geod = Geod(ellps="WGS84")
        # Geod.inv takes (lon, lat) order and returns (fwd azimuth, back azimuth, distance in metres).
        _, _, dist_m = geod.inv(lon1, lat1, lon2, lat2)
        dist_ft = dist_m * 3.28084  # Convert meters to feet
        return dist_ft

    def _compute_normalized_midpoints(self, frac: float = 0.5, use_interpolation: bool = True) -> pd.DataFrame:
        """
        Computes one representative point per well, either by linear interpolation
        between the two survey stations that bracket the fractional MD position, or by
        averaging the first and last station (ordered by MD).

        Parameters:
        -----------
        frac : float
            Fractional location along the well's MD range (0.0 to 1.0).
        use_interpolation : bool
            If True, interpolates between the bracketing stations along normalized MD.
            If False, averages the first ("heel") and last ("toe") station.

        Returns:
        --------
        pd.DataFrame indexed by 'uwi', containing:
            ['x', 'y', 'tvd', 'latitude', 'longitude']

        Notes:
        ------
        When `frac` falls exactly on the last station (e.g. frac=1.0), or a well has a
        single station / zero MD span, the station itself is returned instead of NaN.
        """
        # sort_values returns a new frame, so the stored trajectory table is never mutated.
        df = self._trajectory_df.sort_values(["uwi", "md"]).reset_index(drop=True)

        if not use_interpolation:
            # Simple geometric midpoint (fast)
            heel_toe_df = (
                df.groupby("uwi")
                .agg(
                    heel_x=("x", "first"),
                    heel_y=("y", "first"),
                    heel_tvd=("tvd", "first"),
                    heel_lat=("latitude", "first"),
                    heel_lon=("longitude", "first"),
                    toe_x=("x", "last"),
                    toe_y=("y", "last"),
                    toe_tvd=("tvd", "last"),
                    toe_lat=("latitude", "last"),
                    toe_lon=("longitude", "last"),
                )
            )

            midpoint_df = pd.DataFrame({
                "x": (heel_toe_df["heel_x"] + heel_toe_df["toe_x"]) / 2,
                "y": (heel_toe_df["heel_y"] + heel_toe_df["toe_y"]) / 2,
                "tvd": (heel_toe_df["heel_tvd"] + heel_toe_df["toe_tvd"]) / 2,
                "latitude": (heel_toe_df["heel_lat"] + heel_toe_df["toe_lat"]) / 2,
                "longitude": (heel_toe_df["heel_lon"] + heel_toe_df["toe_lon"]) / 2,
            })
            midpoint_df.index.name = "uwi"
            return midpoint_df

        # Interpolated midpoint (MD-based)
        md_by_well = df.groupby("uwi")["md"]
        min_md = md_by_well.transform("min")
        md_span = md_by_well.transform("max") - min_md
        # A zero MD span (single station) would give 0/0 = NaN; pin those rows to 0.0
        # so the well keeps its only station instead of being dropped.
        df["normalized_md"] = ((df["md"] - min_md) / md_span).where(md_span != 0, 0.0)

        df["row_index"] = df.groupby("uwi").cumcount()
        # Position (within the well) of the last station whose normalized MD is <= frac.
        df["prev_idx"] = df.groupby("uwi")["normalized_md"].transform(
            lambda x: x.searchsorted(frac, side="right") - 1
        )
        last_row = df.groupby("uwi")["row_index"].transform("max")
        # Clamp so the "next" station never runs past the end of the well.
        df["next_idx"] = np.minimum(df["prev_idx"] + 1, last_row)

        # prev_idx / next_idx are constant within a well, so a plain row mask selects
        # exactly one bracketing station per well (no groupby.apply needed).
        df_prev = df[df["row_index"] == df["prev_idx"]]
        df_next = df[df["row_index"] == df["next_idx"]]

        merged = pd.merge(df_prev, df_next, on="uwi", suffixes=("_prev", "_next"))

        delta = merged["normalized_md_next"] - merged["normalized_md_prev"]
        ratio = (frac - merged["normalized_md_prev"]) / delta.replace(0, np.nan)
        # delta == 0 means prev and next are the same station: use it as-is.
        ratio = ratio.where(delta != 0, 0.0)

        midpoint_df = pd.DataFrame({
            "x": merged["x_prev"] + ratio * (merged["x_next"] - merged["x_prev"]),
            "y": merged["y_prev"] + ratio * (merged["y_next"] - merged["y_prev"]),
            "tvd": merged["tvd_prev"] + ratio * (merged["tvd_next"] - merged["tvd_prev"]),
            "latitude": merged["latitude_prev"] + ratio * (merged["latitude_next"] - merged["latitude_prev"]),
            "longitude": merged["longitude_prev"] + ratio * (merged["longitude_next"] - merged["longitude_prev"]),
        })
        midpoint_df["uwi"] = merged["uwi"]
        return midpoint_df.set_index("uwi")

    def _compute_drill_directions(self) -> pd.Series:
        """
        Classify each well as "EW" or "NS" from the median of its 'azimuth' column.

        Median azimuths within [45, 135] or [225, 315] degrees are labelled "EW";
        everything else is labelled "NS". Returns a Series indexed by 'uwi' named
        'drill_direction'.
        """
        median_azimuth = self._trajectory_df.groupby("uwi")["azimuth"].median()
        is_ew = ((median_azimuth >= 45) & (median_azimuth <= 135)) | ((median_azimuth >= 225) & (median_azimuth <= 315))
        return pd.Series(np.where(is_ew, "EW", "NS"), index=median_azimuth.index, name="drill_direction")

    def _filter_close_pairs(self, lat: np.ndarray, lon: np.ndarray, max_distance_miles: float = 20.0) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return the positional indices (i, k), i != k, of all ordered well pairs whose
        rough planar distance is at most `max_distance_miles`.

        The rough distance uses 69 miles per degree of latitude and
        69 * cos(latitude) miles per degree of longitude (averaged over the pair).
        Pairs are returned in row-major order (sorted by i, then k).

        Self-pairs are excluded by index, so two different wells that share the same
        latitude/longitude are still reported as a pair.
        """
        lat = np.asarray(lat, dtype=float)
        lon = np.asarray(lon, dtype=float)
        n_wells = lat.shape[0]

        miles_per_lat_degree = 69.0
        miles_per_lon_degree = 69.0 * np.cos(np.radians(lat))

        # Work on blocks of rows so temporaries stay bounded instead of allocating
        # several full n x n float matrices at once.
        max_block_elements = 4_000_000
        rows_per_block = max(1, max_block_elements // max(n_wells, 1))

        i_parts, k_parts = [], []
        for start in range(0, n_wells, rows_per_block):
            stop = min(start + rows_per_block, n_wells)

            delta_lat = np.abs(lat[start:stop, None] - lat[None, :])
            delta_lon = np.abs(lon[start:stop, None] - lon[None, :])
            lon_scale = (miles_per_lon_degree[start:stop, None] + miles_per_lon_degree[None, :]) / 2.0

            rough_dist_miles = np.sqrt(
                (delta_lat * miles_per_lat_degree)**2 + (delta_lon * lon_scale)**2
            )

            rows, cols = np.where(rough_dist_miles <= max_distance_miles)
            rows = rows + start  # block-local row -> global well position
            not_self = rows != cols
            i_parts.append(rows[not_self])
            k_parts.append(cols[not_self])

        if not i_parts:
            empty = np.empty(0, dtype=np.intp)
            return empty, empty.copy()

        return np.concatenate(i_parts), np.concatenate(k_parts)

    def _get_pairwise_indices(self, uwis: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate all ordered pairwise (i, k) combinations for an array of well IDs,
        excluding self-comparisons (i != k).

        Parameters
        ----------
        uwis : np.ndarray
            Array of unique well identifiers (only its length is used).

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            Two 1D arrays of positional indices i and k (row-major order) that can be
            used to index into `uwis`.
        """
        n = len(uwis)
        # Every off-diagonal cell of an n x n grid is one valid ordered pair.
        off_diagonal = ~np.eye(n, dtype=bool)
        i_idx, k_idx = np.nonzero(off_diagonal)
        return i_idx, k_idx

    def _get_relative_cardinal_direction(
        self,
        lat: np.ndarray,
        lon: np.ndarray,
        i_idx: np.ndarray,
        k_idx: np.ndarray
    ) -> np.ndarray:
        """
        Cardinal direction ("N", "S", "E" or "W") of well k as seen from well i.

        The axis with the larger absolute difference in degrees decides between
        N/S and E/W; ties resolve to E/W.
        """
        lat1, lat2 = lat[i_idx], lat[k_idx]
        lon1, lon2 = lon[i_idx], lon[k_idx]

        lat_diff = lat1 - lat2
        lon_diff = lon1 - lon2

        vertical = np.abs(lat_diff) > np.abs(lon_diff)
        is_south = lat_diff > 0  # i lies north of k, so k is to the south
        is_west = lon_diff > 0   # i lies east of k, so k is to the west

        # Nested np.where keeps every operand a string. np.select would mix the
        # string choices with its integer default (0), which NumPy 2 rejects.
        return np.where(
            vertical,
            np.where(is_south, "S", "N"),
            np.where(is_west, "W", "E"),
        )

    def _process_batch(self, i_idx: np.ndarray, k_idx: np.ndarray, ids: np.ndarray, coords: np.ndarray, lat_lon: np.ndarray, directions: np.ndarray, curvature_threshold_ft: float, use_geod: bool) -> pd.DataFrame:
        """
        Build the spacing rows for one batch of (i, k) positional index pairs.

        Returns a DataFrame with columns 'well_i', 'well_k', 'horizontal_dist',
        'vertical_dist', '3D_dist', 'drill_direction_i', 'drill_direction_k' and
        'direction_to_k_from_i'.
        """
        i_uwi = ids[i_idx]
        k_uwi = ids[k_idx]

        horizontal, vertical, dist3d = self._calculate_distances(coords, lat_lon, i_idx, k_idx, curvature_threshold_ft, use_geod)
        direction_to_k_from_i = self._get_relative_cardinal_direction(lat_lon[:, 0], lat_lon[:, 1], i_idx, k_idx)

        return pd.DataFrame({
            "well_i": i_uwi,
            "well_k": k_uwi,
            "horizontal_dist": horizontal,
            "vertical_dist": vertical,
            "3D_dist": dist3d,
            "drill_direction_i": directions[i_idx],
            "drill_direction_k": directions[k_idx],
            "direction_to_k_from_i": direction_to_k_from_i
        })

    def _batch_filtered_indices(self, pairs: Union[List[Tuple[int, int]], np.ndarray], batch_size: int = 1_000_000):
        """
        Split prefiltered well pairs into consecutive batches.

        Parameters
        ----------
        pairs : List[Tuple[int, int]] or np.ndarray of shape (N, 2)
            (i_idx, k_idx) pairs.
        batch_size : int
            Number of pairs per batch.

        Yields
        ------
        Tuple[np.ndarray, np.ndarray]
            i_idx and k_idx arrays for each batch. Nothing is yielded for empty input.
        """
        pairs_array = np.asarray(pairs)  # (N, 2); no copy when already an ndarray
        n_pairs = pairs_array.shape[0]

        for start_idx in np.arange(0, n_pairs, batch_size):
            end_idx = min(start_idx + batch_size, n_pairs)
            batch = pairs_array[start_idx:end_idx]
            yield batch[:, 0], batch[:, 1]

    def _calculate_distances(
        self,
        coords: np.ndarray,
        lat_lon: np.ndarray,
        i_idx: np.ndarray,
        k_idx: np.ndarray,
        curvature_threshold_ft: float,
        use_geod: bool
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Calculates horizontal, vertical, and 3D distances between well pairs
        based on the per-well midpoint coordinates.

        `coords` holds (x, y, tvd) per well and `lat_lon` holds (latitude, longitude)
        per well; `i_idx` / `k_idx` index into both.

        The y coordinate of well k is overwritten with the y of well i before
        differencing, so dy is always zero and the planar horizontal distance reduces
        to |dx|.
        NOTE(review): this holds regardless of drill direction - confirm it is the
        intended projection for EW-drilled wells.

        When `use_geod` is True, pairs whose planar horizontal distance exceeds
        `curvature_threshold_ft` get their horizontal distance replaced by the WGS84
        geodesic distance between the two midpoints, and their 3D distance recomputed.
        """
        A = coords[i_idx]  # shape (N, 3): x, y, tvd
        B = coords[k_idx]

        B_aligned = B.copy()

        B_aligned[:, 1] = A[:, 1]  # Force y of B to match y of A

        dx = B_aligned[:, 0] - A[:, 0]
        dy = B_aligned[:, 1] - A[:, 1]
        dz = B_aligned[:, 2] - A[:, 2]

        horizontal = np.sqrt(dx**2 + dy**2)
        vertical = np.abs(dz)
        dist3d = np.sqrt(horizontal**2 + vertical**2)

        if use_geod:
            long_mask = horizontal > curvature_threshold_ft
            if np.any(long_mask):
                lat = lat_lon[:, 0]
                lon = lat_lon[:, 1]
                lat1 = lat[i_idx[long_mask]]
                lon1 = lon[i_idx[long_mask]]
                lat2 = lat[k_idx[long_mask]]
                lon2 = lon[k_idx[long_mask]]

                # Geodesic correction for the long-range pairs only
                horizontal[long_mask] = self._apply_vectorized_geod(lat1, lon1, lat2, lon2)

                # Update 3D distance with corrected horizontal
                dist3d[long_mask] = np.sqrt(horizontal[long_mask]**2 + vertical[long_mask]**2)

        return horizontal, vertical, dist3d

    def _calculate_spacing_statistics(
        self,
        curvature_threshold_ft: float = 26400.0,
        use_geod: bool = True,
        frac: float = 0.5,
        batch_size: int = 1_000_000,
        max_distance_miles: Optional[float] = 20.0,
        save_batches_dir: Optional[str] = None,
        use_interpolation: bool = True
    ) -> Optional[pd.DataFrame]:
        """
        Compute spacing distances between well pairs using lateral midpoints.

        Parameters:
        -----------
        curvature_threshold_ft : float
            Use geodetic correction above this horizontal threshold (default 5 miles).
        use_geod : bool
            Whether to apply geodetic distance correction.
        frac : float
            Fractional position along the lateral for midpoint interpolation (default 0.5).
        batch_size : int
            Number of pairwise calculations to process per batch.
        max_distance_miles : float or None
            Max rough distance filter to exclude far-away well pairs.
            None disables the filter and evaluates every ordered pair.
        save_batches_dir : str or None
            If provided, saves each batch to a separate Parquet file.
        use_interpolation : bool
            Whether to interpolate midpoints based on MD.
            If False, computes geometric midpoints between heel and toe.

        Returns:
        --------
        pd.DataFrame or None
            If saving to disk, returns None. Otherwise, returns the spacing results
            DataFrame (empty, with the usual columns, when no pair qualifies).
        """
        # 1. Get normalized lateral midpoints and drill directions
        midpoint_df = self._compute_normalized_midpoints(frac=frac, use_interpolation=use_interpolation)
        drill_dirs = self._compute_drill_directions()
        midpoint_df["drill_direction"] = drill_dirs  # aligned on the 'uwi' index

        # 2. Prepare arrays
        ids = midpoint_df.index.to_numpy()
        coords = midpoint_df[["x", "y", "tvd"]].to_numpy()
        lat_lon = midpoint_df[["latitude", "longitude"]].to_numpy()
        directions = midpoint_df["drill_direction"].to_numpy()

        if max_distance_miles is not None:
            lat = lat_lon[:, 0]
            lon = lat_lon[:, 1]
            i_idx, k_idx = self._filter_close_pairs(lat, lon, max_distance_miles)
        else:
            i_idx, k_idx = self._get_pairwise_indices(ids)

        # 3. Batch the pairs. Keeping them in one (N, 2) array avoids materialising
        #    millions of Python tuples only to convert them straight back to NumPy.
        pairs = np.column_stack((i_idx, k_idx))
        batches = list(self._batch_filtered_indices(pairs, batch_size=batch_size))
        n_batches = len(batches)

        if save_batches_dir:
            os.makedirs(save_batches_dir, exist_ok=True)

        def process_and_save(batch_number: int, batch_i: np.ndarray, batch_k: np.ndarray):
            batch_df = self._process_batch(batch_i, batch_k, ids, coords, lat_lon, directions, curvature_threshold_ft, use_geod)

            if save_batches_dir:
                filepath = os.path.join(save_batches_dir, f"spacing_batch_{batch_number:04d}.parquet")
                batch_df.to_parquet(filepath, index=False)
                # The caller returns None in this mode; do not hold every batch in memory.
                return None
            return batch_df

        tqdm_kwargs = {
            "desc": "🚀 Calculating Spacing (Parallel)",
            "dynamic_ncols": True, # Auto-adjust width to terminal
            "smoothing": 0.3, # Smoothing factor for progress bar
            "bar_format": "{desc}: |{bar:40}| {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}]", # Custom bar format
            "ascii": "░▒█", # Use custom ASCII characters for the bar
            "leave": True, # Leave the progress bar on completion
        }

        results = Parallel(n_jobs=-1)(
            delayed(process_and_save)(batch_num, batch_i, batch_k)
            for batch_num, (batch_i, batch_k) in tqdm(enumerate(batches), total=n_batches, **tqdm_kwargs)
        )

        if save_batches_dir:
            print(f"✅ All batches saved to {save_batches_dir}")
            return None

        if not results:
            # No pair survived the filter: return an empty frame with the normal
            # columns instead of letting pd.concat([]) raise.
            no_pairs = np.empty(0, dtype=np.intp)
            return self._process_batch(no_pairs, no_pairs, ids, coords, lat_lon, directions, curvature_threshold_ft, use_geod)

        return pd.concat(results, ignore_index=True)

    def _load_saved_batches(self, batch_folder: str) -> pd.DataFrame:
        """
        Load all saved spacing batch Parquet files from a folder and combine into a single DataFrame.

        Parameters
        ----------
        batch_folder : str
            Path to the folder where batch Parquet files are stored.

        Returns
        -------
        pd.DataFrame
            Combined spacing DataFrame (files are read in sorted filename order).

        Raises
        ------
        FileNotFoundError
            If `batch_folder` is not an existing directory.
        ValueError
            If the folder contains no '.parquet' files.
        """
        if not os.path.isdir(batch_folder):
            raise FileNotFoundError(f"Batch folder '{batch_folder}' not found.")

        batch_files = sorted([
            os.path.join(batch_folder, f)
            for f in os.listdir(batch_folder)
            if f.endswith(".parquet")
        ])

        if not batch_files:
            raise ValueError(f"No Parquet files found in folder '{batch_folder}'.")

        print(f"🔍 Found {len(batch_files)} batch files. Loading and combining...")

        dfs = [pd.read_parquet(file) for file in batch_files]

        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"✅ Loaded {len(combined_df):,} rows from all batches.")
        return combined_df

    def _summarize_spacing_statistics(self, spacing_df: pd.DataFrame) -> Dict[str, float]:
        """
        Summarise a spacing table: mean and variance of '3D_dist', plus a density
        estimate derived from the mean 'horizontal_dist'.

        The density is 1 / (mean_horizontal_ft ** 2 / 43,560), i.e. it treats the mean
        horizontal spacing as the side of a square expressed in acres; it is NaN when
        that area is not positive.
        """
        avg_spacing = spacing_df["3D_dist"].mean()
        spacing_var = spacing_df["3D_dist"].var()
        avg_horizontal = spacing_df["horizontal_dist"].mean()
        acres_per_well = avg_horizontal ** 2 / 43_560  # 43,560 sq ft per acre
        density = 1 / acres_per_well if acres_per_well > 0 else float("nan")
        return {
            "average_spacing_3D_ft": avg_spacing,
            "spacing_variance": spacing_var,
            "estimated_well_density_per_acre": density
        }

    def _plot_spacing_cdf(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> None:
        """Plot the empirical CDF of `column` (NaN values dropped) and show the figure."""
        values = np.sort(spacing_df[column].dropna())
        cdf = np.linspace(0, 1, len(values))
        plt.figure(figsize=(8, 5))
        plt.plot(values, cdf, label=f"CDF of {column}")
        plt.xlabel(f"{column} (ft)")
        plt.ylabel("Cumulative Probability")
        plt.title(f"CDF of Well {column.replace('_', ' ').title()}")
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.show()

    def _plot_spacing_histogram(self, spacing_df: pd.DataFrame, column: str = "3D_dist", bins: int = 30) -> None:
        """Plot a histogram of `column` (NaN values dropped) and show the figure."""
        plt.figure(figsize=(8, 5))
        plt.hist(spacing_df[column].dropna(), bins=bins, edgecolor='k', alpha=0.7)
        plt.xlabel(f"{column} (ft)")
        plt.ylabel("Frequency")
        plt.title(f"Histogram of {column.replace('_', ' ').title()}")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def _print_spacing_percentiles(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> Dict[str, float]:
        """
        Return the 10th, 50th and 90th percentiles of `column` as
        {"P10": ..., "P50": ..., "P90": ...}. Despite the name, nothing is printed.
        """
        percentiles = spacing_df[column].quantile([0.1, 0.5, 0.9]).to_dict()
        return {
            "P10": percentiles.get(0.1, float("nan")),
            "P50": percentiles.get(0.5, float("nan")),
            "P90": percentiles.get(0.9, float("nan"))
        }

    def _filter_spacing_by_metadata(self, spacing_df: pd.DataFrame, metadata_df: pd.DataFrame, by: str = "operator") -> pd.DataFrame:
        """
        Keep only the pairs whose two wells map to the same `by` value in
        `metadata_df` (looked up through its 'uwi' column). Pairs with an unmapped
        well are dropped. The input frame is not modified.
        """
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        spacing_df = spacing_df.copy()
        spacing_df["group_i"] = spacing_df["well_i"].map(group_map)
        spacing_df["group_k"] = spacing_df["well_k"].map(group_map)
        return spacing_df[spacing_df["group_i"] == spacing_df["group_k"]].drop(columns=["group_i", "group_k"])

    def _group_spacing_summary(
        self,
        spacing_df: pd.DataFrame,
        metadata_df: pd.DataFrame,
        by: str = "operator",
        return_group_dataframes: bool = False
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]]:
        """
        Computes group-level well spacing statistics by applying
        _summarize_spacing_statistics to the intra-group pairs of each group, and
        optionally returns the per-group spacing DataFrames.

        Parameters
        ----------
        spacing_df : pd.DataFrame
            DataFrame of well pair distances with columns:
            ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', '3D_dist']

        metadata_df : pd.DataFrame
            Metadata table mapping 'uwi' to grouping information like operator or basin.
            Must contain columns ['uwi', <by>].

        by : str, optional
            Column in metadata_df used for grouping, by default "operator".

        return_group_dataframes : bool, optional
            If True, also returns a dictionary of spacing DataFrames per group.

        Returns
        -------
        pd.DataFrame
            One row per group with columns:
            [<by>, 'average_spacing_3D_ft', 'spacing_variance',
             'estimated_well_density_per_acre', 'count'].
            Empty (but with these columns) when no pair has both wells in one group.

        Tuple[pd.DataFrame, Dict[str, pd.DataFrame]], optional
            If return_group_dataframes is True, also returns a dictionary mapping group names
            to filtered spacing DataFrames.
        """
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        group_i = spacing_df["well_i"].map(group_map)
        group_k = spacing_df["well_k"].map(group_map)

        # Keep intra-group pairs only. NaN never equals NaN, so unmapped wells drop out.
        # .assign builds a new frame, so nothing is written into a slice of the input.
        same_group = group_i == group_k
        intra_df = spacing_df[same_group].assign(group=group_i[same_group])

        group_summaries = []
        group_dataframes = {}

        for group_name, group_df in intra_df.groupby("group"):
            summary = self._summarize_spacing_statistics(group_df)
            summary[by] = group_name
            summary["count"] = len(group_df)
            group_summaries.append(summary)

            if return_group_dataframes:
                group_dataframes[group_name] = group_df.reset_index(drop=True)

        summary_columns = [by, "average_spacing_3D_ft", "spacing_variance", "estimated_well_density_per_acre", "count"]
        # Passing `columns` fixes the column order and keeps the schema when empty.
        summary_df = pd.DataFrame(group_summaries, columns=summary_columns)

        if return_group_dataframes:
            return summary_df, group_dataframes

        return summary_df


## 3. Loading Header and GeoSurvey either from Excel/csv/SQL into Pandas DataFrame

In [3]:
# Build the well-data loader on top of a Databricks ODBC connector.
# NOTE(review): log_dir is a hardcoded, user-specific absolute Windows path; the notebook
# will not run unchanged on another machine - consider a configurable/relative path.
loader = WellDataLoader(db = DatabricksOdbcConnector(), 
                        log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\logs")

In [5]:
# Fetch header data through the loader with basin="MB" and start_year=2016
# (the captured log below shows it is read from SQL).
df_MB_header = loader.get_header_data(basin="MB", start_year=2016)

[WellDataLoaderLogger] INFO (05-02 10:18 PM): Loading header data from SQL. (Line: 84) [well_data_manager.py]

  result_df = pd.read_sql(sql_query, self.connection)


In [6]:
# Fetch directional data through the loader; no filter arguments are passed here.
# NOTE(review): presumably scoped by the preceding header query - confirm in WellDataLoader.
df_MB_directional = loader.get_directional_data()

[WellDataLoaderLogger] INFO (05-02 10:18 PM): Loading directional data from SQL. (Line: 104) [well_data_manager.py]



In [7]:
# Create the geo-survey processor.
# NOTE(review): same hardcoded, user-specific absolute log_dir as the loader cell.
processor = GeoSurveyProcessor(log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\logs")

[GeoLogger] INFO (05-02 10:28 PM): GeoSurveyProcessor initialized. (Line: 185) [well_data_manager.py]



In [8]:
# Compute UTM coordinates for the directional data
# (the captured log below reports that lat/lon from the input frame are used).
df_utm = processor.compute_utm_coordinates(df=df_MB_directional)

[GeoLogger] INFO (05-02 10:28 PM): ✅ Using lat/lon from input DataFrame. (Line: 261) [well_data_manager.py]

[GeoLogger] INFO (05-02 10:28 PM): ✅ UTM coordinate computation complete in 6.63 sec. (Line: 316) [well_data_manager.py]



In [9]:
# Restrict the UTM survey table with filter_after_heel_point.
# NOTE(review): presumably keeps only stations from the heel point onward (lateral section) - confirm.
df_utm_lateral = processor.filter_after_heel_point(df=df_utm)

In [10]:
# Instantiate the spacing calculator from the long-format trajectory table (needs a 'uwi' column).
spacing_calculator = WellSpacingCalculator(trajectories=df_utm_lateral)

In [11]:
# df_spacing_new = spacing_calculator._calculate_spacing_statistics(
#     batch_size=500_000,
#     max_distance_miles=10,  # Adjust as needed for your dataset
#     save_batches_dir = None,  # Specify directory to save batches,
#     use_interpolation=False  # Set to True if you want to use MD-based interpolation for midpoints
# )

In [12]:
# Load previously saved spacing batches (all *.parquet files in the folder) instead of recomputing.
# The folder path is relative to the notebook's working directory.
df_spacing_old = spacing_calculator._load_saved_batches(batch_folder="spacing_batches_MB")

🔍 Found 56 batch files. Loading and combining...
✅ Loaded 27,557,842 rows from all batches.


In [13]:
# df_spacing_new['well_i_10'] = df_spacing_new['well_i'].str[:10] # Extracting first 10 characters of 'well_i' column
# df_spacing_new['well_k_10'] = df_spacing_new['well_k'].str[:10] # Extracting first 10 characters of 'well_k' column
# df_spacing_new = reorder_columns(df_spacing_new, ['well_i_10','well_k_10'], 'well_k') # Reordering columns to have 'well_i_10' and 'well_k_10' at the beginning

# df_spacing_old['well_i_10'] = df_spacing_old['well_i'].str[:10] # Extracting first 10 characters of 'well_i' column
# df_spacing_old['well_k_10'] = df_spacing_old['well_k'].str[:10] # Extracting first 10 characters of 'well_k' column
# df_spacing_old = reorder_columns(df_spacing_old, ['well_i_10','well_k_10'], 'well_k') # Reordering columns to have 'well_i_10' and 'well_k_10' at the beginning

In [33]:
# Display the trajectory table passed to the spacing calculator (pandas truncates the rendered rows).
df_utm_lateral

Unnamed: 0,uwi,md,tvd,inclination,azimuth,latitude,longitude,deviation_E/W,E/W,deviation_N/S,N/S,point_type_name,x,y,utm_zone,epsg_code,z
0,42003463520000,9165.0,8937.50,81.81,163.87,32.353239,-102.251110,107.85,EAST,553.79,SOUTH,80 DEGREE HEEL POINT,2.489139e+06,1.175496e+07,13,EPSG:32613,-8937.50
1,42003463520000,9196.0,8941.50,83.34,163.71,32.353159,-102.251082,116.43,EAST,583.31,SOUTH,,2.489149e+06,1.175494e+07,13,EPSG:32613,-8941.50
2,42003463520000,9228.0,8944.87,84.58,163.42,32.353075,-102.251053,125.43,EAST,613.83,SOUTH,,2.489159e+06,1.175491e+07,13,EPSG:32613,-8944.87
3,42003463520000,9323.0,8952.87,85.76,163.42,32.352827,-102.250965,152.44,EAST,704.56,SOUTH,,2.489188e+06,1.175482e+07,13,EPSG:32613,-8952.87
4,42003463520000,9417.0,8959.31,86.38,162.61,32.352581,-102.250877,179.84,EAST,794.25,SOUTH,,2.489218e+06,1.175473e+07,13,EPSG:32613,-8959.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2444342,42501375580000,10405.0,5277.04,87.95,181.17,33.052503,-102.878194,594.37,WEST,5546.07,SOUTH,,2.290391e+06,1.200495e+07,13,EPSG:32613,-5277.04
2444343,42501375580000,10493.0,5279.92,88.30,183.17,33.052261,-102.878190,597.70,WEST,5633.96,SOUTH,,2.290394e+06,1.200486e+07,13,EPSG:32613,-5279.92
2444344,42501375580000,10582.0,5280.66,90.75,184.17,33.052017,-102.878194,603.39,WEST,5722.76,SOUTH,,2.290395e+06,1.200477e+07,13,EPSG:32613,-5280.66
2444345,42501375580000,10643.0,5280.07,90.35,183.37,33.051849,-102.878197,607.40,WEST,5783.63,SOUTH,,2.290395e+06,1.200471e+07,13,EPSG:32613,-5280.07


In [75]:
def calculate_overlap(well_A: pd.DataFrame, well_B: pd.DataFrame) -> float:
    """
    Calculate the percentage of measured depth (MD) overlap between two horizontal wellbores.

    The overlap is the length of the intersection of the two wells' [min MD, max MD]
    intervals, expressed as a percentage of the shorter of the two intervals.

    Parameters
    ----------
    well_A : pd.DataFrame
        Trajectory data for the first well. Must contain a column named 'md' representing measured depth (in feet).
        The DataFrame should represent the lateral portion of the well only.

    well_B : pd.DataFrame
        Trajectory data for the second well. Must contain a column named 'md' representing measured depth (in feet).
        The DataFrame should represent the lateral portion of the well only.

    Returns
    -------
    float
        The percentage of overlap between the two MD intervals, relative to the shorter well's lateral length.
        Returns 0.0 if either well is None, empty, lacks an 'md' column, has no valid
        (non-NaN) MD values, or if the intervals do not overlap.

    Examples
    --------
    >>> calculate_overlap(pd.DataFrame({"md": [0, 100]}), pd.DataFrame({"md": [50, 250]}))
    50.0
    """
    # Honour the documented contract: unusable input yields 0.0 instead of a KeyError.
    for well in (well_A, well_B):
        if well is None or well.empty or "md" not in well.columns:
            return 0.0

    start_A, end_A = well_A["md"].min(), well_A["md"].max()
    start_B, end_B = well_B["md"].min(), well_B["md"].max()

    # An all-NaN 'md' column produces NaN bounds, which would poison max()/min() below.
    if pd.isna([start_A, end_A, start_B, end_B]).any():
        return 0.0

    overlap_start = max(start_A, start_B)
    overlap_end = min(end_A, end_B)

    if overlap_start >= overlap_end:
        return 0.0

    overlap_length = overlap_end - overlap_start
    shorter_length = min(end_A - start_A, end_B - start_B)

    if shorter_length <= 0:
        return 0.0

    return float((overlap_length / shorter_length) * 100)


In [76]:
def calculate_lateral_adjacency_bounds(
    spacing_df: pd.DataFrame,
    trajectory_data: Union[pd.DataFrame, Dict[str, pd.DataFrame]],
    vertical_cutoff: float = 125.0,
    horizontal_cutoff: float = 1800.0
) -> pd.DataFrame:
    """
    For each well_i, find the closest E and W well_k neighbors by horizontal distance.
    Compute lateral overlap % only if vertical/horizontal cutoffs are satisfied.
    Returns the highest overlap as bound_1 and the second as bound_2.

    Parameters
    ----------
    spacing_df : pd.DataFrame
        Spacing output from WellSpacingCalculator, with at least:
        ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', 'direction_to_k_from_i']

    trajectory_data : Union[pd.DataFrame, Dict[str, pd.DataFrame]]
        Either:
            - A pre-filtered lateral-only DataFrame with a 'uwi' column and 'md'
            - OR a dictionary mapping UWI to its corresponding trajectory DataFrame
        UWIs are compared as strings in both cases.

    vertical_cutoff : float
        Maximum vertical distance (ft) allowed to compute lateral overlap

    horizontal_cutoff : float
        Maximum horizontal distance (ft) allowed to compute lateral overlap

    Returns
    -------
    pd.DataFrame
        One row per unique well_i in `spacing_df`, always with the columns
        ['well_i', 'bound_1', 'bound_1_uwi', 'bound_1_direction',
                   'bound_2', 'bound_2_uwi', 'bound_2_direction'].
        A bound is left empty (NaN / None) when the well has no neighbor in that
        direction or the nearest neighbor fails a cutoff. A neighbor that passes the
        cutoffs but has no usable trajectory gets an overlap of 0.0.

    Raises
    ------
    ValueError
        If `trajectory_data` is a DataFrame without 'uwi'/'md' columns, or is neither
        a DataFrame nor a dictionary.
    """
    # Step 1: Build a {str(uwi): trajectory} lookup. Keys are normalised to str
    # because the lookups below use str(well_i) / str(well_k).
    if isinstance(trajectory_data, pd.DataFrame):
        if "uwi" not in trajectory_data.columns or "md" not in trajectory_data.columns:
            raise ValueError("Trajectory DataFrame must contain 'uwi' and 'md' columns.")

        trajectory_dict = {
            str(uwi): df.drop(columns=["uwi"], errors="ignore").reset_index(drop=True)
            for uwi, df in trajectory_data.groupby("uwi")
        }
    elif isinstance(trajectory_data, dict):
        trajectory_dict = {str(uwi): df for uwi, df in trajectory_data.items()}
    else:
        raise ValueError("trajectory_data must be a DataFrame or a dictionary.")

    # Step 2: Filter for E/W directions only
    ew_df = spacing_df[spacing_df["direction_to_k_from_i"].isin(["E", "W"])]

    # Steps 3-4: nearest neighbor per (well_i, direction). drop_duplicates keeps the
    # whole first row; GroupBy.first() would instead take the first non-null value
    # of each column independently and could mix values from different pairs.
    closest_df = (
        ew_df.sort_values(["well_i", "direction_to_k_from_i", "horizontal_dist"])
        .drop_duplicates(subset=["well_i", "direction_to_k_from_i"], keep="first")
    )

    all_well_ids = spacing_df["well_i"].unique()
    output = pd.DataFrame(index=pd.Index(all_well_ids, name="well_i"))

    for direction, suffix in (("E", "1"), ("W", "2")):
        dir_df = closest_df[closest_df["direction_to_k_from_i"] == direction]

        overlaps, uwis, dirs = [], [], []

        for well_i, well_k, h_dist, v_dist in zip(
            dir_df["well_i"], dir_df["well_k"], dir_df["horizontal_dist"], dir_df["vertical_dist"]
        ):
            if h_dist <= horizontal_cutoff and v_dist <= vertical_cutoff:
                traj_i = trajectory_dict.get(str(well_i))
                traj_k = trajectory_dict.get(str(well_k))
                if traj_i is None or traj_k is None or traj_i.empty or traj_k.empty:
                    # No usable trajectory for one of the wells: no measurable overlap.
                    percent = 0.0
                else:
                    percent = calculate_overlap(traj_i, traj_k)
                overlaps.append(percent)
                uwis.append(str(well_k))
                dirs.append(direction)
            else:
                overlaps.append(float("nan"))
                uwis.append(None)
                dirs.append(None)

        # Explicit dtypes keep the columns well-typed even when this direction is empty,
        # so all six bound_* columns always exist after the join.
        partial_index = pd.Index(dir_df["well_i"].to_numpy(), name="well_i")
        partial_df = pd.DataFrame({
            f"bound_{suffix}": pd.Series(overlaps, index=partial_index, dtype="float64"),
            f"bound_{suffix}_uwi": pd.Series(uwis, index=partial_index, dtype="object"),
            f"bound_{suffix}_direction": pd.Series(dirs, index=partial_index, dtype="object"),
        })

        output = output.join(partial_df, how="left")

    # Step 5: Reorder so that bound_1 is the higher of the two. A present bound_2 is
    # also promoted when bound_1 is missing (including a valid 0% overlap).
    swap_mask = output["bound_2"].notna() & (
        output["bound_1"].isna() | (output["bound_2"] > output["bound_1"])
    )

    for field in ("", "_uwi", "_direction"):
        first_col, second_col = f"bound_1{field}", f"bound_2{field}"
        new_first = output[second_col].where(swap_mask, output[first_col])
        new_second = output[first_col].where(swap_mask, output[second_col])
        output[first_col], output[second_col] = new_first, new_second

    return output.reset_index()

In [77]:
# Compute E/W lateral adjacency bounds for every well in the saved spacing table.
lateral_adjacency_df = calculate_lateral_adjacency_bounds(
    spacing_df = df_spacing_old.rename(columns={"direction_from_i_to_k": "direction_to_k_from_i"}),  # saved batches use 'direction_from_i_to_k'; the function reads 'direction_to_k_from_i'
    trajectory_data = df_utm_lateral,  # pre-filtered lateral-only with 'uwi' and 'md'
    vertical_cutoff=125,
    horizontal_cutoff=1800
)

In [78]:
# Display the adjacency bounds table (one row per well_i; empty cells mean no qualifying neighbor).
lateral_adjacency_df

Unnamed: 0,well_i,bound_1,bound_1_uwi,bound_1_direction,bound_2,bound_2_uwi,bound_2_direction
0,42003463520000,,,,,,
1,42003468560100,,,,,,
2,42003469560000,,,,,,
3,42003469570000,,,,,,
4,42003470430000,99.617293,42003473220000,W,,,
...,...,...,...,...,...,...,...
20913,42501375310000,100.000000,42501375300000,W,99.305556,42501375320000,E
20914,42501375320000,99.305556,42501375310000,W,,,
20915,42501375330000,,,,,,
20916,42501375540000,97.444946,42501368370000,E,96.812526,42501366520000,W


In [84]:
# Spot-check: E/W pairs from well 42003475850100 to any well whose UWI starts with
# 4200348756, nearest first.
(
    df_spacing_old
    .loc[
        lambda pairs: (pairs["well_i"] == "42003475850100")
        & (pairs["well_k"].str.startswith("4200348756"))
        & (pairs["direction_from_i_to_k"].isin(["E", "W"]))
    ]
    .sort_values("horizontal_dist")
)

Unnamed: 0,well_i,well_k,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k


In [None]:
# Scratch cell: bare integer literal (same digits as the UWI prefix filtered on in the previous cell).
# It defines nothing and can be deleted.
4200348756

91.54177786902073