# Spacing Statistics

## 1. Importing / Installing Packages

In [1]:
import pandas as pd # Importing pandas package

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from typing import Dict, Tuple, List, Union # Importing specific types from typing module

import re # Importing regular expression package

import time

from src.database_manager import DatabricksOdbcConnector # Importing DatabricksOdbcConnector class from database_manager module

from pyproj import Geod # Importing Geod from pyproj package

from matplotlib import pyplot as plt # Importing pyplot from matplotlib for plotting

# Setting matplotlib to inline mode for Jupyter notebooks
%matplotlib inline

%config InlineBackend.figure_format = 'svg' # Configuring inline backend to use SVG format for figures

from src.components.well_data_manager import WellDataLoader, GeoSurveyProcessor # Importing custom classes for well data management

## 2. Defining Functions

### 2.1. Defining Functions that is used in calculation for i-k pair dataframe and Spacing Stats

In [16]:
class WellSpacingCalculator:
    """
    Class for calculating well spacing metrics and directional relationships using
    3D lateral midpoint alignment and curvature-aware distances.
    Midpoints are projected in 2D space to remove lateral-length bias when calculating spacing.
    """

    def __init__(self, trajectories: Union[Dict[str, pd.DataFrame], pd.DataFrame]):
        if isinstance(trajectories, pd.DataFrame):
            if "uwi" not in trajectories.columns:
                raise ValueError("Trajectory DataFrame must contain 'uwi' column.")
            self._trajectory_df = trajectories.reset_index(drop=True)
            self.trajectories = {
                cid: group for cid, group in self._trajectory_df.groupby("uwi")
            }
        elif isinstance(trajectories, dict):
            self.trajectories = trajectories
            self._trajectory_df = pd.concat(
                trajectories.values(), keys=trajectories.keys()
            ).reset_index(drop=True)
        else:
            raise ValueError("Invalid type for trajectories. Must be DataFrame or Dict.")

    def _apply_vectorized_geod(self, lat: np.ndarray, lon: np.ndarray, mask: np.ndarray) -> np.ndarray:
        geod = Geod(ellps="WGS84")
        n = len(lat)
        lat1, lat2 = np.meshgrid(lat, lat)
        lon1, lon2 = np.meshgrid(lon, lon)
        az12, az21, dist_m = geod.inv(lon1[mask], lat1[mask], lon2[mask], lat2[mask])
        dist_ft = dist_m * 3.28084
        dist_matrix = np.zeros((n, n))
        dist_matrix[mask] = dist_ft
        return dist_matrix

    def _compute_lateral_midpoints(self) -> pd.DataFrame:
        return self._trajectory_df.groupby("uwi")[["x", "y", "tvd", "latitude", "longitude"]].mean()
    
    def _compute_drill_directions(self) -> pd.Series:
        median_azimuth = self._trajectory_df.groupby("uwi")["azimuth"].median()
        is_ew = ((median_azimuth >= 45) & (median_azimuth <= 135)) | ((median_azimuth >= 225) & (median_azimuth <= 315))
        return np.where(is_ew, "EW", "NS")

    def _get_pairwise_indices(self, n: int) -> Tuple[np.ndarray, np.ndarray]:
        i_idx, k_idx = np.meshgrid(np.arange(n), np.arange(n), indexing="ij")
        mask = i_idx != k_idx  # Remove self-comparisons
        return i_idx[mask], k_idx[mask]
    
    def _get_relative_cardinal_direction(
        self,
        lat: np.ndarray,
        lon: np.ndarray,
        i_idx: np.ndarray,
        k_idx: np.ndarray
    ) -> np.ndarray:
        lat1 = lat[i_idx]
        lat2 = lat[k_idx]
        lon1 = lon[i_idx]
        lon2 = lon[k_idx]

        lat_diff = lat1 - lat2
        lon_diff = lon1 - lon2

        vertical = np.abs(lat_diff) > np.abs(lon_diff)
        is_south = lat_diff > 0
        is_west = lon_diff > 0

        return np.select(
            [vertical & is_south, vertical & ~is_south, ~vertical & is_west, ~vertical & ~is_west],
            ["S", "N", "W", "E"]
        )

    def _calculate_distances(
        self,
        coords: np.ndarray,
        lat_lon: np.ndarray,
        i_idx: np.ndarray,
        k_idx: np.ndarray,
        curvature_threshold_ft: float,
        use_geod: bool
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        A = coords[i_idx]  # well_i (reference)
        B = coords[k_idx]  # well_k

        vertical = np.abs(A[:, 2] - B[:, 2])

        # Align B vertically to A
        B[:, 2] = A[:, 2]

        # Project B's midpoint onto A's lateral plane
        mid_x = (A[:, 0] + B[:, 0]) / 2
        mid_y = (A[:, 1] + B[:, 1]) / 2

        B[:, 0] = A[:, 0]  # overwrite B's x with A's x to force alignment
        B[:, 1] = A[:, 1]  # overwrite B's y with A's y

        dx = mid_x - A[:, 0]
        dy = mid_y - A[:, 1]
        horizontal = np.sqrt(dx ** 2 + dy ** 2)

        if use_geod:
            lat = lat_lon[:, 0] / 364_000
            lon = lat_lon[:, 1] / 364_000
            lat1, lat2 = lat[i_idx], lat[k_idx]
            lon1, lon2 = lon[i_idx], lon[k_idx]
            long_mask = horizontal > curvature_threshold_ft
            if np.any(long_mask):
                geo = Geod(ellps="WGS84")
                _, _, dist_m = geo.inv(lon1[long_mask], lat1[long_mask], lon2[long_mask], lat2[long_mask])
                horizontal[long_mask] = dist_m * 3.28084

        dist3d = np.sqrt(horizontal ** 2 + vertical ** 2)
        return horizontal, vertical, dist3d

    def calculate_spacing_statistics(self, curvature_threshold_ft: float = 26400.0, use_geod: bool = True) -> pd.DataFrame:
        """
        Compute spacing distances between well pairs by aligning TVD and laterally projecting
        midpoints in 2D to remove length-induced bias. Adds drill direction metadata and 
        relative cardinal direction from well_i to well_k.
        
        Returns:
            pd.DataFrame with spacing and directional metadata for all well pairs:
            ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', '3D_dist',
            'drill_direction_i', 'drill_direction_k', 'direction_from_i_to_k']
        """
        # 1. Get lateral midpoints and drill directions
        midpoint_df = self._compute_lateral_midpoints()
        drill_dirs = self._compute_drill_directions()
        midpoint_df["drill_direction"] = drill_dirs

        # 2. Prepare arrays
        ids = midpoint_df.index.to_numpy()
        coords = midpoint_df[["x", "y", "tvd"]].to_numpy()
        lat = midpoint_df["latitude"].to_numpy()
        lon = midpoint_df["longitude"].to_numpy()
        directions = midpoint_df["drill_direction"].to_numpy()
        n = len(ids)

        # 3. Pairwise indices
        i_idx, k_idx = self._get_pairwise_indices(n)

        # 4. Distances
        horizontal, vertical, dist3d = self._calculate_distances(
            coords, np.stack([lat, lon], axis=1), i_idx, k_idx, curvature_threshold_ft, use_geod
        )

        # 5. Relative cardinal direction from well_i to well_k
        direction_from_i_to_k = self._get_relative_cardinal_direction(lat, lon, i_idx, k_idx)

        # 6. Return final DataFrame
        return pd.DataFrame({
            "well_i": ids[i_idx],
            "well_k": ids[k_idx],
            "horizontal_dist": horizontal,
            "vertical_dist": vertical,
            "3D_dist": dist3d,
            "drill_direction_i": directions[i_idx],
            "drill_direction_k": directions[k_idx],
            "direction_from_i_to_k": direction_from_i_to_k,
        })
    
    def summarize_spacing_statistics(self, spacing_df: pd.DataFrame) -> Dict[str, float]:
        avg_spacing = spacing_df["3D_dist"].mean()
        spacing_var = spacing_df["3D_dist"].var()
        avg_horizontal = spacing_df["horizontal_dist"].mean()
        acres_per_well = avg_horizontal ** 2 / 43_560
        density = 1 / acres_per_well if acres_per_well > 0 else float("nan")
        return {
            "average_spacing_3D_ft": avg_spacing,
            "spacing_variance": spacing_var,
            "estimated_well_density_per_acre": density
        }

    def plot_spacing_cdf(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> None:
        values = np.sort(spacing_df[column].dropna())
        cdf = np.linspace(0, 1, len(values))
        plt.figure(figsize=(8, 5))
        plt.plot(values, cdf, label=f"CDF of {column}")
        plt.xlabel(f"{column} (ft)")
        plt.ylabel("Cumulative Probability")
        plt.title(f"CDF of Well {column.replace('_', ' ').title()}")
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.show()

    def plot_spacing_histogram(self, spacing_df: pd.DataFrame, column: str = "3D_dist", bins: int = 30) -> None:
        plt.figure(figsize=(8, 5))
        plt.hist(spacing_df[column].dropna(), bins=bins, edgecolor='k', alpha=0.7)
        plt.xlabel(f"{column} (ft)")
        plt.ylabel("Frequency")
        plt.title(f"Histogram of {column.replace('_', ' ').title()}")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def print_spacing_percentiles(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> Dict[str, float]:
        percentiles = spacing_df[column].quantile([0.1, 0.5, 0.9]).to_dict()
        return {
            "P10": percentiles.get(0.1, float("nan")),
            "P50": percentiles.get(0.5, float("nan")),
            "P90": percentiles.get(0.9, float("nan"))
        }

    def filter_spacing_by_metadata(self, spacing_df: pd.DataFrame, metadata_df: pd.DataFrame, by: str = "operator") -> pd.DataFrame:
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        spacing_df = spacing_df.copy()
        spacing_df["group_i"] = spacing_df["well_i"].map(group_map)
        spacing_df["group_k"] = spacing_df["well_k"].map(group_map)
        return spacing_df[spacing_df["group_i"] == spacing_df["group_k"]].drop(columns=["group_i", "group_k"])

    def group_spacing_summary(
        self,
        spacing_df: pd.DataFrame,
        metadata_df: pd.DataFrame,
        by: str = "operator",
        return_group_dataframes: bool = False
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]]:
        """
        Computes group-level well spacing statistics by applying summarize_spacing_statistics
        to each group. Ensures consistent metric calculation and optionally returns group-level
        spacing DataFrames for downstream analysis or plotting.

        Parameters
        ----------
        spacing_df : pd.DataFrame
            DataFrame of well pair distances with columns:
            ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', '3D_dist']

        metadata_df : pd.DataFrame
            Metadata table mapping 'uwi' to grouping information like operator or basin.
            Must contain columns ['uwi', <by>].

        by : str, optional
            Column in metadata_df used for grouping, by default "operator".

        return_group_dataframes : bool, optional
            If True, also returns a dictionary of spacing DataFrames per group.

        Returns
        -------
        pd.DataFrame
            A DataFrame with spacing summaries per group, containing:
            ['average_spacing_3D_ft', 'spacing_variance', 'estimated_well_density_per_acre', 'count']

        Tuple[pd.DataFrame, Dict[str, pd.DataFrame]], optional
            If return_group_dataframes is True, also returns a dictionary mapping group names
            to filtered spacing DataFrames.
        """
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        spacing_df = spacing_df.copy()
        spacing_df["group_i"] = spacing_df["well_i"].map(group_map)
        spacing_df["group_k"] = spacing_df["well_k"].map(group_map)

        # Filter for intra-group spacing pairs
        spacing_df = spacing_df[spacing_df["group_i"] == spacing_df["group_k"]]
        spacing_df["group"] = spacing_df["group_i"]
        spacing_df = spacing_df.drop(columns=["group_i", "group_k"])

        # Group by and apply summarize_spacing_statistics
        group_summaries = []
        group_dataframes = {}

        for group_name, group_df in spacing_df.groupby("group"):
            summary = self.summarize_spacing_statistics(group_df)
            summary[by] = group_name
            summary["count"] = len(group_df)
            group_summaries.append(summary)

            if return_group_dataframes:
                group_dataframes[group_name] = group_df.reset_index(drop=True)

        summary_df = pd.DataFrame(group_summaries)[[by, "average_spacing_3D_ft", "spacing_variance", "estimated_well_density_per_acre", "count"]]

        if return_group_dataframes:
            return summary_df, group_dataframes

        return summary_df


## 3. Loading Header and GeoSurvey either from Excel/csv/SQL into Pandas DataFrame

In [3]:
loader = WellDataLoader(db = DatabricksOdbcConnector(), 
                        log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs")

In [5]:
df_MB_header = loader.get_header_data(basin="MB", start_year=2020)

[WellDataLoaderLogger] INFO (04-21 11:08 PM): Loading header data from SQL. (Line: 86) [well_data_manager.py]



  result_df = pd.read_sql(sql_query, self.connection)


In [6]:
df_MB_directional = loader.get_directional_data()

[WellDataLoaderLogger] INFO (04-21 11:12 PM): Loading directional data from SQL. (Line: 106) [well_data_manager.py]

  result_df = pd.read_sql(sql_query, self.connection)


In [8]:
processor = GeoSurveyProcessor(log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs")

[GeoLogger] INFO (04-21 11:20 PM): GeoSurveyProcessor initialized. (Line: 187) [well_data_manager.py]



In [12]:
df_utm = processor.compute_utm_coordinates(df=df_MB_directional)

[GeoLogger] INFO (04-21 11:29 PM): ✅ Using lat/lon from input DataFrame. (Line: 263) [well_data_manager.py]

[GeoLogger] INFO (04-21 11:29 PM): ✅ UTM coordinate computation complete in 3.90 sec. (Line: 318) [well_data_manager.py]



In [13]:
df_utm_lateral = processor.filter_after_heel_point(df=df_utm)

In [17]:
spacing = WellSpacingCalculator(trajectories=df_utm_lateral)

In [19]:
df_ik_pairs = spacing.calculate_spacing_statistics()

In [20]:
df_ik_pairs

Unnamed: 0,well_i,well_k,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
0,42003475690000,42003478570000,6085.055733,656.927104,6120.413098,NS,NS,W
1,42003475690000,42003478580000,6290.803568,655.050543,6324.816261,NS,NS,W
2,42003475690000,42003478590000,6455.667802,671.423277,6490.489657,NS,NS,W
3,42003475690000,42003478600000,6783.709784,666.123639,6816.336196,NS,NS,W
4,42003475690000,42003479040000,0.411817,1749.591867,1749.591915,NS,NS,W
...,...,...,...,...,...,...,...,...
160085751,42501375580000,42501375300000,14442.832976,41.323882,14442.892094,NS,NS,W
160085752,42501375580000,42501375310000,14124.734704,40.772406,14124.793551,NS,NS,W
160085753,42501375580000,42501375320000,13810.659118,39.173390,13810.714675,NS,NS,W
160085754,42501375580000,42501375330000,3345.284576,79.516769,3346.229492,NS,NS,N


In [24]:
df_ik_pairs[(df_ik_pairs['well_i'].isin(['42003478570000']))]

Unnamed: 0,well_i,well_k,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
12652,42003478570000,42003475690000,6085.055733,656.927104,6120.413098,NS,NS,E
12653,42003478570000,42003478580000,318.525135,1.876561,318.530663,NS,NS,W
12654,42003478570000,42003478590000,645.542573,14.496173,645.705314,NS,NS,W
12655,42003478570000,42003478600000,988.351414,9.196535,988.394200,NS,NS,W
12656,42003478570000,42003479040000,0.400829,1092.664763,1092.664836,NS,NS,W
...,...,...,...,...,...,...,...,...
25299,42003478570000,42501375310000,0.959082,4274.444182,4274.444290,NS,NS,N
25300,42003478570000,42501375320000,0.957622,4276.043198,4276.043306,NS,NS,N
25301,42003478570000,42501375330000,0.889058,4394.733357,4394.733447,NS,NS,N
25302,42003478570000,42501375540000,0.960581,4294.617630,4294.617738,NS,NS,W
