# Spacing Statistics

## 1. Importing / Installing Packages

In [1]:
import os # Importing os module for operating system dependent functionality

import pandas as pd # Importing pandas package

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from typing import Dict, Tuple, List, Union, Optional # Importing specific types from typing module

from src.database_manager import DatabricksOdbcConnector # Importing DatabricksOdbcConnector class from database_manager module

from pyproj import Geod # Importing Geod from pyproj package

from tqdm import tqdm # Importing tqdm for progress bar functionality

from joblib import Parallel, delayed # Importing Parallel and delayed for parallel processing

from matplotlib import pyplot as plt # Importing pyplot from matplotlib for plotting

# Setting matplotlib to inline mode for Jupyter notebooks
%matplotlib inline

%config InlineBackend.figure_format = 'svg' # Configuring inline backend to use SVG format for figures

from src.components.well_data_manager import WellDataLoader, GeoSurveyProcessor # Importing custom classes for well data management

from src.utils import reorder_columns # Importing utility function to reorder DataFrame columns

## 2. Defining Functions

### 2.1. Defining the functions used to build the i-k well-pair DataFrame and compute spacing statistics

In [15]:
class WellSpacingCalculator:
    """
    Class for calculating well spacing metrics and directional relationships using
    3D lateral midpoint alignment and curvature-aware distances.
    Midpoints are projected in 2D space to remove lateral-length bias when calculating spacing.
    """

    def __init__(self, trajectories: Union[Dict[str, pd.DataFrame], pd.DataFrame]):
        if isinstance(trajectories, pd.DataFrame):
            if "uwi" not in trajectories.columns:
                raise ValueError("Trajectory DataFrame must contain 'uwi' column.")
            self._trajectory_df = trajectories.reset_index(drop=True)
            self.trajectories = {
                cid: group for cid, group in self._trajectory_df.groupby("uwi")
            }
        elif isinstance(trajectories, dict):
            self.trajectories = trajectories
            self._trajectory_df = pd.concat(
                trajectories.values(), keys=trajectories.keys()
            ).reset_index(drop=True)
        else:
            raise ValueError("Invalid type for trajectories. Must be DataFrame or Dict.")

    def _apply_vectorized_geod(
        self,
        lat1: np.ndarray,
        lon1: np.ndarray,
        lat2: np.ndarray,
        lon2: np.ndarray
    ) -> np.ndarray:
        """
        Element-wise geodesic distance on the WGS84 ellipsoid between the points
        (lat1, lon1) and (lat2, lon2).

        Returns
        -------
        np.ndarray
            Distances in feet (metres from the inverse solution times 3.28084).
        """
        wgs84 = Geod(ellps="WGS84")
        # Geod.inv expects longitude before latitude; the third returned item is
        # the distance in metres (the first two, the azimuths, are not needed).
        inverse_solution = wgs84.inv(lon1, lat1, lon2, lat2)
        return inverse_solution[2] * 3.28084

    def _compute_normalized_midpoints(self, frac: float = 0.5) -> pd.DataFrame:
        """
        Vectorized interpolation of normalized lateral midpoints (x/y/tvd/lat/lon)
        across all wells using relative lateral position (0–1).
        """

        df = self._trajectory_df.copy()
        df = df.sort_values(["uwi", "md"]).reset_index(drop=True)

        # Compute normalized MD (0.0 - 1.0) per well
        min_md = df.groupby("uwi")["md"].transform("min")
        max_md = df.groupby("uwi")["md"].transform("max")
        df["normalized_md"] = (df["md"] - min_md) / (max_md - min_md)

        # Get the row index within each well group
        df["row_index"] = df.groupby("uwi").cumcount()

        # Find the segment that surrounds the target frac
        df["prev_idx"] = df.groupby("uwi")["normalized_md"].transform(lambda x: x.searchsorted(frac, side="right") - 1)
        df["next_idx"] = df["prev_idx"] + 1

        # Clip next_idx so it does not exceed available rows
        df["next_idx"] = np.minimum(df["next_idx"], df["row_index"])

        # Extract only the prev and next rows
        df_prev = df[df["row_index"] == df["prev_idx"]].copy()
        df_next = df[df["row_index"] == df["next_idx"]].copy()

        # Merge prev and next rows for interpolation
        # Step 1: Ensure only one prev and next row per UWI
        df_prev = df.groupby("uwi").apply(
            lambda g: g.loc[g["row_index"] == g["prev_idx"].iloc[0]]
        ).reset_index(drop=True)

        df_next = df.groupby("uwi").apply(
            lambda g: g.loc[g["row_index"] == g["next_idx"].iloc[0]]
        ).reset_index(drop=True)

        # Step 2: Merge one-to-one
        merged = pd.merge(df_prev, df_next, on="uwi", suffixes=("_prev", "_next"))

        # Compute interpolation ratio
        delta = merged["normalized_md_next"] - merged["normalized_md_prev"]
        delta = delta.replace(0, np.nan)  # Avoid division by zero
        ratio = (frac - merged["normalized_md_prev"]) / delta

        # Interpolate coordinates
        result = {
            "x": merged["x_prev"] + ratio * (merged["x_next"] - merged["x_prev"]),
            "y": merged["y_prev"] + ratio * (merged["y_next"] - merged["y_prev"]),
            "tvd": merged["tvd_prev"] + ratio * (merged["tvd_next"] - merged["tvd_prev"]),
            "latitude": merged["latitude_prev"] + ratio * (merged["latitude_next"] - merged["latitude_prev"]),
            "longitude": merged["longitude_prev"] + ratio * (merged["longitude_next"] - merged["longitude_prev"]),
        }

        midpoint_df = pd.DataFrame(result)
        midpoint_df["uwi"] = merged["uwi"]
        return midpoint_df.set_index("uwi")

    def _compute_drill_directions(self) -> pd.Series:
        median_azimuth = self._trajectory_df.groupby("uwi")["azimuth"].median()
        is_ew = ((median_azimuth >= 45) & (median_azimuth <= 135)) | ((median_azimuth >= 225) & (median_azimuth <= 315))
        return pd.Series(np.where(is_ew, "EW", "NS"), index=median_azimuth.index, name="drill_direction")

    def _filter_close_pairs(self, lat: np.ndarray, lon: np.ndarray, max_distance_miles: float = 20.0) -> Tuple[np.ndarray, np.ndarray]:
        n = len(lat)
        lat1, lat2 = np.meshgrid(lat, lat, indexing="ij")
        lon1, lon2 = np.meshgrid(lon, lon, indexing="ij")

        delta_lat = np.abs(lat1 - lat2)
        delta_lon = np.abs(lon1 - lon2)

        miles_per_lat_degree = 69.0
        miles_per_lon_degree = 69.0 * np.cos(np.radians(lat))
        miles_per_lon_degree_matrix = np.add.outer(miles_per_lon_degree, miles_per_lon_degree) / 2.0

        rough_dist_miles = np.sqrt(
            (delta_lat * miles_per_lat_degree)**2 + (delta_lon * miles_per_lon_degree_matrix)**2
        )

        mask = (rough_dist_miles <= max_distance_miles) & (delta_lat + delta_lon > 0)
        i_idx, k_idx = np.where(mask)

        return i_idx, k_idx

    def _get_pairwise_indices(self, uwis: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate all valid pairwise (i, k) UWI combinations from an array of well IDs,
        excluding self-comparisons (i != k).

        Parameters
        ----------
        uwis : np.ndarray
            Array of unique well identifiers.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            Two 1D arrays of i and k UWIs representing all valid (i, k) pairs.
        """
        # Generate meshgrid of al possible UWI pairs
        n = len(uwis)
        i_idx, k_idx = np.meshgrid(np.arange(n), np.arange(n), indexing="ij")
            
        # Exclude self-comparisons (where i_uwi == k_uwi)
        valid_mask = i_idx != k_idx

        return i_idx[valid_mask], k_idx[valid_mask]
    
    def _get_relative_cardinal_direction(
        self,
        lat: np.ndarray,
        lon: np.ndarray,
        i_idx: np.ndarray,
        k_idx: np.ndarray
    ) -> np.ndarray:
        lat1, lat2 = lat[i_idx], lat[k_idx]
        lon1, lon2 = lon[i_idx], lon[k_idx]

        lat_diff = lat1 - lat2
        lon_diff = lon1 - lon2

        vertical = np.abs(lat_diff) > np.abs(lon_diff)
        is_south = lat_diff > 0
        is_west = lon_diff > 0

        return np.select(
            [vertical & is_south, vertical & ~is_south, ~vertical & is_west, ~vertical & ~is_west],
            ["S", "N", "W", "E"]
        )
    
    def _process_batch(self, i_idx: np.ndarray, k_idx: np.ndarray, ids: np.ndarray, coords: np.ndarray, lat_lon: np.ndarray, directions: np.ndarray, curvature_threshold_ft: float, use_geod: bool) -> pd.DataFrame:
        i_uwi = ids[i_idx]
        k_uwi = ids[k_idx]

        horizontal, vertical, dist3d = self._calculate_distances(coords, lat_lon, i_idx, k_idx, curvature_threshold_ft, use_geod)
        direction_from_i_to_k = self._get_relative_cardinal_direction(lat_lon[:, 0], lat_lon[:, 1], i_idx, k_idx)

        return pd.DataFrame({
            "well_i": i_uwi,
            "well_k": k_uwi,
            "horizontal_dist": horizontal,
            "vertical_dist": vertical,
            "3D_dist": dist3d,
            "drill_direction_i": directions[i_idx],
            "drill_direction_k": directions[k_idx],
            "direction_from_i_to_k": direction_from_i_to_k,
        })
    
    def _batch_filtered_indices(self, pairs: List[Tuple[int, int]], batch_size: int = 1_000_000):
        """
        Vectorized batching of prefiltered well pairs.

        Parameters
        ----------
        pairs : List[Tuple[int, int]]
            List of (i_idx, k_idx) pairs.
        batch_size : int
            Number of pairs per batch.

        Yields
        ------
        Tuple[np.ndarray, np.ndarray]
            i_idx and k_idx arrays for each batch.
        """
        pairs_array = np.array(pairs)  # Convert list of tuples directly to 2D array (N, 2)
        n_pairs = pairs_array.shape[0]

        # Vectorized slicing
        split_indices = np.arange(0, n_pairs, batch_size)

        for start_idx in split_indices:
            end_idx = min(start_idx + batch_size, n_pairs)
            batch = pairs_array[start_idx:end_idx]
            yield batch[:, 0], batch[:, 1]

    def _calculate_distances(
    self,
    coords: np.ndarray,
    lat_lon: np.ndarray,
    i_idx: np.ndarray,
    k_idx: np.ndarray,
    curvature_threshold_ft: float,
    use_geod: bool
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Calculates horizontal, vertical, and 3D distances between well pairs
        based on normalized midpoint coordinates.
        """
        A = coords[i_idx]  # shape (N, 3): x, y, tvd
        B = coords[k_idx]

        B_aligned = B.copy()

        B_aligned[:, 1] = A[:, 1]  # Force y of B to match y of A

        dx = B_aligned[:, 0] - A[:, 0]
        dy = B_aligned[:, 1] - A[:, 1]
        dz = B_aligned[:, 2] - A[:, 2]

        horizontal = np.sqrt(dx**2 + dy**2)
        vertical = np.abs(dz)
        dist3d = np.sqrt(horizontal**2 + vertical**2)

        if use_geod:
            long_mask = horizontal > curvature_threshold_ft
            if np.any(long_mask):
                lat = lat_lon[:, 0]
                lon = lat_lon[:, 1]
                lat1 = lat[i_idx[long_mask]]
                lon1 = lon[i_idx[long_mask]]
                lat2 = lat[k_idx[long_mask]]
                lon2 = lon[k_idx[long_mask]]

                # 🔥 Now call the optimized function!
                horizontal[long_mask] = self._apply_vectorized_geod(lat1, lon1, lat2, lon2)

                # Update 3D distance with corrected horizontal
                dist3d[long_mask] = np.sqrt(horizontal[long_mask]**2 + vertical[long_mask]**2)

        return horizontal, vertical, dist3d

    def calculate_spacing_statistics(
        self,
        curvature_threshold_ft: float = 26400.0,
        use_geod: bool = True,
        frac: float = 0.5,
        batch_size: int = 1_000_000,
        max_distance_miles: Optional[float] = 20.0,
        save_batches_dir: Optional[str] = None
    ) -> Optional[pd.DataFrame]:
        """
        Compute spacing distances between well pairs using normalized lateral midpoints,
        and attach drill-direction metadata to each pair.

        Parameters
        ----------
        curvature_threshold_ft : float
            Use the geodetic distance above this horizontal distance
            (default 26,400 ft = 5 miles).
        use_geod : bool
            Whether to apply the geodetic distance correction.
        frac : float
            Fractional position along the lateral used for the midpoint (default 0.5).
        batch_size : int
            Number of well pairs processed per parallel task. Must be positive.
        max_distance_miles : float, optional
            Pairs farther apart than this (approximate distance) are skipped.
            Pass None to evaluate every ordered pair of wells.
        save_batches_dir : str, optional
            When given, each batch is written to
            '<save_batches_dir>/spacing_batch_NNNN.parquet' and nothing is returned.
            Files already in that directory are not removed.

        Returns
        -------
        pd.DataFrame or None
            None when `save_batches_dir` is set. Otherwise a DataFrame with columns
            ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', '3D_dist',
            'drill_direction_i', 'drill_direction_k', 'direction_from_i_to_k'];
            it is empty when no pair qualifies.

        Raises
        ------
        ValueError
            If `batch_size` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer.")

        # 1. Get normalized lateral midpoints and drill directions
        midpoint_df = self._compute_normalized_midpoints(frac=frac)
        midpoint_df["drill_direction"] = self._compute_drill_directions()

        # 2. Prepare arrays
        ids = midpoint_df.index.to_numpy()
        coords = midpoint_df[["x", "y", "tvd"]].to_numpy()
        lat_lon = midpoint_df[["latitude", "longitude"]].to_numpy()
        directions = midpoint_df["drill_direction"].to_numpy()

        # 3. Select the pairs to evaluate
        if max_distance_miles is not None:
            i_idx, k_idx = self._filter_close_pairs(lat_lon[:, 0], lat_lon[:, 1], max_distance_miles)
        else:
            i_idx, k_idx = self._get_pairwise_indices(ids)

        # Keep the pairs as one (N, 2) integer array; a Python list of tuples costs
        # far more memory and time for tens of millions of pairs. The batching helper
        # converts its input with np.array, so an array is accepted as-is.
        pair_array = np.column_stack((i_idx, k_idx))
        batches = list(self._batch_filtered_indices(pair_array, batch_size=batch_size))
        n_batches = len(batches)

        if save_batches_dir:
            os.makedirs(save_batches_dir, exist_ok=True)

        def process_and_save(batch_number: int, batch_i: np.ndarray, batch_k: np.ndarray):
            batch_df = self._process_batch(
                batch_i, batch_k, ids, coords, lat_lon, directions, curvature_threshold_ft, use_geod
            )

            if save_batches_dir:
                filepath = os.path.join(save_batches_dir, f"spacing_batch_{batch_number:04d}.parquet")
                batch_df.to_parquet(filepath, index=False)
                # The batch is on disk; do not ship it back to the parent process,
                # where it would only be held in memory and then discarded.
                return None
            return batch_df

        tqdm_kwargs = {
            "desc": "🚀 Calculating Spacing (Parallel)",
            "dynamic_ncols": True, # Auto-adjust width to terminal
            "smoothing": 0.3, # Smoothing factor for progress bar
            "bar_format": "{desc}: |{bar:40}| {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}]", # Custom bar format
            "ascii": "░▒█", # Use custom ASCII characters for the bar
            "leave": True, # Leave the progress bar on completion
        }

        results = Parallel(n_jobs=-1)(
            delayed(process_and_save)(batch_num, batch_i, batch_k)
            for batch_num, (batch_i, batch_k) in tqdm(enumerate(batches), total=n_batches, **tqdm_kwargs)
        )

        if save_batches_dir:
            print(f"✅ All batches saved to {save_batches_dir}")
            return None

        if not results:
            # No pair survived the filter: return an empty table with the usual
            # columns instead of letting pd.concat fail on an empty list.
            return pd.DataFrame(columns=[
                "well_i", "well_k", "horizontal_dist", "vertical_dist", "3D_dist",
                "drill_direction_i", "drill_direction_k", "direction_from_i_to_k",
            ])

        return pd.concat(results, ignore_index=True)

    def load_saved_batches(self, batch_folder: str) -> pd.DataFrame:
        """
        Load all saved spacing batch Parquet files from a folder and combine into a single DataFrame.

        Parameters
        ----------
        batch_folder : str
            Path to the folder where batch Parquet files are stored.

        Returns
        -------
        pd.DataFrame
            Combined spacing DataFrame.
        """
        if not os.path.isdir(batch_folder):
            raise FileNotFoundError(f"Batch folder '{batch_folder}' not found.")

        batch_files = sorted([
            os.path.join(batch_folder, f)
            for f in os.listdir(batch_folder)
            if f.endswith(".parquet")
        ])

        if not batch_files:
            raise ValueError(f"No Parquet files found in folder '{batch_folder}'.")

        print(f"🔍 Found {len(batch_files)} batch files. Loading and combining...")

        dfs = []
        for file in batch_files:
            dfs.append(pd.read_parquet(file))

        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"✅ Loaded {len(combined_df):,} rows from all batches.")
        return combined_df

    def summarize_spacing_statistics(self, spacing_df: pd.DataFrame) -> Dict[str, float]:
        avg_spacing = spacing_df["3D_dist"].mean()
        spacing_var = spacing_df["3D_dist"].var()
        avg_horizontal = spacing_df["horizontal_dist"].mean()
        acres_per_well = avg_horizontal ** 2 / 43_560
        density = 1 / acres_per_well if acres_per_well > 0 else float("nan")
        return {
            "average_spacing_3D_ft": avg_spacing,
            "spacing_variance": spacing_var,
            "estimated_well_density_per_acre": density
        }

    def plot_spacing_cdf(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> None:
        """
        Draw the empirical cumulative distribution of one spacing column.

        Missing values are dropped; the sorted values are plotted against evenly
        spaced probabilities running from 0 to 1.
        """
        ordered = np.sort(spacing_df[column].dropna())
        probabilities = np.linspace(0, 1, len(ordered))

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(ordered, probabilities, label=f"CDF of {column}")
        ax.set_xlabel(f"{column} (ft)")
        ax.set_ylabel("Cumulative Probability")
        ax.set_title(f"CDF of Well {column.replace('_', ' ').title()}")
        ax.grid(True)
        ax.legend()
        fig.tight_layout()
        plt.show()

    def plot_spacing_histogram(self, spacing_df: pd.DataFrame, column: str = "3D_dist", bins: int = 30) -> None:
        """
        Draw a histogram of one spacing column (missing values dropped) using
        `bins` bins.
        """
        observations = spacing_df[column].dropna()

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.hist(observations, bins=bins, edgecolor='k', alpha=0.7)
        ax.set_xlabel(f"{column} (ft)")
        ax.set_ylabel("Frequency")
        ax.set_title(f"Histogram of {column.replace('_', ' ').title()}")
        ax.grid(True)
        fig.tight_layout()
        plt.show()

    def print_spacing_percentiles(self, spacing_df: pd.DataFrame, column: str = "3D_dist") -> Dict[str, float]:
        percentiles = spacing_df[column].quantile([0.1, 0.5, 0.9]).to_dict()
        return {
            "P10": percentiles.get(0.1, float("nan")),
            "P50": percentiles.get(0.5, float("nan")),
            "P90": percentiles.get(0.9, float("nan"))
        }

    def filter_spacing_by_metadata(self, spacing_df: pd.DataFrame, metadata_df: pd.DataFrame, by: str = "operator") -> pd.DataFrame:
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        spacing_df = spacing_df.copy()
        spacing_df["group_i"] = spacing_df["well_i"].map(group_map)
        spacing_df["group_k"] = spacing_df["well_k"].map(group_map)
        return spacing_df[spacing_df["group_i"] == spacing_df["group_k"]].drop(columns=["group_i", "group_k"])

    def group_spacing_summary(
        self,
        spacing_df: pd.DataFrame,
        metadata_df: pd.DataFrame,
        by: str = "operator",
        return_group_dataframes: bool = False
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]]:
        """
        Computes group-level well spacing statistics by applying summarize_spacing_statistics
        to each group. Ensures consistent metric calculation and optionally returns group-level
        spacing DataFrames for downstream analysis or plotting.

        Parameters
        ----------
        spacing_df : pd.DataFrame
            DataFrame of well pair distances with columns:
            ['well_i', 'well_k', 'horizontal_dist', 'vertical_dist', '3D_dist']

        metadata_df : pd.DataFrame
            Metadata table mapping 'uwi' to grouping information like operator or basin.
            Must contain columns ['uwi', <by>].

        by : str, optional
            Column in metadata_df used for grouping, by default "operator".

        return_group_dataframes : bool, optional
            If True, also returns a dictionary of spacing DataFrames per group.

        Returns
        -------
        pd.DataFrame
            A DataFrame with spacing summaries per group, containing:
            ['average_spacing_3D_ft', 'spacing_variance', 'estimated_well_density_per_acre', 'count']

        Tuple[pd.DataFrame, Dict[str, pd.DataFrame]], optional
            If return_group_dataframes is True, also returns a dictionary mapping group names
            to filtered spacing DataFrames.
        """
        group_map = metadata_df.set_index("uwi")[by].to_dict()
        spacing_df = spacing_df.copy()
        spacing_df["group_i"] = spacing_df["well_i"].map(group_map)
        spacing_df["group_k"] = spacing_df["well_k"].map(group_map)

        # Filter for intra-group spacing pairs
        spacing_df = spacing_df[spacing_df["group_i"] == spacing_df["group_k"]]
        spacing_df["group"] = spacing_df["group_i"]
        spacing_df = spacing_df.drop(columns=["group_i", "group_k"])

        # Group by and apply summarize_spacing_statistics
        group_summaries = []
        group_dataframes = {}

        for group_name, group_df in spacing_df.groupby("group"):
            summary = self.summarize_spacing_statistics(group_df)
            summary[by] = group_name
            summary["count"] = len(group_df)
            group_summaries.append(summary)

            if return_group_dataframes:
                group_dataframes[group_name] = group_df.reset_index(drop=True)

        summary_df = pd.DataFrame(group_summaries)[[by, "average_spacing_3D_ft", "spacing_variance", "estimated_well_density_per_acre", "count"]]

        if return_group_dataframes:
            return summary_df, group_dataframes

        return summary_df


## 3. Loading Header and GeoSurvey Data from Excel/CSV/SQL into Pandas DataFrames

In [3]:
# Create the well-data loader on top of a Databricks ODBC connection.
# NOTE(review): log_dir is an absolute, user-specific Windows path, so this cell will
# not run unchanged on another machine — consider a project-relative, configurable path.
loader = WellDataLoader(db = DatabricksOdbcConnector(), 
                        log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs")

In [4]:
# Fetch well header records for basin "MB" with start_year=2016
# (the logged output shows the loader reads them from SQL).
df_MB_header = loader.get_header_data(basin="MB", start_year=2016)

[WellDataLoaderLogger] INFO (04-28 11:12 PM): Loading header data from SQL. (Line: 84) [well_data_manager.py]

  result_df = pd.read_sql(sql_query, self.connection)


In [5]:
# Quick size check of the header table: (rows, columns)
df_MB_header.shape

(20973, 11)

In [6]:
# Fetch the directional survey data (the logged output shows it is read from SQL).
# NOTE(review): no arguments are passed; presumably the loader reuses the wells selected
# by the header query above — confirm against WellDataLoader.
df_MB_directional = loader.get_directional_data()

[WellDataLoaderLogger] INFO (04-28 11:12 PM): Loading directional data from SQL. (Line: 104) [well_data_manager.py]



In [7]:
# Create the survey processor used for the coordinate and lateral-filter steps below.
# NOTE(review): same absolute, user-specific log_dir as the loader cell.
processor = GeoSurveyProcessor(log_dir=r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs")

[GeoLogger] INFO (04-28 11:13 PM): GeoSurveyProcessor initialized. (Line: 185) [well_data_manager.py]



In [8]:
# Compute UTM coordinates for the directional survey rows
# (the logged output shows lat/lon are taken from the input DataFrame).
df_utm = processor.compute_utm_coordinates(df=df_MB_directional)

[GeoLogger] INFO (04-28 11:13 PM): ✅ Using lat/lon from input DataFrame. (Line: 261) [well_data_manager.py]

[GeoLogger] INFO (04-28 11:13 PM): ✅ UTM coordinate computation complete in 3.23 sec. (Line: 316) [well_data_manager.py]



In [9]:
# Keep the survey rows after the heel point.
# NOTE(review): presumably this leaves only the lateral section of each well — confirm
# against GeoSurveyProcessor.filter_after_heel_point.
df_utm_lateral = processor.filter_after_heel_point(df=df_utm)

In [16]:
# Build the calculator from the filtered trajectories; a DataFrame input must contain
# a 'uwi' column (the constructor raises ValueError otherwise).
spacing_calculator = WellSpacingCalculator(trajectories=df_utm_lateral)

In [None]:
# Compute pairwise spacing in batches of 500,000 pairs for wells within roughly 10 miles
# of each other. Because save_batches_dir is set, each batch is written as a Parquet file
# and the call returns None; the results are read back from disk in a later cell.
# NOTE(review): the output folder is not cleared first, and the loading step reads every
# .parquet file in it, so files left over from an earlier run would be mixed in.
spacing_calculator.calculate_spacing_statistics(
    batch_size=500_000,
    max_distance_miles=10,  # Adjust as needed for your dataset
    save_batches_dir = "spacing_batches_MB"  # Specify directory to save batches
)

  df_prev = df.groupby("uwi").apply(
  df_next = df.groupby("uwi").apply(
🚀 Calculating Spacing (Parallel): |████████████████████████████████████████| 100% 56/56 [00:10<00:00]


✅ All batches saved to spacing_batches_MB


In [17]:
# Read every Parquet batch from the folder back into one DataFrame
df_spacing = spacing_calculator.load_saved_batches(batch_folder="spacing_batches_MB")

🔍 Found 56 batch files. Loading and combining...
✅ Loaded 27,557,842 rows from all batches.


In [19]:
# Preview the first rows of the combined spacing table
df_spacing.head()

Unnamed: 0,well_i,well_k,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
0,42003463520000,42003469560000,873.86827,516.898923,1015.298109,NS,NS,N
1,42003463520000,42003469570000,1757.065726,514.675699,1830.893509,NS,NS,N
2,42003463520000,42003472010000,3185.718838,491.693464,3223.440239,NS,NS,N
3,42003463520000,42003472020000,3730.831979,474.098756,3760.834599,NS,NS,N
4,42003463520000,42003472040000,3611.658703,554.267689,3653.94188,NS,NS,N


In [20]:
# Add 10-character prefixes of both well identifiers (relies on well_i / well_k being
# strings, since the .str accessor is used). These statements modify df_spacing in place.
df_spacing['well_i_10'] = df_spacing['well_i'].str[:10] # Extracting first 10 characters of UWI
df_spacing['well_k_10'] = df_spacing['well_k'].str[:10] # Extracting first 10 characters of UWI

In [21]:
# Move the two prefix columns so they sit right after 'well_k' (see the preview that follows).
# NOTE(review): this rebinds df_spacing, so re-running the cell is only safe if
# reorder_columns is idempotent — confirm in src.utils.
df_spacing = reorder_columns(df_spacing, ['well_i_10','well_k_10'], 'well_k')

In [22]:
df_spacing.head(10) # Displaying the first 10 rows of the DataFrame

Unnamed: 0,well_i,well_k,well_i_10,well_k_10,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
0,42003463520000,42003469560000,4200346352,4200346956,873.86827,516.898923,1015.298109,NS,NS,N
1,42003463520000,42003469570000,4200346352,4200346957,1757.065726,514.675699,1830.893509,NS,NS,N
2,42003463520000,42003472010000,4200346352,4200347201,3185.718838,491.693464,3223.440239,NS,NS,N
3,42003463520000,42003472020000,4200346352,4200347202,3730.831979,474.098756,3760.834599,NS,NS,N
4,42003463520000,42003472040000,4200346352,4200347204,3611.658703,554.267689,3653.94188,NS,NS,N
5,42003463520000,42003472050000,4200346352,4200347205,2864.797617,560.686359,2919.149633,NS,NS,N
6,42003463520000,42003472080000,4200346352,4200347208,10635.171205,470.840105,10645.588615,NS,NS,E
7,42003463520000,42003472090000,4200346352,4200347209,9855.440111,468.988108,9866.592604,NS,NS,E
8,42003463520000,42003472100000,4200346352,4200347210,9096.110829,462.874251,9107.880367,NS,NS,E
9,42003463520000,42003472930100,4200346352,4200347293,1829.946825,494.491531,1895.580981,NS,NS,W


In [27]:
# All pairs whose well_i has the 10-character prefix 4200347569, nearest first by horizontal distance
df_spacing[df_spacing['well_i_10']=='4200347569'].sort_values(by='horizontal_dist')

Unnamed: 0,well_i,well_k,well_i_10,well_k_10,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
99089,42003475690000,42003480120000,4200347569,4200348012,15.914009,593.452017,593.665354,NS,NS,S
99279,42003475690000,42003483980000,4200347569,4200348398,45.767421,709.216370,710.691576,NS,NS,S
99090,42003475690000,42003480140000,4200347569,4200348014,48.672794,1121.587187,1122.642802,NS,NS,S
99025,42003475690000,42003475680000,4200347569,4200347568,141.190548,649.014891,664.195076,NS,NS,W
99276,42003475690000,42003483930000,4200347569,4200348393,200.295670,61.186998,209.433054,NS,NS,S
...,...,...,...,...,...,...,...,...,...,...
99808,42003475690000,42317424020000,4200347569,4231742402,52655.430653,89.677975,52655.507019,NS,NS,E
99940,42003475690000,42317436360100,4200347569,4231743636,52690.267442,282.922096,52691.027016,NS,NS,E
99711,42003475690000,42317416820000,4200347569,4231741682,52739.138637,312.424224,52740.064023,NS,NS,E
99878,42003475690000,42317430390000,4200347569,4231743039,52846.229058,323.600831,52847.219824,NS,NS,E


In [24]:
# Same well (full 14-character identifier), restricted to neighbours classified as lying
# east or west of it, nearest first by horizontal distance
df_spacing[(df_spacing['well_i'] == '42003475690000') & (df_spacing['direction_from_i_to_k'].isin(['E','W']))].sort_values(by='horizontal_dist')

Unnamed: 0,well_i,well_k,well_i_10,well_k_10,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
99025,42003475690000,42003475680000,4200347569,4200347568,141.190548,649.014891,664.195076,NS,NS,W
99105,42003475690000,42003481150000,4200347569,4200348115,491.580345,218.450853,537.933091,NS,NS,E
99071,42003475690000,42003478800000,4200347569,4200347880,919.047036,671.859821,1138.438876,NS,NS,W
99106,42003475690000,42003481160000,4200347569,4200348116,977.415255,8.525432,977.452435,NS,NS,E
99238,42003475690000,42003483420000,4200347569,4200348342,1281.950738,40.836975,1282.601011,NS,NS,W
...,...,...,...,...,...,...,...,...,...,...
99808,42003475690000,42317424020000,4200347569,4231742402,52655.430653,89.677975,52655.507019,NS,NS,E
99940,42003475690000,42317436360100,4200347569,4231743636,52690.267442,282.922096,52691.027016,NS,NS,E
99711,42003475690000,42317416820000,4200347569,4231741682,52739.138637,312.424224,52740.064023,NS,NS,E
99878,42003475690000,42317430390000,4200347569,4231743039,52846.229058,323.600831,52847.219824,NS,NS,E


In [25]:
# Spot check of one specific pair, matched by identifier prefix on both sides
df_spacing[(df_spacing['well_i'].str.startswith('4200347569')) & (df_spacing['well_k'].str.startswith('4200347568'))]

Unnamed: 0,well_i,well_k,well_i_10,well_k_10,horizontal_dist,vertical_dist,3D_dist,drill_direction_i,drill_direction_k,direction_from_i_to_k
99025,42003475690000,42003475680000,4200347569,4200347568,141.190548,649.014891,664.195076,NS,NS,W
