In [1]:
import pandas as pd # Importing pandas package

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from typing import Dict, Tuple, List, Union, Optional # Importing specific types from typing module

import re # Importing regular expression package

from src.database_manager import DatabricksOdbcConnector # Importing DatabricksOdbcConnector class from database_manager module
from src.utils import reorder_columns # Importing reorder_columns function from utils module

from scipy.spatial.distance import cdist # Importing cdist function from scipy package

import time # Importing Time Module

import pyproj # Importing pyproj package

from src.custom_logger import CustomLogger # Importing CustomLogger class from custom_logger module

import os # Importing os module

In [None]:
class SpacingIKPairs:
    """
    Class for identifying spacing IK pairs in a given dataset.
    """

    def __init__(
        self,
        db: DatabricksOdbcConnector,
        header_source: Optional[Union[str, pd.DataFrame]] = None,
        log_dir: str = "./logs"
    ):
        """
        Initializes the SpacingIKPairs class with a database connection and header data.

        Args:
            db (DatabricksOdbcConnector): Database connection object.
            header_source (Optional[Union[str, pd.DataFrame]]): Source for header data.
                - If a DataFrame, uses it directly.
                - If a string naming an existing path, loads it as an Excel or CSV file.
                - Otherwise (None, or a path that does not exist), retrieves header
                  data from SQL.
            log_dir (str): Directory for log files.
        """
        self.db = db  # Database connection
        self.logger = CustomLogger("spacing_ik_pairs", "SpacingIKLogger", log_dir).get_logger()  # Custom logger

        if isinstance(header_source, pd.DataFrame):
            self.header_df = header_source
            self.logger.info("Initialized with provided DataFrame.")
            return

        if isinstance(header_source, str) and os.path.exists(header_source):
            self.header_df = self.load_header_from_file(header_source)
            self.logger.info(f"Header DataFrame loaded from file: {header_source}")
            return

        if header_source is not None:
            # The caller asked for a specific source; make the fallback visible
            # instead of silently substituting the SQL result.
            self.logger.warning(
                f"Header source {header_source!r} is not a DataFrame or an existing file path; "
                "falling back to Databricks SQL."
            )

        self.logger.info("No valid header source provided. Querying from Databricks SQL...")
        self.header_df = self.get_header_data_frm_DataBricks()

        if self.header_df is None:
            # Do not report success when the query produced nothing usable.
            self.logger.error("Header query returned no data; header_df is None.")
        else:
            self.logger.info("Header DataFrame loaded from SQL.")

    def load_header_from_file(self, file_path: str) -> pd.DataFrame:
        """
        Loads header data from an Excel or CSV file.

        Args:
            file_path (str): Path to the Excel or CSV file.
        
        Returns:
            pd.DataFrame: Loaded header data.
        """
        try:
            if file_path.endswith(".xlsx") or file_path.endswith(".xls"):
                df = pd.read_excel(file_path)
            elif file_path.endswith(".csv"):
                df = pd.read_csv(file_path)
            else:
                raise ValueError("Unsupported file format. Provide an Excel or CSV file.")
            
            # Check required columns
            required_columns = ["chosen_id", "lease_name", "well_name", "rsv_cat", "bench", "first_prod_date", "hole_direction"]
            missing_columns = [col for col in required_columns if col not in self.df.columns]

            if missing_columns:
                self.logger.error(f"Missing columns: {missing_columns}")
                raise ValueError(f"Header DataFrame is missing required columns: {missing_columns}")
            
            return df

        except Exception as e:
            self.logger.error(f"Error loading header file: {e}")
            raise

    def get_header_data_frm_DataBricks(self) -> pd.DataFrame:
        """
        Retrieves header data from the databricks.

        Returns:
            pd.DataFrame: DataFrame containing the header data from databricks.
        """
        try:
            self.db.connect()

            query = f"""
            SELECT
                api10 AS chosen_id, 
                leaseName as lease_name,
                wellName as well_name,
                customString2 as rsv_cat,
                customString0 as bench,
                date(firstProdDate) as first_prod_date,
                holeDirection as hole_direction
                
            FROM Combocurve.export.wells
            WHERE basin = 'MB' 
            and holeDirection = 'H' 
            and year(date(firstProdDate)) >= 2019;
            """

            return self.db.execute_query(query)

        except Exception as e:
            self.logger.error(f"Error retrieving header data from databricks: {e}")
        finally:
            self.db.close_connection()        

    def get_directional_survey_data(self) -> pd.DataFrame:
        """
        Retrieves directional data from the databricks.

        Returns:
            pd.DataFrame: DataFrame containing the directional data from databricks.
        """
        # Get the unique chosen_ids for horizontal wells only
        chosen_ids = ", ".join(f"'{id}'" for id in self.header_df[self.header_df['hole_direction']=='H']['chosen_id'].unique())

        try:
            self.db.connect()

            query = f"""
            SELECT
                LEFT(uwi, 10) AS chosen_id, 
                station_md_uscust AS md, 
                station_tvd_uscust AS tvd,
                inclination, 
                azimuth, 
                latitude, 
                longitude, 
                x_offset_uscust AS `deviation_E/W`,
                ew_direction,
                y_offset_uscust AS `deviation_N/S`,
                ns_direction,
                point_type
                
            FROM ihs_sp.well.well_directional_survey_station
            WHERE LEFT(uwi, 10) IN ({chosen_ids})
            order by uwi, md;
            """

            return self.db.execute_query(query)

        except Exception as e:
            self.logger.error(f"Error retrieving directional data from databricks: {e}")
        finally:
            self.db.close_connection()

    def determine_utm_zone(self, longitude: float) -> int:
        """
        Determines the UTM zone based on a given longitude.
        """
        return int((longitude + 180) / 6) + 1
    
    def batch_latlon_to_utm(self, lat: np.ndarray, lon: np.ndarray, utm_zone: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Converts arrays of latitudes and longitudes to UTM coordinates in meters for a given UTM zone.

        The target CRS is built as ``EPSG:326NN`` with the zone zero-padded to two
        digits (the EPSG 326xx series is WGS 84 / UTM north). Without the padding,
        zones 1-9 produced a code such as ``EPSG:3265`` instead of ``EPSG:32605``.

        Args:
            lat (np.ndarray): Latitudes in decimal degrees.
            lon (np.ndarray): Longitudes in decimal degrees.
            utm_zone (int): UTM zone number, 1-60. Integer-valued floats and NumPy
                integers are accepted and converted with ``int()``.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Easting and northing arrays as returned
            by the transformer (called with ``always_xy=True``, i.e. lon/lat order).

        Raises:
            ValueError: If the zone is outside 1-60.
        """
        zone = int(utm_zone)
        if not 1 <= zone <= 60:
            raise ValueError(f"UTM zone must be between 1 and 60, got {utm_zone!r}")

        proj_utm = pyproj.Transformer.from_crs(
            "EPSG:4326", f"EPSG:326{zone:02d}", always_xy=True
        )

        return proj_utm.transform(lon, lat)
    
    def compute_mean_elevation(self,df: pd.DataFrame) -> pd.DataFrame:
        """
        Computes the mean elevation (mean z value) for each ChosenID.

        Parameters:
        - df (pd.DataFrame): DataFrame containing 'ChosenID' and 'z' columns.

        Returns:
        - pd.DataFrame: DataFrame with 'ChosenID' and corresponding mean 'z' values.
        """
        mean_z_df = df.groupby("ChosenID", as_index=False)["z"].mean()
        mean_z_df.rename(columns={"z": "elevation"}, inplace=True)
        return mean_z_df

    def compute_utm_coordinates(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Computes UTM (x, y, z) coordinates for multiple wells, using surface location to determine UTM zones.
        Converts UTM coordinates from meters to feet. Uses vectorized batch processing for performance.

        Parameters:
        - df (pd.DataFrame): Original directional survey DataFrame.

        Returns:
        - pd.DataFrame: DataFrame with all original columns + x, y, z (in feet), and utm_zone.
        """
        start_time = time.time()  # Start timing

        # Step 1: Sort dataframe by md to identify surface location
        df = df.sort_values(by=["chosen_id", "md"], ascending=[True, True])
        
        # Step 2: Determine UTM zones using the surface location (first row per well)
        surface_locs = df.groupby("chosen_id").first()[["latitude", "longitude"]]
        surface_locs["utm_zone"] = surface_locs["longitude"].apply(self.determine_utm_zone)

        # Merge UTM zones back into the original dataframe
        df = df.merge(surface_locs[["utm_zone"]], on="chosen_id", how="left")

        self.logger.info(f"✅ Determined UTM zones in {time.time() - start_time:.4f} seconds.")

        # Step 3: Batch transformation for each unique UTM zone
        start_transform_time = time.time()
        unique_zones = df["utm_zone"].unique()
        utm_converters: Dict[int, Tuple[np.ndarray, np.ndarray]] = {}

        for zone in unique_zones:
            subset = df[df["utm_zone"] == zone]
            easting, northing = self.batch_latlon_to_utm(subset["latitude"].values, subset["longitude"].values, zone)
            utm_converters[zone] = (easting, northing)

        self.logger.info(f"✅ Performed batch EPSG transformations in {time.time() - start_transform_time:.4f} seconds.")

        # Step 4: Assign the converted coordinates back to the DataFrame
        start_assign_time = time.time()
        df["x"], df["y"] = np.zeros(len(df)), np.zeros(len(df))

        for zone in unique_zones:
            mask = df["utm_zone"] == zone
            df.loc[mask, "x"], df.loc[mask, "y"] = utm_converters[zone]

        self.logger.info(f"✅ Assigned transformed coordinates in {time.time() - start_assign_time:.4f} seconds.")

        # Step 5: Convert UTM coordinates from meters to feet (Conversion factor: 1 meter = 3.28084 feet)
        df["x"] *= 3.28084
        df["y"] *= 3.28084
        
        df["z"] = -df["tvd"] # Elevation is negative TVD

        self.logger.info(f"✅ Total execution time: {time.time() - start_time:.4f} seconds.")

        return df
    
    def filter_after_heel_point(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filters the dataframe to include all rows for each chosen_id where the first occurrence 
        of either '80' or 'heel' appears in the point_type column and all subsequent rows.

        Parameters:
        df (pd.DataFrame): A dataframe containing directional survey data with a 'chosen_id' column and 'point_type' column.

        Returns:
        pd.DataFrame: Filtered dataframe containing rows from the first occurrence of '80' or 'heel' onward.
        """

        # Convert 'point_type' to lowercase and check for '80' or 'heel'
        mask = df['point_type'].str.lower().str.contains(r'80|heel', regex=True, na=False)

        # Identify the first occurrence for each chosen_id
        idx_start = df[mask].groupby('chosen_id', sort=False).head(1).index

        # Create a mapping of chosen_id to the starting index
        start_idx_map = dict(zip(df.loc[idx_start, 'chosen_id'], idx_start))

        # Create a boolean mask using NumPy to filter rows
        chosen_ids = df['chosen_id'].values
        indices = np.arange(len(df))

        # Get the minimum start index for each row's chosen_id
        start_indices = np.vectorize(start_idx_map.get, otypes=[float])(chosen_ids)

        # Mask rows where index is greater than or equal to the start index
        valid_rows = indices >= start_indices

        return df[valid_rows].reset_index(drop=True)
    
    def extract_heel_toe_mid_lat_lon(self, well_trajectory: pd.DataFrame) -> pd.DataFrame:
        """
        Extract the heel, toe, and mid-point latitude/longitude for each chosen_id in the well trajectory DataFrame.

        Parameters:
        well_trajectory: pd.DataFrame
            DataFrame containing well trajectory data, including 'chosen_id', 'md', 'latitude', and 'longitude'.

        Returns:
        pd.DataFrame
            A DataFrame with 'chosen_id', 'Heel_Lat', 'Heel_Lon', 'Toe_Lat', 'Toe_Lon', 'Mid_Lat', 'Mid_Lon'.

        Example:
        >>> data = {
        ...     "chosen_id": [1001, 1001, 1001, 1002, 1002],
        ...     "md": [5000, 5100, 5200, 6000, 6100],
        ...     "latitude": [31.388, 31.389, 31.387, 31.400, 31.401],
        ...     "longitude": [-103.314, -103.315, -103.316, -103.318, -103.319]
        ... }
        >>> df = pd.DataFrame(data)
        >>> extract_heel_toe_mid_lat_lon(df)
        chosen_id  Heel_Lat  Heel_Lon  Toe_Lat  Toe_Lon  Mid_Lat  Mid_Lon
        0     1001    31.388  -103.314   31.387  -103.316  31.3875 -103.315
        1     1002    31.400  -103.318   31.401  -103.319  31.4005 -103.3185
        """
        # Ensure the data is sorted by MD in ascending order
        well_trajectory = well_trajectory.sort_values(by=["chosen_id", "md"], ascending=True)

        # Group by 'chosen_id' and extract heel/toe lat/lon
        heel_toe_df = (
            well_trajectory.groupby("chosen_id")
            .agg(
                heel_lat=("latitude", "first"),
                heel_lon=("longitude", "first"),
                toe_lat=("latitude", "last"),
                toe_lon=("longitude", "last"),
            )
            .reset_index()
        )

        # Calculate midpoints
        heel_toe_df["mid_Lat"] = (heel_toe_df["heel_lat"] + heel_toe_df["toe_lat"]) / 2
        heel_toe_df["mid_Lon"] = (heel_toe_df["heel_lon"] + heel_toe_df["toe_lon"]) / 2

        return heel_toe_df
    
    def get_direction(self, lat1: np.ndarray, lon1: np.ndarray, lat2: np.ndarray, lon2: np.ndarray) -> np.ndarray:
        """
        Determine the relative direction of (lat2, lon2) with respect to (lat1, lon1).
        
        Parameters:
        lat1, lon1: np.ndarray
            Latitude and longitude of the first well.
        lat2, lon2: np.ndarray
            Latitude and longitude of the second well.
        
        Returns:
        np.ndarray
            Array indicating the direction (e.g., North, South, East, West) of well B relative to well A.
        """
        lat_diff = lat2 - lat1
        lon_diff = lon2 - lon1

        conditions = [
            np.abs(lat_diff) > np.abs(lon_diff),
            lat_diff > 0,
            lon_diff > 0
        ]

        choices = ["N", "S", "E", "W"]
        
        return np.select(
            [conditions[0] & conditions[1], conditions[0] & ~conditions[1], ~conditions[0] & conditions[2], ~conditions[0] & ~conditions[2]],
            choices
        )
    
    def calculate_drill_direction_vectorized(self, well_trajectories: Dict[str, pd.DataFrame], i_indices: np.ndarray) -> np.ndarray:
        """
        Optimized vectorized function to determine the drilling direction of multiple wells using NumPy operations.
        
        Parameters:
        well_trajectories: Dict[str, pd.DataFrame]
            Dictionary containing well trajectory data indexed by chosen_id.
        i_indices: np.ndarray
            Array of chosen_id whose drill directions need to be calculated.
        
        Returns:
        np.ndarray
            Array containing "EW" (East-West) or "NS" (North-South) for each well.
        """
        start_time = time.time()

        # 🚀 Precompute medians for all wells at once
        all_data = pd.concat(well_trajectories.values(), keys=well_trajectories.keys()).reset_index(level=0)
        azimuth_medians = all_data.groupby("level_0")["azimuth"].median().to_dict()

        # 🚀 Fast lookup using NumPy
        azimuth_values = np.array([azimuth_medians.get(i, np.nan) for i in i_indices])

        # 🚀 Apply vectorized conditions
        conditions = (45 <= azimuth_values) & (azimuth_values < 135) | (225 <= azimuth_values) & (azimuth_values < 315)
        drill_directions = np.where(np.isnan(azimuth_values), "Unknown", np.where(conditions, "EW", "NS"))

        return drill_directions
    
    def calculate_3D_distance_matrix(self,
        trajectories: dict[str, pd.DataFrame], 
        i_indices: np.ndarray, 
        k_indices: np.ndarray,
        threshold_feet: float = 20_000.0  # User-defined threshold for switching to Haversine
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Compute horizontal, vertical, and 3D distances for well pairs in feet.
        
        Automatically switches between Euclidean and Haversine distance based on proximity.

        Parameters:
        -----------
        trajectories : Dict[str, pd.DataFrame]
            Dictionary of well trajectories indexed by well ID.
            Should include columns for both (x, y) UTM coordinates and (latitude, longitude).
        i_indices : np.ndarray
            Array of well IDs for the first well in each pair.
        k_indices : np.ndarray
            Array of well IDs for the second well in each pair.

        Returns:
        --------
        Tuple[np.ndarray, np.ndarray, np.ndarray]
            - Horizontal distances between the well pairs (in feet).
            - Vertical distances between the well pairs (in feet).
            - 3D distances between the well pairs (in feet).
        """

        # Conversion factors
        KM_TO_FEET = 3280.84  # 1 km = 3280.84 feet
        EARTH_RADIUS_FEET = 6371 * KM_TO_FEET  # Convert Earth's radius to feet

        # Combine all trajectories into a single DataFrame
        all_trajectories_df = pd.concat(trajectories.values(), keys=trajectories.keys()).reset_index(drop=True)

        # Compute midpoints for each well using UTM (x, y) and Lat/Long
        midpoints_df = all_trajectories_df.groupby("chosen_id")[["x", "y", "latitude", "longitude", "tvd"]].mean()

        # Convert to NumPy arrays for fast lookup
        well_ids = midpoints_df.index.to_numpy()
        midpoints = midpoints_df.to_numpy()

        # Create a mapping from well ID to its index
        well_id_to_idx = {well_id: idx for idx, well_id in enumerate(well_ids)}

        # Extract midpoints for well pairs using vectorized NumPy indexing
        mid_A = midpoints[np.array([well_id_to_idx[i] for i in i_indices])]
        mid_B = midpoints[np.array([well_id_to_idx[k] for k in k_indices])]

        # Compute vertical distances (TVD differences) in feet
        vertical_distances = np.abs(mid_A[:, 4] - mid_B[:, 4])  # TVD column index = 4

        # Align Well B's TVD to Well A for horizontal distance calculation
        mid_B[:, 4] = mid_A[:, 4]

        # **Compute Euclidean horizontal distances using (x, y) UTM coordinates**
        euclidean_horizontal_distances = np.linalg.norm(mid_A[:, :2] - mid_B[:, :2], axis=1)

        # Initialize final horizontal distances array
        horizontal_distances = np.zeros_like(euclidean_horizontal_distances)

        # Identify indices where Euclidean distance exceeds the threshold (use Haversine)
        use_haversine_indices = np.where(euclidean_horizontal_distances >= threshold_feet)[0]

        if use_haversine_indices.size > 0:
            # Convert (latitude, longitude) to radians
            lat1, lon1 = np.radians(mid_A[:, 2]), np.radians(mid_A[:, 3])
            lat2, lon2 = np.radians(mid_B[:, 2]), np.radians(mid_B[:, 3])

            dlat = lat2 - lat1
            dlon = lon2 - lon1

            a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
            c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

            haversine_distances = EARTH_RADIUS_FEET * c  # Convert to feet

            # Assign Haversine distances only where needed
            horizontal_distances[use_haversine_indices] = haversine_distances[use_haversine_indices]

        # Assign Euclidean distances for distances below the threshold
        horizontal_distances[euclidean_horizontal_distances < threshold_feet] = euclidean_horizontal_distances[euclidean_horizontal_distances < threshold_feet]

        # Compute full 3D distances using the Pythagorean theorem
        total_3D_distances = np.sqrt(horizontal_distances**2 + vertical_distances**2)

        return horizontal_distances, vertical_distances, total_3D_distances
    
    def create_i_k_pairs(self, trajectories: Union[Dict[str, pd.DataFrame], pd.DataFrame]) -> pd.DataFrame:
        """
        Generate the i_k_pairs DataFrame, computing horizontal and vertical distances, 
        3D distances, drilling directions, and relative directions between well pairs.
        
        Parameters:
        trajectories: Union[Dict[str, pd.DataFrame], pd.DataFrame]
            Either:
            - A dictionary mapping well IDs ("chosen_id") to trajectory DataFrames.
            - A single DataFrame containing all trajectory data (must have "chosen_id" column).
            
        Each trajectory DataFrame should include:
        - "md" (float): Measured depth.
        - "tvd" (float): True vertical depth.
        - "inclination" (float): Inclination angle in degrees.
        - "azimuth" (float): represents the drilling direction.
        - "latitude" (float): Latitude values, define the geographical position.
        - "longitude" (float): Longitude values, define the geographical position.
        - "x" (float): X-coordinate in a Cartesian coordinate system.
        - "y" (float): Y-coordinate in a Cartesian coordinate system.
        - "z" (float): Z-coordinate in a Cartesian coordinate system (elevation).
        
        Returns:
        pd.DataFrame
            DataFrame containing pairs of wells (`i_uwi`, `k_uwi`) with their computed distances 
            and directional relationships.
        """
        start_time = time.time()

        df = self.header_df
        
        # Convert to dictionary if input is a DataFrame
        step1_start = time.time()
        if isinstance(trajectories, pd.DataFrame):
            if "chosen_id" not in trajectories.columns:
                self.logger.error("🚨 Error: Trajectory DataFrame must contain a 'chosen_id' column.")
                raise ValueError("🚨 Error: Trajectory DataFrame must contain a 'chosen_id' column.")
            trajectories = {cid: group for cid, group in trajectories.groupby("chosen_id")}
        step1_end = time.time()
        self.logger.info(f"✅ Step 1: Converted trajectory DataFrame to dictionary in {step1_end - step1_start:.4f} seconds.")

        # Get unique chosen_id from df
        step2_start = time.time()
        chosen_ids = df["chosen_id"].unique()
        missing_ids = [cid for cid in chosen_ids if cid not in trajectories]

        if missing_ids:
            self.logger.warning(f"⚠️ The following chosen_id do not exist in the trajectory data and will be excluded: {missing_ids}")

        df = df[df["chosen_id"].isin(trajectories)] # Filter out missing IDs in the DataFrame
        chosen_ids = df["chosen_id"].unique() # Update chosen_ids without missing IDs
        step2_end = time.time()
        self.logger.info(f"✅ Step 2: Extracted unique chosen_id in {step2_end - step2_start:.4f} seconds.")

        # Generate all possible pairs (excluding self-comparison)
        step3_start = time.time()
        i_uwi, k_uwi = np.meshgrid(chosen_ids, chosen_ids, indexing='ij')
        i_uwi, k_uwi = i_uwi.ravel(), k_uwi.ravel()

        # Remove self-comparisons
        valid_mask = i_uwi != k_uwi
        i_uwi, k_uwi = i_uwi[valid_mask], k_uwi[valid_mask]
        step3_end = time.time()
        self.logger.info(f"✅ Step 3: Generated well pairs in {step3_end - step3_start:.4f} seconds.")

        # 🚀 Optimized Heel/Toe Extraction (Vectorized)
        step4_start = time.time()
        heel_toe_df = pd.concat(
            [self.extract_heel_toe_mid_lat_lon(trajectories[cid]) for cid in chosen_ids], ignore_index=True
        )
        heel_toe_dict = heel_toe_df.set_index("chosen_id").to_dict(orient="index")
        step4_end = time.time()
        self.logger.info(f"✅ Step 4: Heel/Toe extraction took {step4_end - step4_start:.4f} seconds.")

        # Efficiently extract values using vectorized lookups
        step5_start = time.time()
        mid_lat_i = np.array([heel_toe_dict[i]["mid_Lat"] for i in i_uwi])
        mid_lon_i = np.array([heel_toe_dict[i]["mid_Lon"] for i in i_uwi])
        mid_lat_k = np.array([heel_toe_dict[k]["mid_Lat"] for k in k_uwi])
        mid_lon_k = np.array([heel_toe_dict[k]["mid_Lon"] for k in k_uwi])
        step5_end = time.time()
        self.logger.info(f"✅ Step 5: Heel/Toe dictionary lookup took {step5_end - step5_start:.4f} seconds.")

        # 🚀 Optimized Distance Calculation (Fully Vectorized)
        step6_start = time.time()
        horizontal_dist, vertical_dist, total_3D_dist = self.calculate_3D_distance_matrix(trajectories, i_uwi, k_uwi)
        step6_end = time.time()
        self.logger.info(f"✅ Step 6: Distance calculations took {step6_end - step6_start:.4f} seconds.")

        # Compute drill directions
        step7_start = time.time()
        drill_directions = self.calculate_drill_direction_vectorized(trajectories, i_uwi)
        step7_end = time.time()
        self.logger.info(f"✅ Step 7: Drill direction calculation took {step7_end - step7_start:.4f} seconds.")

        # Determine directional relationship
        step8_start = time.time()
        ward_of_i = self.get_direction(mid_lat_i, mid_lon_i, mid_lat_k, mid_lon_k)
        step8_end = time.time()
        self.logger.info(f"✅ Step 8: Directional relationship calculation took {step8_end - step8_start:.4f} seconds.")

        # Compute mean elevation
        step9_start = time.time()
        trajectories = pd.concat(trajectories).reset_index(drop=True) # Convert trajectories dict to pandas dataframe
        elevation_df = trajectories.groupby("chosen_id", as_index=False)["z"].mean().rename(columns={"z": "elevation"})
        elevation_dict = elevation_df.set_index("chosen_id")["elevation"].to_dict()

        # Add elevation values to pairs
        elevation_i = np.array([elevation_dict.get(i, np.nan) for i in i_uwi])
        elevation_k = np.array([elevation_dict.get(k, np.nan) for k in k_uwi])
        step9_end = time.time()
        self.logger.info(f"✅ Step 9: Mean elevation calculation took {step9_end - step9_start:.4f} seconds.")

        # Create DataFrame
        step10_start = time.time()
        result_df = pd.DataFrame({
            "i_uwi": i_uwi,
            "k_uwi": k_uwi,
            "horizontal_dist": horizontal_dist,
            "vertical_dist": vertical_dist,
            "3D_ft_dist": total_3D_dist,
            "drill_direction": drill_directions,
            "ward_of_i": ward_of_i,
            "elevation_i": elevation_i,
            "elevation_k": elevation_k
        })
        step10_end = time.time()
        self.logger.info(f"✅ Step 10: Created result DataFrame in {step10_end - step10_start:.4f} seconds.")

        total_time = time.time() - start_time
        self.logger.info(f"🚀 Total Execution Time: {total_time:.4f} seconds.")

        return result_df

## 1. Testing

In [3]:
# df_header_MB = pd.read_csv(r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\MB Investigation\All_MB_Header.csv", dtype={'chosen_id': object})

In [4]:
# Directory handed to SpacingIKPairs as log_dir in the next cell.
# NOTE(review): this is a machine-specific absolute path; adjust before running elsewhere.
logger_path = r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs"

In [5]:
# No header_source is passed, so the constructor loads the header table with a Databricks query.
spacingCalc = SpacingIKPairs(db = DatabricksOdbcConnector(), 
                             log_dir=logger_path) # Creating an instance of the SpacingIKPairs class

[SpacingIKLogger] INFO (03-02 10:11 PM): No valid header source provided. Querying from Databricks SQL... (Line: 33) [2228399244.py]

  result_df = pd.read_sql(sql_query, self.connection)
[SpacingIKLogger] INFO (03-02 10:12 PM): Header DataFrame loaded from SQL. (Line: 35) [2228399244.py]



In [6]:
# Queries the directional survey stations for the header wells flagged hole_direction == 'H'.
df_directional = spacingCalc.get_directional_survey_data() # Getting directional survey data from Databricks

In [7]:
# Adds utm_zone, x and y (scaled from meters to feet) and z (= -tvd) to every survey station.
df_dir_with_utm = spacingCalc.compute_utm_coordinates(df_directional) # Computing UTM coordinates

[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Determined UTM zones in 3.8756 seconds. (Line: 195) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Performed batch EPSG transformations in 1.1839 seconds. (Line: 207) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Assigned transformed coordinates in 0.0569 seconds. (Line: 217) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Total execution time: 5.1792 seconds. (Line: 225) [2228399244.py]



In [8]:
# Keeps, per well, the rows from the first point_type containing '80' or 'heel' onward.
df_directional_filtered = spacingCalc.filter_after_heel_point(df_dir_with_utm) # Filtering after heel point

In [None]:
# heel_toe_mid_df = spacingCalc.extract_heel_toe_mid_lat_lon(df_directional_filtered) # Extracting heel, toe, and mid-point latitude/longitude

In [9]:
# Builds one row per ordered (i, k) well pair; the step timings are in the log output below.
ik_pair_all_MB = spacingCalc.create_i_k_pairs(trajectories = df_directional_filtered) # Creating i_k pairs

[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Step 1: Converted trajectory DataFrame to dictionary in 0.8885 seconds. (Line: 495) [2228399244.py]


[SpacingIKLogger] INFO (03-02 10:18 PM): ✅ Step 2: Extracted unique chosen_id in 0.0459 seconds. (Line: 508) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:19 PM): ✅ Step 3: Generated well pairs in 12.8055 seconds. (Line: 519) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:20 PM): ✅ Step 4: Heel/Toe extraction took 63.0719 seconds. (Line: 528) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:22 PM): ✅ Step 5: Heel/Toe dictionary lookup took 142.1550 seconds. (Line: 537) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:25 PM): ✅ Step 6: Distance calculations took 145.7421 seconds. (Line: 543) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:25 PM): ✅ Step 7: Drill direction calculation took 32.8261 seconds. (Line: 549) [2228399244.py]

[SpacingIKLogger] INFO (03-02 10:25 PM): ✅ Step 8: Directional relationship calculation took 14.0032 

In [10]:
# Bare expression: displays the pair table as the cell output.
ik_pair_all_MB

Unnamed: 0,i_uwi,k_uwi,horizontal_dist,vertical_dist,3D_ft_dist,drill_direction,ward_of_i,elevation_i,elevation_k
0,4200347907,4200347982,108904.123402,6204.466100,109080.720082,NS,N,-10924.242273,-4719.776173
1,4200347907,4200347999,110251.673683,6226.536341,110427.357592,NS,N,-10924.242273,-4697.705932
2,4200347907,4200347948,118304.086740,2141.438997,118323.466397,NS,E,-10924.242273,-8782.803276
3,4200347907,4200347984,108806.256538,6197.066321,108982.590779,NS,N,-10924.242273,-4727.175952
4,4200347907,4200347624,13420.380395,653.043849,13436.259756,NS,E,-10924.242273,-10271.198424
...,...,...,...,...,...,...,...,...,...
301213375,4246141704,4232944841,306773.853377,723.967495,306774.707636,NS,N,-8310.222746,-9034.190240
301213376,4246141704,4232944729,243061.361856,409.537325,243061.706873,NS,N,-8310.222746,-8719.760070
301213377,4246141704,4222740994,367844.392045,298.718523,367844.513336,NS,N,-8310.222746,-8011.504222
301213378,4246141704,4238340962,144082.773377,326.243058,144083.142728,NS,E,-8310.222746,-7983.979688
