In [2]:
import pandas as pd # Importing pandas package

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from typing import Dict, Tuple, List, Union, Optional # Importing specific types from typing module

import re # Importing regular expression package

from src.database_manager import DatabricksOdbcConnector # Importing DatabricksOdbcConnector class from database_manager module
from src.utils import reorder_columns # Importing reorder_columns function from utils module

from scipy.spatial.distance import cdist # Importing cdist function from scipy package

import time # Importing Time Module

import pyproj # Importing pyproj package

from src.custom_logger import CustomLogger # Importing CustomLogger class from custom_logger module

import os # Importing os module

In [None]:
class GeoSurveyProcessor:
    """
    A class for processing directional survey data and performing geospatial transformations
    such as converting lat/lon to UTM, filtering heel points, and extracting key well locations.
    """
    def __init__(
            self,
            directional_df: Optional[pd.DataFrame] = None,
            log_dir: str = "./logs",
            ):
        """
        Initializes the GeoSurveyProcessor with an optional directional survey DataFrame and log directory.

        :param directional_df: Optional DataFrame containing directional survey data. When the
            argument is not a DataFrame (including the default ``None``), an empty DataFrame
            is stored instead and a warning is logged.
        :param log_dir: Directory handed to CustomLogger for logging.
        """
        self.logger = CustomLogger("geo_logger", "GeoLogger", log_dir).get_logger()  # Custom logger

        if isinstance(directional_df, pd.DataFrame):
            # Keep the caller's frame; previously this branch only logged and left
            # self.directional_df undefined, so later attribute access would fail.
            self.directional_df = directional_df
            self.logger.info("Initialized with provided DataFrame.")
        else:
            self.directional_df = pd.DataFrame()
            self.logger.warning("Directional Survey DataFrame is empty or not provided.")
        self.logger.info("GeoSurveyProcessor initialized.")

    def determine_utm_zone(self, longitude: float) -> int:
        """
        Determines the UTM zone based on a given longitude.
        """
        return int((longitude + 180) / 6) + 1
        
    def batch_latlon_to_utm(self, lat: np.ndarray, lon: np.ndarray, utm_zone: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Converts arrays of latitudes and longitudes to UTM coordinates for a given UTM zone.

        The target CRS is always the 326xx (WGS 84 / UTM northern-hemisphere) code for the zone.

        :param lat: Latitudes in decimal degrees (EPSG:4326).
        :param lon: Longitudes in decimal degrees (EPSG:4326).
        :param utm_zone: UTM zone number; integer-valued floats and NumPy integers are accepted.
        :return: Tuple of (easting, northing) as returned by the pyproj transformer.
        """
        # The EPSG code needs a two-digit zone (32601..32660). Plain string interpolation
        # produced "EPSG:3265" for zone 5, so build the code arithmetically instead.
        epsg_code = 32600 + int(utm_zone)
        proj_utm = pyproj.Transformer.from_crs(
            "EPSG:4326", f"EPSG:{epsg_code}", always_xy=True
        )

        # always_xy=True means the transformer expects (lon, lat) order.
        return proj_utm.transform(lon, lat)
    
    def compute_utm_coordinates(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Computes UTM (x, y, z) coordinates for multiple wells, using surface location to determine UTM zones.
        Converts UTM coordinates from meters to feet. Uses vectorized batch processing for performance.

        Parameters:
        - df (pd.DataFrame): Original directional survey DataFrame.

        Returns:
        - pd.DataFrame: DataFrame with all original columns + x, y, z (in feet), and utm_zone.
        """
        start_time = time.time()  # Start timing

        # Step 1: Sort dataframe by md to identify surface location
        df = df.sort_values(by=["chosen_id", "md"], ascending=[True, True])
        
        # Step 2: Determine UTM zones using the surface location (first row per well)
        surface_locs = df.groupby("chosen_id").first()[["latitude", "longitude"]]
        surface_locs["utm_zone"] = surface_locs["longitude"].apply(self.determine_utm_zone)

        # Merge UTM zones back into the original dataframe
        df = df.merge(surface_locs[["utm_zone"]], on="chosen_id", how="left")

        self.logger.info(f"✅ Determined UTM zones in {time.time() - start_time:.4f} seconds.")

        # Step 3: Batch transformation for each unique UTM zone
        start_transform_time = time.time()
        unique_zones = df["utm_zone"].unique()
        utm_converters: Dict[int, Tuple[np.ndarray, np.ndarray]] = {}

        for zone in unique_zones:
            subset = df[df["utm_zone"] == zone]
            easting, northing = self.batch_latlon_to_utm(subset["latitude"].values, subset["longitude"].values, zone)
            utm_converters[zone] = (easting, northing)

        self.logger.info(f"✅ Performed batch EPSG transformations in {time.time() - start_transform_time:.4f} seconds.")

        # Step 4: Assign the converted coordinates back to the DataFrame
        start_assign_time = time.time()
        df["x"], df["y"] = np.zeros(len(df)), np.zeros(len(df))

        for zone in unique_zones:
            mask = df["utm_zone"] == zone
            df.loc[mask, "x"], df.loc[mask, "y"] = utm_converters[zone]

        self.logger.info(f"✅ Assigned transformed coordinates in {time.time() - start_assign_time:.4f} seconds.")

        # Step 5: Convert UTM coordinates from meters to feet (Conversion factor: 1 meter = 3.28084 feet)
        df["x"] *= 3.28084
        df["y"] *= 3.28084
        
        df["z"] = -df["tvd"] # Elevation is negative TVD

        self.logger.info(f"✅ Total execution time: {time.time() - start_time:.4f} seconds.")

        return df
    
    def filter_after_heel_point(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filters the dataframe to include all rows for each chosen_id where the first occurrence 
        of either '80' or 'heel' appears in the point_type column and all subsequent rows.

        Parameters:
        df (pd.DataFrame): A dataframe containing directional survey data with a 'chosen_id' column and 'point_type' column.

        Returns:
        pd.DataFrame: Filtered dataframe containing rows from the first occurrence of '80' or 'heel' onward.
        """

        # Convert 'point_type' to lowercase and check for '80' or 'heel'
        mask = df['point_type'].str.lower().str.contains(r'80|heel', regex=True, na=False)

        # Identify the first occurrence for each chosen_id
        idx_start = df[mask].groupby('chosen_id', sort=False).head(1).index

        # Create a mapping of chosen_id to the starting index
        start_idx_map = dict(zip(df.loc[idx_start, 'chosen_id'], idx_start))

        # Create a boolean mask using NumPy to filter rows
        chosen_ids = df['chosen_id'].values
        indices = np.arange(len(df))

        # Get the minimum start index for each row's chosen_id
        start_indices = np.vectorize(start_idx_map.get, otypes=[float])(chosen_ids)

        # Mask rows where index is greater than or equal to the start index
        valid_rows = indices >= start_indices

        return df[valid_rows].reset_index(drop=True)
    
    def get_heel_toe_midpoints_latlon(self, well_trajectory: pd.DataFrame) -> pd.DataFrame:
        """
        Extract the heel, toe, and mid-point latitude/longitude for each chosen_id in the well trajectory DataFrame
        that has been filtered to have lateral section of the well.

        Parameters:
        well_trajectory: pd.DataFrame
            DataFrame containing well trajectory data, including 'chosen_id', 'md', 'latitude', and 'longitude'.

        Returns:
        pd.DataFrame
            A DataFrame with 'chosen_id', 'Heel_Lat', 'Heel_Lon', 'Toe_Lat', 'Toe_Lon', 'Mid_Lat', 'Mid_Lon'.

        Example:
        >>> data = {
        ...     "chosen_id": [1001, 1001, 1001, 1002, 1002],
        ...     "md": [5000, 5100, 5200, 6000, 6100],
        ...     "latitude": [31.388, 31.389, 31.387, 31.400, 31.401],
        ...     "longitude": [-103.314, -103.315, -103.316, -103.318, -103.319]
        ... }
        >>> df = pd.DataFrame(data)
        >>> extract_heel_toe_mid_lat_lon(df)
        chosen_id  Heel_Lat  Heel_Lon  Toe_Lat  Toe_Lon  Mid_Lat  Mid_Lon
        0     1001    31.388  -103.314   31.387  -103.316  31.3875 -103.315
        1     1002    31.400  -103.318   31.401  -103.319  31.4005 -103.3185
        """
        # Ensure the data is sorted by MD in ascending order
        well_trajectory = well_trajectory.sort_values(by=["chosen_id", "md"], ascending=True)

        # Group by 'chosen_id' and extract heel/toe lat/lon
        heel_toe_df = (
            well_trajectory.groupby("chosen_id")
            .agg(
                heel_lat=("latitude", "first"),
                heel_lon=("longitude", "first"),
                toe_lat=("latitude", "last"),
                toe_lon=("longitude", "last"),
            )
            .reset_index()
        )

        # Calculate midpoints
        heel_toe_df["mid_Lat"] = (heel_toe_df["heel_lat"] + heel_toe_df["toe_lat"]) / 2
        heel_toe_df["mid_Lon"] = (heel_toe_df["heel_lon"] + heel_toe_df["toe_lon"]) / 2

        return heel_toe_df