# Analyzing Well Bundles

## 1. Importing / Installing Packages

In [17]:
import pandas as pd

# Show every column when a wide DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

import numpy as np # Importing numpy package

from itertools import product
from typing import Tuple

import re # Importing regular expression package

from typing import List

from src.database_manager import DatabricksOdbcConnector

from itertools import product
from scipy.spatial.distance import cdist

## 2. Loading the CSV into a pandas DataFrame

In [3]:
# Read the well-header export: ChosenID is kept as text and the two date columns are parsed
df_raw = pd.read_csv(
    'wellHeader_with_Cluster.csv',
    dtype={'ChosenID': str},
    parse_dates=['FirstProdDate', 'Comp_Dt'],
)

In [4]:
# The file's 'cluster' column is referred to as 'bundle' throughout the rest of the notebook
df_raw = df_raw.rename({'cluster': 'bundle'}, axis='columns')

In [5]:
# Order the wells by ascending 'bundle' and renumber the index 0..n-1
df_raw = df_raw.sort_values('bundle', ascending=True, ignore_index=True)

In [6]:
# (rows, columns) of the loaded well table
df_raw.shape

(2514, 20)

## 3. Data Preprocessing

### 3.1. Defining Functions


In [7]:
def reorder_columns(df: pd.DataFrame, columns_to_move: List[str], reference_column: str) -> pd.DataFrame:
    """
    Reorders the columns of a dataframe by moving specified columns next to a reference column.

    The moved columns end up immediately to the right of the reference column, in the
    order given in ``columns_to_move``. All other columns keep their relative order.

    Parameters:
    df (pd.DataFrame): The dataframe whose columns need to be reordered.
    columns_to_move (List[str]): The names of the columns to move. A name listed more than
        once is moved once; the reference column itself, if listed, stays where it is.
    reference_column (str): The name of the column next to which the specified columns should be placed.

    Returns:
    pd.DataFrame: The dataframe with reordered columns.

    Raises:
    ValueError: If any column in ``columns_to_move`` or ``reference_column`` is not in the dataframe.
    """
    columns_order: List[str] = df.columns.tolist()  # Get current column order as a list
    if not all(col in columns_order for col in columns_to_move) or reference_column not in columns_order:
        raise ValueError("Specified columns must exist in the dataframe")

    # De-duplicate while keeping the requested order; the reference column is the anchor and is never moved
    moving: List[str] = [col for col in dict.fromkeys(columns_to_move) if col != reference_column]

    # Everything that stays put, in its current order
    remaining: List[str] = [col for col in columns_order if col not in moving]

    # Locate the reference AFTER the moved columns are taken out: removing a column that sat
    # to the left of the reference shifts the reference one position to the left
    insert_at: int = remaining.index(reference_column) + 1

    # Reorder the dataframe columns
    return df[remaining[:insert_at] + moving + remaining[insert_at:]]

### 3.2. Creating Columns

In [8]:
# Creating the DSU column from the LeaseName column

def _lease_name_to_dsu(lease_name):
    """Derive a DSU label from a single lease name.

    Takes the text that precedes the first digit (or the whole name when it starts
    with a digit), replaces every character that is not a letter or whitespace with
    a space, collapses runs of whitespace into one space and trims the ends.

    A missing lease name yields NaN instead of the literal string 'nan', so wells
    without a lease name are not lumped together into one artificial DSU.
    """
    if pd.isna(lease_name):
        return np.nan

    text = str(lease_name)
    leading_text = re.match(r'([^\d]+)', text)  # Everything before the first digit
    base = leading_text.group(1) if leading_text else text

    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', base)  # Remove special characters, keep letters and spaces
    return re.sub(r'\s+', ' ', letters_only).strip()  # Collapse multiple spaces, strip leading/trailing spaces


df_raw['DSU'] = df_raw['LeaseName'].apply(_lease_name_to_dsu)

# Placing DSU next to LeaseName
df_raw = reorder_columns(df=df_raw, columns_to_move=['DSU'], reference_column='LeaseName')

## 4. Feature Engineering

### 4.1. Creating DataFrames of DSUs with more than one bundle and bundles with more than one DSU

In [9]:
# Keep the wells whose DSU is associated with two or more distinct bundles
bundles_per_dsu = df_raw.groupby("DSU")["bundle"].transform("nunique")
same_DSU_diffBundle_df = df_raw.loc[bundles_per_dsu.gt(1)]

In [10]:
# Keep the wells whose bundle is associated with two or more distinct DSUs
dsus_per_bundle = df_raw.groupby("bundle")["DSU"].transform("nunique")
same_Bundle_diffDSU_df = df_raw.loc[dsus_per_bundle.gt(1)]

### 4.2. Defining Functions: Haversine Distance, Length Overlap, Direction, and Well Pairs

In [11]:
def haversine_distance(
    lon1: np.ndarray,
    lat1: np.ndarray,
    lon2: np.ndarray,
    lat2: np.ndarray,
    unit: str = "km",
    earth_radius_km: float = 6367.0,
) -> np.ndarray:
    """Calculate the great-circle distance between two points on the Earth using the Haversine formula.

    Parameters:
    lon1, lat1 (np.ndarray): Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 (np.ndarray): Longitude and latitude of the second point(s), in decimal degrees.
    unit (str): Unit of the result: "km" (default), "m", "feet" or "miles".
    earth_radius_km (float): Sphere radius in kilometres. The default of 6367 keeps the
        results of the previous hard-coded implementation.
        NOTE(review): the IUGG mean Earth radius is 6371.0088 km; confirm which is wanted.

    Returns:
    np.ndarray: Element-wise distance in the requested unit.

    Raises:
    ValueError: If ``unit`` is not one of the supported units. Previously any value other
        than "feet" silently produced kilometres.
    """
    km_to_unit = {"km": 1.0, "m": 1000.0, "feet": 3280.84, "miles": 0.621371}
    if unit not in km_to_unit:
        raise ValueError(f"Unsupported unit {unit!r}; expected one of {sorted(km_to_unit)}")

    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal points),
    # which would make sqrt/arcsin return NaN
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = earth_radius_km * c  # Distance in kilometers
    return km * km_to_unit[unit]


def calculate_overlap(length1: np.ndarray, length2: np.ndarray) -> np.ndarray:
    """Return, element-wise, the shorter of two lateral lengths as a percentage of the longer one."""
    shorter = np.minimum(length1, length2)
    longer = np.maximum(length1, length2)
    ratio = shorter / longer
    return ratio * 100


def get_direction(lat1: np.ndarray, lon1: np.ndarray, lat2: np.ndarray, lon2: np.ndarray) -> np.ndarray:
    """Determine the relative direction of (lat2, lon2) with respect to (lat1, lon1).

    The larger of the two absolute coordinate differences picks the axis:
    latitude gives "North"/"South", longitude gives "East"/"West".

    Parameters:
    lat1, lon1 (np.ndarray): Latitude and longitude of the reference point(s).
    lat2, lon2 (np.ndarray): Latitude and longitude of the point(s) being classified.

    Returns:
    np.ndarray: Array of "North", "South", "East" or "West", one label per element.

    Notes:
    - When the two absolute differences are equal, the East/West branch is used.
    - A longitude difference that is not positive (zero, or NaN) in the East/West
      branch is labelled "West", so identical points come out as "West".
    - The raw coordinate differences are compared; longitude is not scaled by cos(latitude).
    """
    lat_diff = lat2 - lat1
    lon_diff = lon2 - lon1

    mostly_north_south = np.abs(lat_diff) > np.abs(lon_diff)

    # Nested np.where instead of np.select: np.select without `default` falls back to the
    # integer 0, which has no common dtype with the string labels (TypeError on newer NumPy)
    return np.where(
        mostly_north_south,
        np.where(lat_diff > 0, "North", "South"),
        np.where(lon_diff > 0, "East", "West"),
    )


def create_i_k_pairs(df: pd.DataFrame) -> pd.DataFrame:
    """Build one row for every ordered pair of distinct wells (i, k) in ``df``.

    Parameters:
    df (pd.DataFrame): Must contain the columns "ChosenID", "PerfLateralLength",
        "mid_long" and "mid_lat".

    Returns:
    pd.DataFrame: ``n * (n - 1)`` rows (``n = len(df)``) with the columns
        - i_uwi / k_uwi: "ChosenID" of well i and well k
        - ll_perc: result of ``calculate_overlap`` on the two "PerfLateralLength" values
        - lin_dist: ``haversine_distance`` between the two (mid_long, mid_lat) points, in feet
        - ward_of_i: ``get_direction`` of well k relative to well i
        An empty or single-row ``df`` yields an empty frame with these columns.
    """
    n = len(df)

    # Positions of all (i, k) with i != k. np.nonzero walks the n x n mask in row-major
    # order: (0, 1), (0, 2), ..., (1, 0), (1, 2), ... It also returns two empty arrays when n == 0.
    i_idx, k_idx = np.nonzero(~np.eye(n, dtype=bool))

    # Pull each column out once, then index the arrays positionally
    uwi = df["ChosenID"].values
    lateral_length = df["PerfLateralLength"].values
    mid_long = df["mid_long"].values
    mid_lat = df["mid_lat"].values

    ll_perc = calculate_overlap(lateral_length[i_idx], lateral_length[k_idx])

    lin_dist = haversine_distance(
        mid_long[i_idx],
        mid_lat[i_idx],
        mid_long[k_idx],
        mid_lat[k_idx],
        unit="feet"
    )

    ward_of_i = get_direction(
        mid_lat[i_idx],
        mid_long[i_idx],
        mid_lat[k_idx],
        mid_long[k_idx]
    )

    # Create DataFrame
    return pd.DataFrame({
        "i_uwi": uwi[i_idx],
        "k_uwi": uwi[k_idx],
        "ll_perc": ll_perc,
        "lin_dist": lin_dist,
        "ward_of_i": ward_of_i
    })

In [12]:
# Create the i_k_pairs DataFrame for wells that are Horizontals only
# NOTE(review): this call is disabled. As written it passes df_raw unchanged, and no
# horizontal-only filter is applied to df_raw in the visible cells - confirm before re-enabling.
# i_k_pairs = create_i_k_pairs(df_raw)

In [13]:
# i_k_pairs[(i_k_pairs['i_uwi']=='4238939261') & (i_k_pairs['k_uwi']=='4238939263')]

In [None]:
# Show the first row of df_raw
df_raw.head(1)

In [14]:
# Fetch the directional-survey stations of one well (uwi 42389392610000)
# whose inclination is at least 89 from Databricks.
databricks = DatabricksOdbcConnector()

try:
    databricks.connect()

    # Plain string: the query contains no placeholders, so no f-string is needed
    query = """
    select * from ihs_sp.well.well_directional_survey_station
    where uwi = '42389392610000' and inclination >= 89;
    """

    df_directional = databricks.execute_query(query)

except Exception as e:
    print(f"Error: {e}")
    # Re-raise so the notebook stops here; otherwise later cells would run with
    # df_directional undefined or left over from an earlier execution
    raise

finally:
    # Runs on success and on failure
    databricks.close_connection()

  result_df = pd.read_sql(sql_query, self.connection)


In [15]:
# Preview the first two rows returned by the directional-survey query
df_directional.head(2)

Unnamed: 0,azimuth,azimuth_uom,basin,country,county,data_source,depth_obs_no,dog_leg_severity,ew_direction,formation,formation_code,inclination,inclination_uom,inserted_date,latitude,location_ref,longitude,ns_direction,period_obs_no,play,play_type,point_type,province_state,row_id,row_quality,ss_tvd_metric,ss_tvd_metric_uom,ss_tvd_uscust,ss_tvd_uscust_uom,station_md,station_md_metric,station_md_metric_uom,station_md_uom,station_md_uscust,station_md_uscust_uom,station_tvd,station_tvd_metric,station_tvd_metric_uom,station_tvd_uom,station_tvd_uscust,station_tvd_uscust_uom,sub_basin,sub_play,survey_id,updated_date,uwi,vertical_section,vertical_section_metric,vertical_section_metric_uom,vertical_section_uom,vertical_section_uscust,vertical_section_uscust_uom,wkt,geography,x_offset,x_offset_metric,x_offset_metric_uom,x_offset_uom,x_offset_uscust,x_offset_uscust_uom,y_offset,y_offset_metric,y_offset_metric_uom,y_offset_uom,y_offset_uscust,y_offset_uscust_uom
0,191.46,DEG,PERMIAN BASIN,UNITED STATES OF AMERICA,REEVES,IHS NATIVE DATA,206.0,,WEST,,,91.05,DEG,2021-03-30 13:32:30,31.368031,WGS84,-103.3203,SOUTH,,BONE SPRING,SHALE,,TEXAS,423893926100001206,,-2403.8,M,-7886.47,FT,18613.0,5673.2424,M,FT,18613.0,FT,10477.087,3193.41612,M,FT,10477.087,FT,DELAWARE BASIN (PERMIAN BASIN),TEXAS DEEP,1,2024-10-15 05:42:12.621,42389392610000,,,,,,,POINT (-103.320299822 31.368030676),"{""coordinates"":[-103.320299822,31.368030676],""...",977.9,298.06392,M,FT,977.9,FT,7767.5,2367.534,M,FT,7767.5,FT
1,195.6,DEG,PERMIAN BASIN,UNITED STATES OF AMERICA,REEVES,IHS NATIVE DATA,189.0,,WEST,,,91.47,DEG,2021-03-30 13:32:30,31.372268,WGS84,-103.31901,SOUTH,,BONE SPRING,SHALE,,TEXAS,423893926100001189,,-2414.31,M,-7920.98,FT,17013.0,5185.5624,M,FT,17013.0,FT,10511.595,3203.93416,M,FT,10511.595,FT,DELAWARE BASIN (PERMIAN BASIN),TEXAS DEEP,1,2024-10-15 05:42:12.621,42389392610000,,,,,,,POINT (-103.319009585 31.372268086),"{""coordinates"":[-103.319009585,31.372268086],""...",575.66,175.46117,M,FT,575.66,FT,6219.87,1895.81638,M,FT,6219.87,FT


In [16]:
# List the column names of the query result
df_directional.columns

Index(['azimuth', 'azimuth_uom', 'basin', 'country', 'county', 'data_source',
       'depth_obs_no', 'dog_leg_severity', 'ew_direction', 'formation',
       'formation_code', 'inclination', 'inclination_uom', 'inserted_date',
       'latitude', 'location_ref', 'longitude', 'ns_direction',
       'period_obs_no', 'play', 'play_type', 'point_type', 'province_state',
       'row_id', 'row_quality', 'ss_tvd_metric', 'ss_tvd_metric_uom',
       'ss_tvd_uscust', 'ss_tvd_uscust_uom', 'station_md', 'station_md_metric',
       'station_md_metric_uom', 'station_md_uom', 'station_md_uscust',
       'station_md_uscust_uom', 'station_tvd', 'station_tvd_metric',
       'station_tvd_metric_uom', 'station_tvd_uom', 'station_tvd_uscust',
       'station_tvd_uscust_uom', 'sub_basin', 'sub_play', 'survey_id',
       'updated_date', 'uwi', 'vertical_section', 'vertical_section_metric',
       'vertical_section_metric_uom', 'vertical_section_uom',
       'vertical_section_uscust', 'vertical_section_uscust_