# Analyzing Well Bundles

## 1. Importing / Installing Packages

In [1]:
# Standard library
import re  # regular expression support
from typing import List

# Third-party
import pandas as pd

# Display configuration: None removes the cap on how many columns are shown
pd.set_option('display.max_columns', None)

## 2. Loading Excel/CSV into a Pandas DataFrame

In [2]:
df_raw = pd.read_csv(
    'wellHeader_with_Cluster.csv',
    dtype={'ChosenID': str},  # read ChosenID as text instead of letting pandas infer a numeric type
    parse_dates=['FirstProdDate', 'Comp_Dt'],  # parse both date columns as datetimes
)

In [3]:
# The 'cluster' column is referred to as 'bundle' from here on
df_raw = df_raw.rename({'cluster': 'bundle'}, axis='columns')

In [4]:
# Sanity check: (number of rows, number of columns) of the loaded frame
df_raw.shape

(2514, 20)

## 3. Data Preprocessing

### 3.1. Data Cleaning


In [5]:
def reorder_columns(df: pd.DataFrame, columns_to_move: List[str], reference_column: str) -> pd.DataFrame:
    """
    Reorders the columns of a dataframe by moving specified columns next to a reference column.

    The moved columns are placed immediately to the right of the reference column, in the
    order they appear in `columns_to_move`. All other columns keep their relative order.
    The input dataframe is not modified; a reordered selection is returned.

    Parameters:
    df (pd.DataFrame): The dataframe whose columns need to be reordered.
    columns_to_move (List[str]): The names of the columns to move. Repeated names are
        collapsed to their first occurrence.
    reference_column (str): The name of the column next to which the specified columns should be placed.

    Returns:
    pd.DataFrame: The dataframe with reordered columns.

    Raises:
    ValueError: If any of the specified columns (or the reference column) is not in the
        dataframe, or if the reference column is itself listed in `columns_to_move`.
    """
    columns_order: List[str] = df.columns.tolist()  # Get current column order as a list
    if not all(col in columns_order for col in columns_to_move) or reference_column not in columns_order:
        raise ValueError("Specified columns must exist in the dataframe")
    if reference_column in columns_to_move:
        # A column cannot be placed "next to" itself; the request is ambiguous.
        raise ValueError("reference_column cannot be one of the columns to move")

    # Collapse repeated names (keeping first-seen order) so a duplicate entry
    # cannot insert the same column twice.
    to_move: List[str] = list(dict.fromkeys(columns_to_move))

    # Everything that stays put, in its original relative order
    remaining: List[str] = [col for col in columns_order if col not in to_move]

    # Locate the reference column AFTER the removal: if a moved column sat to the
    # left of the reference, the reference has shifted and an index taken from
    # the original order would point one slot too far to the right.
    ref_idx: int = remaining.index(reference_column)

    # Splice the moved columns in directly after the reference column
    new_order: List[str] = remaining[:ref_idx + 1] + to_move + remaining[ref_idx + 1:]

    # Reorder the dataframe columns
    return df[new_order]

In [6]:
# Create the DSU column from the LeaseName column

def _lease_to_dsu(lease_name):
    """
    Derive a DSU name from a single lease name.

    Keeps the text that precedes the first digit (or the whole name when it
    starts with a digit), removes every character that is not a letter or
    whitespace, and strips leading/trailing whitespace,
    e.g. 'IGUANA UNIT 6B62' -> 'IGUANA UNIT'.

    Missing lease names are returned unchanged so they stay missing instead of
    turning into the literal string 'nan'.
    """
    if pd.isna(lease_name):
        return lease_name

    text = str(lease_name)
    leading_text = re.match(r'([^\d]+)', text)  # Text before the first digit, if any
    base = leading_text.group(1) if leading_text else text  # Use original if it starts with a digit
    return re.sub(r'[^a-zA-Z\s]', '', base).strip()  # Keep letters and spaces only


df_raw['DSU'] = df_raw['LeaseName'].apply(_lease_to_dsu)

# Placing DSU next to LeaseName
df_raw = reorder_columns(df=df_raw, columns_to_move=['DSU'], reference_column='LeaseName')

In [7]:
# Preview the first rows to check the derived DSU values and that DSU sits right after LeaseName
df_raw.head()

Unnamed: 0,ChosenID,Basin,WellName,County,LeaseName,DSU,bundle,LeaseNumber,Field,RES_CAT,Landing_Zone,PerfLateralLength,FirstProdDate,Comp_Dt,HoleDirection,SurfaceLatitude,SurfaceLongitude,BH_Lat,BH_Long,mid_lat,mid_long
0,4238939261,DB,JOHN PHILLIP LONG STATE 18-19 A 134H,REEVES,JOHN PHILLIP LONG STATE 18-19 A,JOHN PHILLIP LONG STATE,0,08-292705-G,PHANTOM,01PDP,3RD BS,11162.0,2021-09-01,2021-09-01,H,31.389298,-103.317163,31.358963,-103.321829,31.37413,-103.319496
1,4238939263,DB,JOHN PHILLIP LONG STATE 18-19 C 233H,REEVES,JOHN PHILLIP LONG STATE 18-19 C,JOHN PHILLIP LONG STATE,0,08-292616-G,PHANTOM,01PDP,WCA,11151.0,2021-09-01,2021-09-02,H,31.389406,-103.317137,31.359537,-103.324733,31.374471,-103.320935
2,4238939262,DB,JOHN PHILLIP LONG STATE 18-19 B 221H,REEVES,JOHN PHILLIP LONG STATE 18-19 B,JOHN PHILLIP LONG STATE,0,08-292637-G,PHANTOM,01PDP,WCA,11122.0,2021-09-01,2021-09-02,H,31.38919,-103.317189,31.360021,-103.327567,31.374605,-103.322378
3,4238939343,DB,ALTAI 23 1BS,REEVES,ALTAI 23,ALTAI,1,08-056710-O,HOEFS T-K,01PDP,3RD BS,11750.0,2021-11-01,2021-11-18,H,31.37154,-103.261983,31.403367,-103.256202,31.387453,-103.259093
4,4238939338,DB,IGUANA UNIT 6B62 62H,REEVES,IGUANA UNIT 6B62,IGUANA UNIT,2,08-056327-O,SANDBAR,01PDP,2ND BS,8433.0,2021-09-01,2021-09-13,H,31.264904,-103.177265,31.28802,-103.177203,31.276462,-103.177234
