In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import folium

from sklearn.cluster import DBSCAN
from scipy.spatial import ConvexHull

from bayes_opt import BayesianOptimization

from src.database_manager import SQLTableManager
from src.custom_logger import CustomLogger

## Initializing the Logger

In [2]:
# Create an instance of CustomLogger with logger name and log directory
# The three positional arguments are passed as-is to the project-local
# CustomLogger (src.custom_logger); the second one ("clustering") appears to be
# the tag shown in square brackets in the log output of later cells.
# NOTE(review): the log directory is an absolute, user-specific Windows path, so
# this cell only works on the original author's machine — consider a path
# relative to the project root or a configurable LOG_DIR constant.
logger_instance = CustomLogger("well_clustering","clustering",r"C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Parent_Child_Spacing\src\logs")

# Get the logger
# `logger` is the handle used by later cells (logger.debug / logger.error).
logger = logger_instance.get_logger()

## Data Import

### Reading WellHeader EXCEL to Df

In [3]:
# Reading WellHeader excel file to dataframe
# ChosenID is forced to str so the identifiers keep their exact text (they are
# later interpolated into the SQL IN (...) list of the Conduit query).
# NOTE(review): the relative path is resolved against the kernel's working
# directory — the notebook must be started from the folder holding the file.
df_wellheader = pd.read_excel("WellHeader.xlsx",dtype={'ChosenID':str})

### Initializing the Database Class

In [4]:
# Project-local database helper (src.database_manager). Later cells call
# sql.connect(), sql.execute_query(...) and sql.close_connection() on it.
sql = SQLTableManager()

### Importing Lat Long from Conduit DB

In [5]:
# Build the query text BEFORE entering the try block, so `query` is always bound
# when the except handler formats its error message (previously a failure in
# sql.connect() raised a NameError there and hid the real error).
#
# The IN (...) list is rendered explicitly rather than with tuple(...):
#   * a one-element tuple renders as ('x',) and an empty one as () — invalid SQL
#   * missing IDs (NaN) would render as the bare token nan
#   * embedded single quotes are doubled so an ID cannot end the SQL literal
chosen_ids = df_wellheader['ChosenID'].dropna().unique()
chosen_id_sql_list = ", ".join(
    "'" + str(chosen_id).replace("'", "''") + "'" for chosen_id in chosen_ids
) or "NULL"  # `in (NULL)` is valid SQL and matches no rows when there are no IDs

query = f'''
                select 
                    cha.ChosenID,
                    cha.LeaseName,
                    cha.LeaseNumber,
                    cha.Field,
                    cha.CustomString2 as RES_CAT,
                    cha.CustomString0 as Landing_Zone,
                    cha.PerfLateralLength,
                    cha.FirstProdDate,
                    cha.HoleDirection,
                    cha.SurfaceLatitude,
                    cha.SurfaceLongitude,
                    cha.ToeLatitude as BH_Lat,
                    cha.ToeLongitude as BH_Long
                from Conduit.dbo.CCWellHeaderAnalysis cha
                where cha.ChosenID in ({chosen_id_sql_list})
            '''

try:
    sql.connect()

    logger.debug("Reading Conduit SQL Query")

    df_conduit_wellheader = sql.execute_query(query)

    # Converting First Prod Column to PandasDatetime
    df_conduit_wellheader['FirstProdDate'] = pd.to_datetime(df_conduit_wellheader['FirstProdDate'])

    logger.debug(f"Succesfully read Conduit SQL Query to df_conduit_wellheader. Top 5 Rows:\n----------------------------------------\n{df_conduit_wellheader.head()}\n----------------------------------------\n")

except Exception as e:
    logger.error(f"Error occured while reading {query} from MsSQL server. Error Details:{e}")
    # Re-raise after logging: every later cell needs df_conduit_wellheader, and
    # continuing would either fail with a NameError or silently reuse a stale
    # frame left in the kernel by an earlier run.
    raise
finally:
    # Always release the connection, whether the query succeeded or not
    sql.close_connection()

[clustering] DEBUG (05-31 08:44 PM): Reading Conduit SQL Query (Line: 4) [2342561901.py]

  result_df = pd.read_sql(sql_query, self.connection)
[clustering] DEBUG (05-31 08:44 PM): Succesfully read Conduit SQL Query to df_conduit_wellheader. Top 5 Rows:
----------------------------------------
     ChosenID                        LeaseName  LeaseNumber      Field  \
0  4238939261  JOHN PHILLIP LONG STATE 18-19 A  08-292705-G    PHANTOM   
1  4238939263  JOHN PHILLIP LONG STATE 18-19 C  08-292616-G    PHANTOM   
2  4238939262  JOHN PHILLIP LONG STATE 18-19 B  08-292637-G    PHANTOM   
3  4238939343                         ALTAI 23  08-056710-O  HOEFS T-K   
4  4238939338                 IGUANA UNIT 6B62  08-056327-O    SANDBAR   

  RES_CAT Landing_Zone  PerfLateralLength FirstProdDate HoleDirection  \
0   01PDP       3RD BS            11162.0    2021-09-01             H   
1   01PDP          WCA            11151.0    2021-09-01             H   
2   01PDP          WCA            11122.0

In [6]:
# Merging conduit dataframe with wellheader dataframe
# Left join: every row of df_wellheader is kept; Conduit columns are NaN for
# wells the query did not return.
# NOTE(review): no `on=` is given, so pandas joins on ALL column names the two
# frames share. In that mode there are no overlapping non-key columns left, so
# `suffixes` never takes effect. If the intent is to join on the well id only,
# this probably wants on='ChosenID' — confirm against the WellHeader.xlsx columns.
df_wellheader_merge_conduit = df_wellheader.merge(df_conduit_wellheader,how='left', suffixes=['_df_WellHeader', '_df_Conduit'])

In [7]:
# Representative location of each well: the arithmetic midpoint between the
# surface location and the toe (BH_Lat / BH_Long). When the toe coordinate is
# missing, the surface coordinate is used on its own.
df_wellheader_merge_conduit['mid_lat'] = np.where(
    df_wellheader_merge_conduit['BH_Lat'].notna(),
    (df_wellheader_merge_conduit['SurfaceLatitude'] + df_wellheader_merge_conduit['BH_Lat']) / 2,
    df_wellheader_merge_conduit['SurfaceLatitude'],
)
df_wellheader_merge_conduit['mid_long'] = np.where(
    df_wellheader_merge_conduit['BH_Long'].notna(),
    (df_wellheader_merge_conduit['SurfaceLongitude'] + df_wellheader_merge_conduit['BH_Long']) / 2,
    df_wellheader_merge_conduit['SurfaceLongitude'],
)

## EDA

In [8]:
# df_wellheader_merge_conduit.plot(kind='scatter',x='SurfaceLatitude',y='SurfaceLongitude',figsize=(10,6))

## DBSCAN

### Running Bayesian Optimization

In [9]:
def haversine_distance(lon1, lat1, lon2, lat2,**kwargs):
    """
    Calculate the great circle distance, in kilometres, between two points
    (or two sets of points) on the earth, specified in decimal degrees.

    Parameters
    ----------
    lon1, lat1 : scalar, array-like or pandas.Series
        Longitude / latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar, array-like or pandas.Series
        Longitude / latitude of the second point(s), in decimal degrees.
        Non-scalar arguments must have equal length (or broadcast together).
    **kwargs
        Accepted and ignored, so callers can forward extra options freely.

    Returns
    -------
    Distance in km, with the same shape/type as the broadcast inputs.

    Notes
    -----
    Uses an earth radius of 6367 km. dbscan_cluster in this notebook converts
    km to radians with 6371.0088 km, so the two differ by roughly 0.06 %.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN. Clamp it.
    # Plain ufuncs are used so scalars, arrays and Series are all handled alike.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km


def dbscan_cluster(latitudes,longitudes,epsilon,min_samples,**kwargs):
    '''
    Function to perform DBSCAN clustering for given parameters.

    Parameters
    ----------
    latitudes, longitudes : array-like or pandas.Series
        Coordinates in decimal degrees, one entry per point, equal length.
    epsilon : float
        Neighbourhood radius in kilometres (converted to radians internally,
        because the haversine metric works on radians).
    min_samples : int
        Passed to DBSCAN as `min_samples`.
    **kwargs
        Extra keyword arguments forwarded to sklearn.cluster.DBSCAN.

    Returns
    -------
    pandas.Series
        One cluster label per input point. When `latitudes` is a Series the
        result carries the SAME index, so `df['cluster'] = dbscan_cluster(...)`
        lines up row by row even if the frame's index has gaps (for example
        after drop_duplicates). For other inputs the index is 0..n-1.
    '''

    # convert epsilon from km to radians (local name: do not rebind the argument)
    kms_per_radian = 6371.0088
    epsilon_radians = epsilon / kms_per_radian

    # set up the algorithm
    dbscan = DBSCAN(
        eps = epsilon_radians,
        min_samples = min_samples,
        algorithm = 'ball_tree',
        metric = 'haversine',
        **kwargs
    )

    # the haversine metric expects rows of [latitude, longitude] in radians
    coordinates = np.radians(np.column_stack([latitudes, longitudes]))

    # fit the algorithm
    dbscan.fit(coordinates)

    # return the cluster labels, aligned with the input's index when it has one
    label_index = latitudes.index if isinstance(latitudes, pd.Series) else None
    return pd.Series(dbscan.labels_, index=label_index)


def vertex_centroid_distance(latitudes,longitudes,**kwargs):
    '''
    Function to calculate the average distance (km) from the vertices of a convex
    hull (derived from latitude x longitude pairs) to the centroid of the points.

    Centroid is taken to be the unweighted average of all co-ordinate pairs.

    Parameters
    ----------
    latitudes, longitudes : pandas.Series
        Coordinates in decimal degrees, equal length (positional `.iloc`
        access is used, so Series are required).
    **kwargs
        Forwarded to scipy.spatial.ConvexHull only (e.g. `qhull_options`).
        haversine_distance ignores extra keywords, so nothing is lost by not
        forwarding them there as well.

    Returns
    -------
    float
        Mean vertex-to-centroid distance; 0.0 when it is undefined (NaN),
        e.g. for empty input.
    '''
    # Local import: the notebook's import cell only brings in ConvexHull
    from scipy.spatial import QhullError

    # co-ordinates of centre
    # take a simple average
    centre_long = longitudes.mean()
    centre_lats = latitudes.mean()

    # By default measure from every point; this is what is wanted for fewer
    # than three points, where no hull exists.
    vertex_longs = longitudes
    vertex_lats = latitudes

    if len(latitudes) >= 3:
        try:
            convex_hull = ConvexHull(np.column_stack([latitudes, longitudes]), **kwargs)
        except QhullError:
            # Degenerate geometry (all points collinear or coincident) has no
            # 2-D hull; keep the all-points fallback instead of crashing.
            pass
        else:
            # restrict to the co-ordinates of the hull vertices
            vertex_longs = longitudes.iloc[convex_hull.vertices]
            vertex_lats = latitudes.iloc[convex_hull.vertices]

    mean_distance = haversine_distance(
        vertex_longs,
        vertex_lats,
        centre_long,
        centre_lats).mean()

    # return average distance
    return mean_distance if not np.isnan(mean_distance) else 0.0


def calculate_average_values_in_disctionary(dictionary:dict):
    """Return the negated arithmetic mean of the dictionary's values.

    An empty (or otherwise falsy) dictionary yields None.
    """
    if not dictionary:
        return None

    total = sum(dictionary.values())
    mean_value = total / len(dictionary)
    return -1 * mean_value

In [10]:
def black_box_function(epsilon, min_samples):
    """Objective for the Bayesian optimiser: negated mean cluster spread.

    Clusters the unique well midpoints with DBSCAN for the given parameters,
    measures each cluster's mean hull-vertex-to-centroid distance (km) and
    returns minus the average of those distances, so that maximising the
    target favours tighter clusters.

    Parameters
    ----------
    epsilon : float
        DBSCAN neighbourhood radius in km.
    min_samples : float
        Continuous value proposed by the optimiser; truncated with int()
        before being handed to DBSCAN.

    Returns
    -------
    float or None
        Negated average distance; None when there are no points to cluster.

    Notes
    -----
    NOTE(review): the DBSCAN noise label (-1) is treated like any other
    cluster here, and tiny clusters score a distance near 0, so the target
    appears to be pushed towards the smallest epsilon / min_samples — confirm
    this is the intended objective.
    """
    df = (
        df_wellheader_merge_conduit[['mid_lat','mid_long']]
        .dropna()                   # DBSCAN cannot take NaN coordinates
        .drop_duplicates()
        .reset_index(drop=True)     # contiguous index after dropping rows
    )

    # Assign labels by POSITION. The labels come back as a Series, and a plain
    # Series assignment aligns on index, which scrambled labels / introduced
    # NaN when the frame's index had gaps after drop_duplicates.
    df['cluster'] = dbscan_cluster(latitudes=df['mid_lat'],longitudes=df['mid_long'],
                                   epsilon=epsilon,min_samples=int(min_samples)).to_numpy()

    # sort=False keeps the clusters in order of first appearance
    vertex_dist = {
        cluster: vertex_centroid_distance(latitudes=df_cluster_lat_longs['mid_lat'],
                                          longitudes=df_cluster_lat_longs['mid_long'])
        for cluster, df_cluster_lat_longs in df.groupby('cluster', sort=False)
    }

    return calculate_average_values_in_disctionary(vertex_dist)

In [33]:
# Bounded region of parameter space
# epsilon: DBSCAN neighbourhood radius in km (dbscan_cluster converts it to radians).
# min_samples: searched as a continuous value and truncated with int() inside
# black_box_function, so the (1, 3) range effectively tries 1 and 2
# (3 only at the exact upper bound).
pbounds = {'epsilon': (0.1, 0.5), 'min_samples': (1, 3)}

# Optimiser over black_box_function within pbounds.
# random_state=0 fixes the seed (presumably so repeated runs propose the same points).
optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=0,
    allow_duplicate_points=True
)

In [34]:
# Run the optimisation with the library's default settings (no arguments passed).
# The iteration table is printed as this cell's output.
optimizer.maximize()

|   iter    |  target   |  epsilon  | min_sa... |
-------------------------------------------------
| [0m1        [0m | [0m-3.639   [0m | [0m0.3195   [0m | [0m2.43     [0m |
| [0m2        [0m | [0m-3.685   [0m | [0m0.3411   [0m | [0m2.09     [0m |
| [95m3        [0m | [95m-3.615   [0m | [95m0.2695   [0m | [95m2.292    [0m |
| [0m4        [0m | [0m-3.661   [0m | [0m0.275    [0m | [0m2.784    [0m |
| [95m5        [0m | [95m-2.283   [0m | [95m0.4855   [0m | [95m1.767    [0m |
| [0m6        [0m | [0m-2.309   [0m | [0m0.5      [0m | [0m1.58     [0m |
| [95m7        [0m | [95m-1.172   [0m | [95m0.1571   [0m | [95m1.546    [0m |
| [0m8        [0m | [0m-3.951   [0m | [0m0.4326   [0m | [0m2.28     [0m |
| [95m9        [0m | [95m-0.7098  [0m | [95m0.1      [0m | [95m1.228    [0m |
| [0m10       [0m | [0m-3.636   [0m | [0m0.3177   [0m | [0m2.646    [0m |
| [0m11       [0m | [0m-0.8445  [0m | [0m0.111    [0m | [

### Final DB Cluster

In [27]:
def plot_clusters(df:pd.DataFrame,eps:float,min_samp:int):
    """Cluster the wells with DBSCAN and draw them on a folium map.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'mid_lat', 'mid_long' and 'WellName' columns. The frame
        is not modified; a copy is clustered. Coordinates are assumed to be
        non-null (DBSCAN and the map markers cannot handle NaN).
    eps : float
        DBSCAN neighbourhood radius in km.
    min_samp : int
        DBSCAN `min_samples`.

    Returns
    -------
    (folium.Map, pd.DataFrame)
        The map with one circle marker per row, coloured by cluster, and the
        copy of `df` with an added 'cluster' column.
    """
    df_result = df.copy()

    # Assign labels by position so they stay correct whatever index `df` has
    # (a Series assignment would align on index instead).
    df_result['cluster'] = dbscan_cluster(latitudes=df_result['mid_lat'],longitudes=df_result['mid_long'],
                                epsilon=eps, min_samples=min_samp).to_numpy()

    m = folium.Map(location=[df_result['mid_lat'].mean(), df_result['mid_long'].mean()],
                tiles="OpenStreetMap", zoom_start=11)

    # Unique labels, computed once, in order of first appearance
    cluster_labels = df_result['cluster'].unique()
    n_clusters = len(cluster_labels)

    # 'hsv' is a cyclic colormap: its first and last colours are almost the
    # same red. Sample it at i / n (never reaching 1.0) so no two clusters
    # share a colour through the wrap-around.
    cmap = plt.get_cmap('hsv')

    # Create a color dictionary for each unique cluster
    colors = {
        cluster: matplotlib.colors.rgb2hex(cmap(i / n_clusters))
        for i, cluster in enumerate(cluster_labels)
    }

    # Add a circle marker for each point
    for _, row in df_result.iterrows():
        folium.CircleMarker(
            location=[row['mid_lat'], row['mid_long']],
            radius=5, # Defines the radius of the circle marker
            color="white",
            fill=True,
            popup = f"{row['WellName']}, Cluster: {row['cluster']}",
            fill_color=colors[row['cluster']],
            fill_opacity=1
        ).add_to(m)

    # Return the map
    return m, df_result

In [37]:
# Final clustering and map. eps=0.1 / min_samp=1 match the best row of the
# optimiser table above (iter 9: epsilon 0.1, min_samples 1.228 -> int 1).
# NOTE(review): the values are hard-coded, so they will not follow a re-run of
# the optimiser — consider reading them from the optimiser result instead.
m,df_with_clusters = plot_clusters(df=df_wellheader_merge_conduit, eps=0.1, min_samp=1)

In [38]:
# Bare expression: renders the folium map as this cell's output
m

In [42]:
# Copy the clustered frame to the system clipboard as comma-separated text
# (without the index), e.g. for pasting into a spreadsheet.
# NOTE(review): this needs a clipboard to be available and leaves no file
# behind — consider to_csv(...) if the result must be reproducible.
df_with_clusters.to_clipboard(sep=',',index=False)