# Texas Comptroller of Public Accounts - Scraper

## Importing Libraries

In [1]:
# Importing Libraries
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

from io import StringIO

import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import numpy as np

from typing import Optional

import time

## Creating Scraper Class

In [2]:
class _LeaseDropNaturalGas_WebScraper:
    """Selenium-based scraper for the Texas Comptroller natural gas lease drop inquiry.

    The scraper starts a Chrome session on construction, fills in and submits
    the lease inquiry form, and converts the resulting HTML into a cleaned
    pandas DataFrame. Call ``_quit`` when finished to release the browser.
    """

    def __init__(self) -> None:
        """Store the site URLs and XPath locators, then start the Chrome driver."""
        # Site key handed to ``grecaptcha.execute`` in ``_get_recaptcha_token``.
        self.site_key: str = '6Lf6Z5sUAAAAACg7ECAeRMcnAo2_WfoKUeNYXkj_'
        self.login_url: str = 'https://mycpa.cpa.state.tx.us/cong/loginForward.do?phase=check'
        self.ngl_drop_url: str = 'https://mycpa.cpa.state.tx.us/cong/leaseDropNGAction.do'
        # XPath locators of the inquiry form fields, submit button and result table.
        self.xpath_leaseNo: str = '//*[@id="leaseNum"]'
        self.xpath_begDt: str = '//*[@id="begFilPrd"]'
        self.xpath_endDt: str = '//*[@id="endFilPrd"]'
        self.xpath_submitForm: str = '//*[@id="leaseDropNGForm"]/span[7]/p/input'
        self.xpath_lease_table: str = '//*[@id="menucontenttable"]/table/tbody/tr/td[2]/div/table'
        # Annotated with ``webdriver.Chrome`` (the class actually instantiated)
        # because the bare name ``WebDriver`` is never imported in this file.
        self.driver: Optional[webdriver.Chrome] = None
        self._initialize_driver()

    def _initialize_driver(self) -> None:
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        # options.add_argument('--headless')  # Optional: run in headless mode
        options.add_argument('--disable-gpu')  # Optional: disable GPU
        options.add_argument('--no-sandbox')  # Optional: required for some environments

        self.driver = webdriver.Chrome(options=options)

    def _load_page(self) -> None:
        """Open the inquiry page and wait until ``grecaptcha`` is defined.

        Raises:
            RuntimeError: If the WebDriver has not been initialized.
        """
        if self.driver is None:
            raise RuntimeError("WebDriver is not initialized.")

        self.driver.maximize_window()
        # The login URL is visited before the inquiry URL.
        # NOTE(review): presumably this establishes the session - confirm.
        self.driver.get(self.login_url)
        time.sleep(0.8)
        self.driver.get(self.ngl_drop_url)
        # Poll for up to 1 second until the page has defined ``grecaptcha``.
        wait = WebDriverWait(self.driver, 1)
        wait.until(lambda d: d.execute_script("return typeof grecaptcha !== 'undefined'"))

    def _get_recaptcha_token(self) -> str:
        """Reload the inquiry page and request a reCAPTCHA token for ``site_key``.

        Returns:
            str: The value handed back by ``execute_script``.
            NOTE(review): assumes the driver resolves the returned promise
            to the token string - confirm.

        Raises:
            RuntimeError: If the WebDriver has not been initialized.
        """
        if self.driver is None:
            raise RuntimeError("WebDriver is not initialized.")

        self._load_page()

        token = self.driver.execute_script(f'''
            return grecaptcha.execute('{self.site_key}', {{action: 'homepage'}}).then(function(token) {{
                return token;
            }});
        ''')

        return token

    def _get_NGL_Inquiry_html(self, lease_no: str, beg_dt: str, end_dt: str) -> Optional[str]:
        """
        Submit the lease inquiry form and return the resulting page's HTML.

        Args:
            lease_no (str): The 6-digit lease number.
            beg_dt (str): Beginning period (yymm or yy)
            end_dt (str): Ending period (yymm or yy)

        Returns:
            Optional[str]: The HTML content of the page, or ``None`` when the
            lease table could not be located (the error is printed).

        Raises:
            RuntimeError: If the WebDriver has not been initialized.
        """
        if self.driver is None:
            raise RuntimeError("WebDriver is not initialized.")

        self._load_page()

        # Fill lease number, beginning period and ending period, in that order.
        form_values = (
            (self.xpath_leaseNo, lease_no),
            (self.xpath_begDt, beg_dt),
            (self.xpath_endDt, end_dt),
        )
        for xpath, value in form_values:
            self.driver.find_element(By.XPATH, xpath).send_keys(value)

        # Running the Inquiry Form
        time.sleep(0.5)
        self.driver.find_element(By.XPATH, self.xpath_submitForm).click()

        try:
            # Wait (up to 10 seconds) until the lease table is present.
            # ``until`` only returns once the condition is truthy, so no
            # extra check of its result is needed.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, self.xpath_lease_table)
                )
            )
            return self.driver.page_source
        except Exception as e:
            # Best effort: report the problem and signal "no result" with a
            # falsy value so callers can keep using ``if html_content:``.
            print("Lease table not found.", e)
            return None

    def _clear_entry_labels(self) -> None:
        """
        Clear the lease number, beginning period and ending period inputs.

        Returns:
            None

        Raises:
            RuntimeError: If the WebDriver has not been initialized.
        """
        if self.driver is None:
            raise RuntimeError("WebDriver is not initialized.")

        for xpath in (self.xpath_leaseNo, self.xpath_begDt, self.xpath_endDt):
            self.driver.find_element(By.XPATH, xpath).clear()

    def _parse_html(self, html: str, df_raw: bool = False) -> pd.DataFrame:
        """
        Parse the inquiry page HTML and return the cleaned lease DataFrame.

        Args:
            html (str): The HTML page source.
            df_raw (bool): Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            pd.DataFrame: The cleaned up Pandas DataFrame, with a ``prod_dt``
            datetime column taken from the "Period: YYMM" rows.

        Raises:
            IndexError: If fewer than two tables are parsed from ``html``.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Every <table> found by BeautifulSoup is re-serialised and handed to
        # pandas; the second parsed table is the one that gets cleaned.
        tables = pd.read_html(StringIO(str(soup.find_all('table'))))
        df_LeaseNGL_raw = tables[1]

        taxpayer_col = df_LeaseNGL_raw['Primary Taxpayer #']

        # Step 1: Pull the four period digits out of the 'Period: ....' rows.
        # ``str.extract`` already yields NaN for every row that does not match.
        df_LeaseNGL_raw['prod_dt'] = taxpayer_col.str.extract(r'Period: (\d{4})', expand=False)

        # Step 2: Forward fill so each data row carries the period above it.
        df_LeaseNGL_raw['prod_dt'] = df_LeaseNGL_raw['prod_dt'].ffill()

        # Step 3: Convert 'prod_dt' from 'YYMM' to datetime.
        df_LeaseNGL_raw['prod_dt'] = pd.to_datetime(df_LeaseNGL_raw['prod_dt'], format='%y%m')

        # Step 4: Drop the rows whose 'Primary Taxpayer #' contains 'Period'.
        is_period_row = taxpayer_col.str.contains('Period', na=False)
        df_LeaseNGL_cleaned = df_LeaseNGL_raw[~is_period_row].reset_index(drop=True)

        # Step 5: Clean column names: lowercase, drop '#', spaces -> underscores.
        df_LeaseNGL_cleaned.columns = (
            df_LeaseNGL_cleaned.columns.str.lower()
            .str.replace('#', '', regex=False)
            .str.replace(' ', '_', regex=False)
        )

        return df_LeaseNGL_cleaned

    def _quit(self) -> None:
        """Shut down the browser session; does nothing if no driver is set.

        Only ``quit()`` is called. The previous ``close()`` + ``quit()``
        sequence skipped ``quit()`` whenever ``close()`` raised, leaving the
        driver running. ``self.driver`` is reset even if ``quit()`` raises.
        """
        if self.driver is None:
            return

        try:
            self.driver.quit()
        finally:
            self.driver = None

In [13]:
# Testing the scraper

# Reset ``df`` up front so a failed inquiry cannot leave a stale (or
# undefined) frame behind for the cell that displays it.
df = None

scraper = _LeaseDropNaturalGas_WebScraper()

try:
    # Fill the form and get the HTML content
    html_content = scraper._get_NGL_Inquiry_html(lease_no='017147', beg_dt='2301', end_dt='2302')

    # Parse the HTML and get the cleaned DataFrame
    if html_content:
        df = scraper._parse_html(html=html_content)
finally:
    # Always release the browser, even when the inquiry or parsing fails.
    scraper._quit()

In [14]:
# Display the DataFrame assigned by the scraper test cell
df

Unnamed: 0,sub_type,primary_taxpayer_,comm_code,lse_typ,cnty/_dpi,exmt_typ,api_nbr,off_lease,other_party_taxpayer,secondary_tp_name,tax_reimb,ttl_lease_volume,your_volume,your_value,tax_due,gr_volume,gr_value,marketing_cost,net_tax_value,tax_rate,05_tax_due,error_status,prod_dt
0,Pro,13646174337,RG,OIL,7/NO,,,YES,,,NO,48.0,48.0,$203.89,YES,0.0,$0.00,$359.68,$0.00,0.0,$0.00,NO,2023-01-01
1,Pro,17523477697,RG,OIL,165/NO,,,NO,17523480000.0,XTO ENERGY INC.,NO,0.0,1755.0,"$12,838.14",YES,2.0,$14.46,"$12,823.68",$0.00,0.0,$0.00,NO,2023-01-01
2,Pro,17523477697,RS,OIL,165/NO,,,NO,17523480000.0,XTO ENERGY INC.,NO,0.0,34603.0,"$196,970.27",YES,39.0,$222.02,"$28,148.04","$168,600.21",0.0,$0.00,NO,2023-01-01
3,Pro,17523477697,PR,OIL,165/NO,,,NO,11354020000.0,EXXONMOBIL OIL CORPORATION,NO,0.0,41720.0,"$106,352.34",YES,47.0,$120.05,"$12,843.66","$93,388.63",0.0,$0.00,NO,2023-01-01
4,Pro,17523477697,PR,OIL,165/NO,,,NO,12010880000.0,"ENERGY TRANSFER FUEL, LP",NO,0.0,544.0,"$12,356.51",YES,1.0,$13.98,"$1,443.20","$10,899.33",0.0,$0.00,NO,2023-01-01
5,Pro,17523477697,RG,OIL,231/NO,,,NO,17523480000.0,XTO ENERGY INC.,NO,0.0,97433.0,"$503,335.31",YES,110.0,$565.31,"$107,957.68","$394,812.32",0.0,$0.00,NO,2023-01-01
6,Pro,17523477697,RG,OIL,231/NO,,,NO,17605080000.0,TARGA MIDSTREAM SERVICES LLC,YES,0.0,57271.0,"$410,150.22",YES,65.0,$467.48,"$28,552.86","$381,129.88",0.0,$0.00,NO,2023-01-01
7,Pro,17523477697,PR,OIL,231/NO,,,NO,11354020000.0,EXXONMOBIL OIL CORPORATION,NO,0.0,19867.0,"$298,299.83",YES,23.0,$336.69,"$3,428.65","$294,534.49",0.0,$0.00,NO,2023-01-01
8,Pro,17523477697,PR,OIL,231/NO,,,NO,12010880000.0,"ENERGY TRANSFER FUEL, LP",NO,0.0,1789.0,"$56,038.52",YES,3.0,$63.35,$309.05,"$55,666.12",0.0,$0.00,NO,2023-01-01
9,Pro,13646174337,RG,OIL,7/NO,,,YES,,,NO,24.0,24.0,$90.42,YES,0.0,$0.00,"$6,086.49",$0.00,0.0,$0.00,NO,2023-02-01


## Reading Well Header Data from CC

In [5]:
# Load the well header CSV into a pandas DataFrame
df_wellheader_raw = pd.read_csv('well_header.csv', low_memory=False)

# Build a separate frame (the raw one stays untouched) whose column names are
# lower-cased with spaces replaced by underscores
df_wellheader_modified = df_wellheader_raw.rename(
    columns=lambda header: header.lower().replace(' ', '_')
)

In [6]:
# Check the length of the values in lease_number columns
# df_wellheader_modified['lease_number'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0).unique()

In [26]:
# Show every well header row that belongs to lease 01-015328-O
df_wellheader_modified.loc[df_wellheader_modified['lease_number'].eq('01-015328-O')]

Unnamed: 0,well_name,well_number,api_14,chosen_id,abstract,acre_same_zone_spacing,allocation_type,api_10,api_12,aries_id,azimuth,basin,block,casing_id,choke_size,chosen_id_key,completion_design,completion_end_date,completion_start_date,copied_well,country,county/parish,current_operator,current_operator_alias,current_operator_code,current_operator_ticker,data_source,data_pool,date_rig_release,distance_from_base_of_zone,distance_from_top_of_zone,district,drill_end_date,drill_start_date,elevation,elevation_type,field,additive_vol_(1st_job),cluster_count_(1st_job),total_fluid/perf_ll_(1st_job),total_fluid_(1st_job),frac_vendor_(1st_job),max_injection_pressure__(1st_job),max_injection_rate__(1st_job),first_prod_date,first_prod_date_daily,first_prod_date_monthly,total_prop_(1st_job),total_prop/fluid_(1st_job),total_prop/perf_ll_(1st_job),stage_count__(1st_job),first_test_flow_tbg_press,first_test_gas_vol,first_test_gor,first_test_oil_vol,first_test_water_vol,treatment_type_(1st_job),flow_path,fluid_type,footage_in_landing_zone,formation_thickness_mean,gas_gatherer,gas_specific_gravity,created_well,ground_elevation,has_daily_data,has_monthly_data,has_directional_survey,heel_latitude,heel_longitude,hole_direction,hz_well_spacing_any_zone,hz_well_spacing_same_zone,closest_well_id_any_zone,closest_well_id_same_zone,import_type,import_date,initial_respress,initial_restemp,inpt_id,landing_zone,landing_zone_base,landing_zone_top,last_prod_date_monthly,last_prod_date_daily,lateral_length,lease_name,lease_number,lower_perforation,matrix_permeability,measured_depth,num_treatment_records,oil_api_gravity,oil_gatherer,oil_specific_gravity,pad_name,parent_child_any_zone,parent_child_same_zone,percent_in_zone,perf_lateral_length,permit_date,phdwin_id,play,porosity,previous_operator,previous_operator_alias,previous_operator_code,previous_operator_ticker,primary_product,production_method,prop_mesh_size,prop_type,range,recovery_method,additive_vol_(refrac),cluster_count_(refrac),refrac_dat
e,total_fluid/perf_ll_(refrac),total_fluid_(refrac),frac_vendor_(refrac),max_injection_pressure_(refrac),max_injection_rate_(refrac),total_prop_(refrac),total_prop/fluid_(refrac),total_prop/perf_ll_(refrac),stage_count_(refrac),treatment_type_(refrac),rig_name,section,gas_saturation,oil_saturation,spud_date,stage_spacing,state,status,subplay,surface_latitude,surface_longitude,survey,water_saturation,target_formation,thickness,til,toe_latitude,toe_longitude,toe_in_landing_zone,toe_up,additive_vol_(all_jobs),total_cluster_(all_jobs),total_fluid/perf_ll_(all_jobs),total_fluid_(all_jobs),total_prop_(all_jobs),total_prop/fluid_(all_jobs),total_prop/perf_ll_(all_jobs),total_stages_(all_jobs),township,true_vertical_depth,tubing_depth,tubing_id,type_curve_area,upper_perforation,vt_well_spacing_any_zone,vt_well_spacing_same_zone,well_type,import_name,bench,bench_type,rsv_cat,type_curve_type,rsv_cat_reserves,rsv_cat_source,econ_areas,development_area_full,first_prod_source,status_source,aoi_status,custom_text_header_12,custom_text_header_13,custom_text_header_14,custom_text_header_15,custom_text_header_16,custom_text_header_17,custom_text_header_18,custom_text_header_19,custom_text_header_20,custom_text_header_21,custom_text_header_22,custom_text_header_23,custom_text_header_24,custom_text_header_25,risk,producing_months,development_area,str_accrual_nri_%,custom_number_header_5,custom_number_header_6,custom_number_header_7,custom_number_header_8,custom_number_header_9,custom_number_header_10,custom_number_header_11,custom_number_header_12,custom_number_header_13,custom_number_header_14,custom_number_header_15,custom_number_header_16,custom_number_header_17,boe_eur,oil_eur,gas_eur,custom_date_header_1,custom_date_header_2,custom_date_header_3,custom_date_header_4,custom_date_header_5,custom_date_header_6,custom_date_header_7,custom_date_header_8,custom_date_header_9,custom_date_header_10,kmf_wells,chambers_wells,custom_boolean_header_3,custom_boolean_header_4,custom_boolean_he
ader_5,cum_boe,cum_oil,cum_gas,cum_gor,cum_water,cum_mmcfge,cum_boe/perf_ll,cum_gas/perf_ll,cum_oil/perf_ll,cum_water/perf_ll,cum_mmcfge/perf_ll,first_12_boe,first_12_boe/perf_ll,first_12_gas,first_12_gas/perf_ll,first_12_gor,first_12_oil,first_12_oil/perf_ll,first_12_water,first_12_water/perf_ll,first_12_mmcfge,first_12_mmcfge/perf_ll,first_6_boe,first_6_boe/perf_ll,first_6_gas,first_6_gas/perf_ll,first_6_gor,first_6_mmcfge,first_6_mmcfge/perf_ll,first_6_oil,first_6_oil/perf_ll,first_6_water,first_6_water/perf_ll,last_12_boe,last_12_boe/perf_ll,last_12_gas,last_12_gas/perf_ll,last_12_gor,last_12_mmcfge,last_12_mmcfge/perf_ll,last_12_oil,last_12_oil/perf_ll,last_12_water,last_12_water/perf_ll,last_month_boe,last_month_boe/perf_ll,last_month_gas,last_month_gas/perf_ll,last_month_gor,last_month_mmcfge,last_month_mmcfge/perf_ll,last_month_oil,last_month_oil/perf_ll,last_month_water,last_month_water/perf_ll,months_produced,econ_scenario_&_combo,econ_run_date,wi_oil,nri_oil,before_income_tax_cash_flow,first_discount_cash_flow,econ_first_prod_date,undisc_roi,irr,payout_duration,oil_break_even,gas_break_even,oil_shrunk_eur,gas_shrunk_eur,ngl_eur,oil_shrunk_eur/pll,gas_shrunk_eur/pll,ngl_eur/pll,prms_reserves_category,prms_reserves_sub_category,created_at,updated_at
10880,ULLMAN UNIT 2H,2H,42177321290000,4217732129,,,,4217732129,,,,EF,,,,chosenID,,,2011-01-12 00:00:00,False,,GONZALES,EOG,,,,internal,internal,,,,,,,,,EAGLEVILLE,,,,,,,,2011-01-01 00:00:00,,2011-01-15 00:00:00,,,,,,,,,,,,,,,,,,,False,True,,,,H,,,,,api,2024-09-06 01:24:58.281000,,,INPT13gMDY1bU0,,,,2024-06-15 00:00:00,,,ULLMAN UNIT,01-015328-O,16281.0,,,,,,,,,,,5202.0,2010-10-14 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-10-17 00:00:00,,TX,A,,29.286467,-97.427118,COGSWELL MARY,,,,,29.27688,-97.415019,,,,,,,,,,,,,,,2021_12_EF_TC_Area_10,11749.0,,,OIL,External API Import - 09/06/2024 01:24:58 AM,LEF,ACTUAL,01PDP,ACTUAL,,KMF PROD,,EF_19,IHS PROD,,EAGLE FORD AOI,,,,,,,,,,,,,,,100.0,136.0,19.0,0.032676,,,,,,,,,,,,,,175760.641056,129320.87429,278638.600599,,,,,,,,,,,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-05-27 00:00:48.271000,2024-09-06 04:52:52.619000
10881,ULLMAN UNIT 4H,4H,42177321300000,4217732130,,,,4217732130,,,,EF,,,,chosenID,,,2011-01-09 00:00:00,False,,GONZALES,EOG,,,,internal,internal,,,,,,,,,EAGLEVILLE,,,,,,,,2011-01-01 00:00:00,,2011-01-15 00:00:00,,,,,,,,,,,,,,,,,,,False,True,,,,H,,,,,api,2024-09-06 01:24:58.281000,,,INPT9Er6cuTgxa,,,,2024-06-15 00:00:00,,,ULLMAN UNIT,01-015328-O,15742.0,,,,,,,,,,,5399.0,2010-10-19 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-11-09 00:00:00,,TX,A,,29.291523,-97.422518,VONROEDER OTTO,,,,,29.282332,-97.409221,,,,,,,,,,,,,,,2021_12_EF_TC_Area_10,11822.0,,,OIL,External API Import - 09/06/2024 01:24:58 AM,EF,ACTUAL,01PDP,ACTUAL,,KMF PROD,,EF_19,IHS PROD,,EAGLE FORD AOI,,,,,,,,,,,,,,,100.0,161.0,19.0,0.032676,,,,,,,,,,,,,,150478.220405,121342.228875,174815.949181,,,,,,,,,,,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-05-27 00:00:48.272000,2024-09-06 04:52:52.619000
11129,ULLMAN UNIT 3H,3H,42177321590000,4217732159,,,,4217732159,,,,EF,,,,chosenID,,,2011-07-01 00:00:00,False,,GONZALES,EOG,,,,internal,internal,,,,,,,,,EAGLEVILLE,,,,,,,,2011-06-01 00:00:00,,2011-06-15 00:00:00,,,,,,,,,,,,,,,,,,,False,True,,,,H,,,,,api,2024-09-06 01:24:58.281000,,,INPThYNNpB5Jsi,,,,2024-06-15 00:00:00,,,ULLMAN UNIT,01-015328-O,16497.0,,,,,,,,,,,5481.0,2010-12-21 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-04-01 00:00:00,,TX,A,,29.287681,-97.425265,COGSWELL MARY,,,,,29.278126,-97.411972,,,,,,,,,,,,,,,2021_12_EF_TC_Area_10,12551.0,,,OIL,External API Import - 09/06/2024 01:24:58 AM,LEF,ACTUAL,01PDP,ACTUAL,,KMF PROD,,EF_19,IHS PROD,,EAGLE FORD AOI,,,,,,,,,,,,,,,100.0,156.0,19.0,0.032676,,,,,,,,,,,,,,219148.567602,173477.564188,274026.020481,,,,,,,,,,,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-05-27 00:02:06.838000,2024-09-06 04:52:52.620000
11783,ULLMAN UNIT 1H,1H,42177321520000,4217732152,,,,4217732152,,,,EF,,,,chosenID,,,2011-06-27 00:00:00,False,,GONZALES,EOG,,,,internal,internal,,,,,,,,,EAGLEVILLE,,,,,,,,2011-06-01 00:00:00,,2011-06-15 00:00:00,,,,,,,,,,,,,,,,,,,False,True,,,,H,,,,,api,2024-09-06 01:24:58.281000,,,INPT4HVQyRPxpT,,,,2024-06-15 00:00:00,,,ULLMAN UNIT,01-015328-O,16906.0,,,,,,,,,,,5702.0,2010-12-17 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-04-28 00:00:00,,TX,A,,29.28292,-97.429788,COGSWELL MARY,,,,,29.273148,-97.415804,,,,,,,,,,,,,,,2021_12_EF_TC_Area_10,11585.0,,,OIL,External API Import - 09/06/2024 01:24:58 AM,LEF,ACTUAL,01PDP,ACTUAL,,KMF PROD,,EF_19,IHS PROD,,EAGLE FORD AOI,,,,,,,,,,,,,,,100.0,156.0,19.0,0.032676,,,,,,,,,,,,,,233789.775545,194971.100437,232912.050648,,,,,,,,,,,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-05-27 00:15:47.448000,2024-09-06 04:52:52.619000


In [25]:
# Minimum first_prod_date per lease_number, first five leases only
(
    df_wellheader_modified
    .groupby('lease_number', as_index=False)['first_prod_date']
    .min()
    .head(5)
)

Unnamed: 0,lease_number,first_prod_date
0,01-015328-O,2011-01-01 00:00:00
1,01-015349-O,2011-02-01 00:00:00
2,01-015702-O,2011-04-01 00:00:00
3,01-015746-O,2012-03-01 00:00:00
4,01-015886-O,2012-04-01 00:00:00
