In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from io import StringIO
import time
# pd.set_option('display.max_rows', None)

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Root of the RRC web application; hrefs scraped from result pages are appended to it.
base_url = 'https://webapps2.rrc.texas.gov/EWA/'
# Drilling-permit query endpoint, located directly under base_url.
query_url = f'{base_url}drillingPermitsQueryAction.do'

In [3]:
# Workbook holding the wells to look up; the main cell reads its 'API_10' column.
# NOTE(review): absolute, user-specific path - the notebook only runs on a machine where this file exists.
well_list_path = r'C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Scraping\RRC\notebook\Well_List_11_1_2023.xlsx'
df_api = pd.read_excel(well_list_path)
# Hand-picked APIs for spot checks, in order: no amendments, amendment filed without an approval date,
# one amendment, more than three amendments.
# api_list = [4212335341,4238940705,4238940783,4217730498]

#### First Try

In [4]:
def get_headers():
    """
    Build the browser-like (Chrome 119 on Windows) request headers sent to
    webapps2.rrc.texas.gov.

    Returns
    -------
    Dict.
        Header name -> header value. A new dict is created on every call.
    """
    # Kept as ordered pairs so the header order is explicit.
    header_pairs = (
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'),
        ('Accept-Encoding', 'gzip, deflate, br'),
        ('Accept-Language', 'en-US,en;q=0.9'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive'),
        ('Host', 'webapps2.rrc.texas.gov'),
        ('Sec-Fetch-Dest', 'document'),
        ('Sec-Fetch-Mode', 'navigate'),
        ('Sec-Fetch-Site', 'none'),
        ('Sec-Fetch-User', '?1'),
        ('Upgrade-Insecure-Requests', '1'),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'),
        ('sec-ch-ua', "Google Chrome;v='119', 'Chromium';v='119', 'Not?A_Brand';v='24'"),
        ('sec-ch-ua-mobile', "?0"),
        ('sec-ch-ua-platform', "Windows"),
    )

    return dict(header_pairs)


def payload(api):
    """
    Build the form data for a drilling-permit search on a single API number.

    Parameters
    ----------
    api : TYPE, int (a digit string or an integral float is also accepted)
        DESCRIPTION. 10 digit API. The first two characters (the initial '42')
        are stripped; the remaining digits are used as the search value.

    Returns
    -------
    Dict.
        Form fields for the search request. The API value is a digit string so
        that zeros directly after the prefix are kept
        (4200312345 -> '00312345' rather than 312345).

    Raises
    ------
    ValueError
        If what is left after stripping the prefix is not a run of digits.
    """
    # A numeric column with blanks is read by pandas as float (4212335341.0);
    # collapse integral floats so the trailing '.0' never reaches the API value.
    if isinstance(api, float) and api.is_integer():
        api = int(api)

    # Stay in text form: converting to int would discard leading zeros.
    api_no = str(api).strip()[2:]
    if not (api_no.isascii() and api_no.isdigit()):
        raise ValueError(f'Expected a 10 digit numeric API, got {api!r}')

    return {
        'methodToCall': 'search',
        'searchArgs.apiNoHndlr.inputValue': api_no,
        'searchArgs.operatorNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.leaseNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.fieldNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.surveyNameWildcardHndlr.inputValue': 'beginsWith'
    }


def get_data_DrillingPermitQuery(req_url,headers,payload,timeout=None):
    """
    GET the query page to start a session, then POST the search form to the
    same URL and return the POST response.

    Parameters
    ----------
    req_url : TYPE, Str
        DESCRIPTION. URL of the drilling-permit query. Used for both the
        initial GET and the search POST.
    headers : TYPE, Dict
        DESCRIPTION. Request headers sent with both requests.
    payload : TYPE, Dict
        DESCRIPTION. Form data for the search POST.
    timeout : TYPE, float or (connect, read) tuple, optional
        DESCRIPTION. Passed to both requests. The default None waits
        indefinitely, which is the previous behaviour.

    Returns
    -------
    Request Response.
        Response of the POST, or None when the initial GET does not answer
        with HTTP 200 or when any exception is raised (the reason is printed).
    """
    try:
        with requests.session() as s:
            # NOTE(review): TLS certificate verification is switched off for these
            # requests, so the server's identity is not checked.
            s.verify = False
            res_get = s.get(url=req_url,headers=headers,verify=False,timeout=timeout)

            if res_get.status_code != 200:
                print(f'GET {req_url} returned HTTP {res_get.status_code}; search was not submitted')
                return None

            cookies = res_get.cookies
            # POST to the URL that was passed in rather than to a module-level global.
            return s.post(url=req_url,data=payload,cookies=cookies,headers=headers,verify=False,timeout=timeout)

    except Exception as e:
        print(e)
        return None


def parse_query_data(post_resp):
    """
    Parse the drilling-permit search results page and pick the row with the
    latest approval date.

    Parameters
    ----------
    post_resp : TYPE, Requests Response
        DESCRIPTION. Response from the get_data_DrillingPermitQuery()

    Returns
    -------
    Tuple of five items:
        df_main : DataFrame. Results table as scraped, header rows removed.
        df_edit : DataFrame. Copy of df_main with an integer 'API NO.', the
            detail link of each row ('HREF') and parsed 'Submitted_Dt' /
            'Approved_Dt' columns.
        href_link : Str. 'HREF' of the row with the latest 'Approved_Dt'.
        max_apprvd_Dt : Timestamp. That latest approval date.
        idx : int. Row label of that row in df_edit.

    Raises
    ------
    ValueError
        If post_resp is None, if the results table has no permit rows, or if
        no row carries an approval date.
    """
    if post_resp is None:
        raise ValueError('No response to parse: the drilling permit query did not return one.')

    soup = BeautifulSoup(post_resp.content,'html.parser')
    # 'a[href]' skips anchors without an href attribute; indexing those raises KeyError.
    web_links = soup.select('a[href]')
    href = [base_url + web_link['href'] for web_link in web_links if 'drillingPermitDetailAction' in web_link['href']]

    # NOTE(review): table 9 and header row 1 are positional and depend on the page layout.
    df_main = pd.read_html(StringIO(post_resp.text))[9]
    df_main.columns = df_main.iloc[1,:]
    df_main = df_main.drop([0,1]).reset_index(drop=True)

    if df_main.empty:
        raise ValueError('The results table contains no permit rows.')

    df_edit = df_main.copy()
    # Every row is stamped with the API taken from the first row's leading token.
    df_edit['API NO.'] = int(df_main['API NO.'].str.split()[0][0])
    df_edit['HREF'] = href

    # 'Status Date' reads "Submitted: <date> Approved: <date>", so tokens 1 and 3 are the dates.
    # reindex() yields an all-missing column when no row has an approval part,
    # instead of failing on the absent token position.
    date_tokens = df_edit['Status Date'].str.split(expand=True).reindex(columns=[1,3])
    df_edit['Submitted_Dt'] = pd.to_datetime(date_tokens[1])
    df_edit['Approved_Dt'] = pd.to_datetime(date_tokens[3])

    approved = df_edit['Approved_Dt']
    if not approved.notna().any():
        raise ValueError('No permit row has an approval date; cannot pick the latest approved permit.')

    # idxmax() returns the first row holding the maximum, so rows tied on the
    # latest approval date still resolve to a single row.
    apprvd_Dt_max_idx = approved.idxmax()
    max_apprvd_Dt = approved.loc[apprvd_Dt_max_idx]
    href_link = df_edit.loc[apprvd_Dt_max_idx,'HREF']

    return df_main, df_edit, href_link, max_apprvd_Dt, int(apprvd_Dt_max_idx)


def get_data_W1_Form(url_href):
    """
    Fetch the page behind a permit detail link using a one-off session.

    Parameters
    ----------
    url_href : TYPE, Str
        DESCRIPTION. Link returned by parse_query_data()

    Returns
    -------
    Response.
        Response of the GET request.
    """
    request_headers = get_headers()

    # Certificate verification is off on both the session and the request itself.
    with requests.session() as http:
        http.verify = False
        return http.get(url=url_href,headers=request_headers,verify=False)


def parse_W1_Form(get_resp):
    """
    Collect the 'Horizontal Wellbore' and 'Acres' values from the GroupBox1
    tables of a detail page.

    Parameters
    ----------
    get_resp : TYPE, Requests Response
        DESCRIPTION. Response returned by get_data_W1_Form()

    Returns
    -------
    DataFrame
        One row whose columns are the labels that were found; a label seen in
        a later table overwrites the value from an earlier one.
    """
    wanted_labels = ('Horizontal Wellbore', 'Acres')
    extracted = {}

    page = BeautifulSoup(get_resp.text,'html.parser')
    for group_box in page.find_all('table',class_='GroupBox1'):
        header_cells = group_box.find_all('th')
        value_cells = group_box.find_all('td')
        # Headers and values are paired purely by position within the table.
        for header_cell, value_cell in zip(header_cells, value_cells):
            label = header_cell.text
            if label in wanted_labels:
                extracted[label] = value_cell.text

    return pd.DataFrame.from_dict(extracted, orient='index').T

#### Main

In [5]:
api_list = df_api['API_10'].tolist()
# api_list = df_api['API_10'].loc[[5]].tolist()

dfs_main = []
dfs_edit = []
# APIs whose permit query produced no response; they are skipped instead of parsed.
failed_apis = []

# Define Headers
headers = get_headers()

for idx, api in enumerate(api_list):

    payload_api = payload(api=api)

    response_from_DrillingPermit_Query = get_data_DrillingPermitQuery(req_url=query_url,headers=headers,payload=payload_api)

    # The query helper returns None on a non-200 page or a request error. Parsing None
    # would raise and abandon every remaining API, so record this one and move on.
    if response_from_DrillingPermit_Query is None:
        failed_apis.append(api)
        continue

    df_mainPage, df_edit, w1_link, Approved_Dt, idx_Max_Aprvd_Dt = parse_query_data(post_resp=response_from_DrillingPermit_Query)

    response_from_W1_query = get_data_W1_Form(url_href=w1_link)

    df_w1 = parse_W1_Form(get_resp=response_from_W1_query)
    df_w1['Approved_Dt'] = Approved_Dt

    # Left merge: the W-1 columns land only on rows whose Approved_Dt equals the latest approval date.
    df_edit_merge_with_w1 = df_edit.merge(df_w1,on='Approved_Dt',how='left').reset_index(drop=True)

    dfs_edit.append(df_edit_merge_with_w1)
    dfs_main.append(df_mainPage)

    # time.sleep(1)

if failed_apis:
    print(f'{len(failed_apis)} API(s) skipped because the permit query returned no response: {failed_apis}')

In [6]:
# Stack the per-API frames (permit rows plus merged W-1 columns) under one fresh 0..n-1 index.
pd.concat(dfs_edit, ignore_index=True)

Unnamed: 0,API NO.,District,Lease,Well Number,Permitted Operator,County,Status Date,Status Number,Wellbore Profiles,Filing Purpose,Amend,Total Depth,Stacked Lateral Parent Well DP #,Status,HREF,Submitted_Dt,Approved_Dt,Horizontal Wellbore,Acres
0,12335324,02,H. MUELLER 18A,8H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894674,Horizontal,New Drill,N,14000,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-10-19,2023-10-30,,371.03
1,12335325,02,H. MUELLER 18A,9H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894673,Horizontal,New Drill,N,14000,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-10-19,2023-10-30,,371.03
2,12335326,02,H. MUELLER 18A,10H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894675,Horizontal,New Drill,N,14000,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-10-19,2023-10-30,,371.03
3,12335331,02,MARON-RESP-VASB ULW,1,BURLINGTON RESOURCES O & G CO LP(109333),DE WITT,Submitted: 10/26/2023 Approved: 11/03/2023,894844,Horizontal,New Drill,N,17500,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-10-26,2023-11-03,PSA,1050.31
4,12335332,02,MARON-VASB USW A,1,BURLINGTON RESOURCES O & G CO LP(109333),DE WITT,Submitted: 10/26/2023 Approved: 11/03/2023,894845,Horizontal,New Drill,N,17500,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-10-26,2023-11-03,PSA,690.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,49534917,08,BIRDIE 2803D,11H,"CONTINENTAL RESOURCES, INC.(173777)",WINKLER,Submitted: 12/12/2023,894995,Horizontal,New Drill,Y,12800,,NOTICE OF APPLICATION,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-12-12,NaT,,
349,49534918,08,BISON,082H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/14/2023,895073,Horizontal,New Drill,N,10430,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-11-03,2023-11-14,,496.28
350,49534919,08,BISON,081H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/08/2023,895074,Horizontal,New Drill,N,10420,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-11-03,2023-11-08,,496.28
351,49534920,08,BISON,072H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/08/2023,895075,Horizontal,New Drill,N,9830,,APPROVED,https://webapps2.rrc.texas.gov/EWA/drillingPer...,2023-11-03,2023-11-08,,496.28


In [7]:
# Stack the per-API results tables, as scraped, under one fresh 0..n-1 index.
pd.concat(dfs_main, ignore_index=True)

1,API NO.,District,Lease,Well Number,Permitted Operator,County,Status Date,Status Number,Wellbore Profiles,Filing Purpose,Amend,Total Depth,Stacked Lateral Parent Well DP #,Status
0,12335324 Links Images GIS Viewer Completion,02,H. MUELLER 18A,8H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894674,Horizontal,New Drill,N,14000,,APPROVED
1,12335325 Links Images GIS Viewer Completion,02,H. MUELLER 18A,9H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894673,Horizontal,New Drill,N,14000,,APPROVED
2,12335326 Links Images GIS Viewer Completion,02,H. MUELLER 18A,10H,BPX OPERATING COMPANY(085408),DE WITT,Submitted: 10/19/2023 Approved: 10/30/2023,894675,Horizontal,New Drill,N,14000,,APPROVED
3,12335331 Links Images GIS Viewer Completion,02,MARON-RESP-VASB ULW,1,BURLINGTON RESOURCES O & G CO LP(109333),DE WITT,Submitted: 10/26/2023 Approved: 11/03/2023,894844,Horizontal,New Drill,N,17500,,APPROVED
4,12335332 Links Images GIS Viewer Completion,02,MARON-VASB USW A,1,BURLINGTON RESOURCES O & G CO LP(109333),DE WITT,Submitted: 10/26/2023 Approved: 11/03/2023,894845,Horizontal,New Drill,N,17500,,APPROVED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,49534917 Links Images GIS Viewer Completion,08,BIRDIE 2803D,11H,"CONTINENTAL RESOURCES, INC.(173777)",WINKLER,Submitted: 12/12/2023,894995,Horizontal,New Drill,Y,12800,,NOTICE OF APPLICATION
349,49534918 Links Images GIS Viewer Completion,08,BISON,082H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/14/2023,895073,Horizontal,New Drill,N,10430,,APPROVED
350,49534919 Links Images GIS Viewer Completion,08,BISON,081H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/08/2023,895074,Horizontal,New Drill,N,10420,,APPROVED
351,49534920 Links Images GIS Viewer Completion,08,BISON,072H,"AMEREDEV OPERATING, LLC(019802)",WINKLER,Submitted: 11/03/2023 Approved: 11/08/2023,895075,Horizontal,New Drill,N,9830,,APPROVED


In [8]:
# pd.concat(dfs_edit).reset_index(drop=True).to_excel("Results_W1.xlsx",index=False)
# pd.concat(dfs_main).to_excel("Results_mainWebpage.xlsx",index=False)