In [19]:
import os
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
# pd.set_option('display.max_rows', None)

In [2]:
# Base address of the RRC site; relative links scraped from result pages are prefixed with it.
base_url = 'https://webapps2.rrc.texas.gov/EWA/'
# Drilling-permit query page: fetched first with GET, then used for the search POST.
query_url = f'{base_url}drillingPermitsQueryAction.do'

In [3]:
# Location of the well-list workbook. The default is the original author's local copy;
# set the WELL_LIST_PATH environment variable to run the notebook on another machine.
WELL_LIST_PATH = os.environ.get(
    'WELL_LIST_PATH',
    r'C:\Users\Apoorva.Saxena\OneDrive - Sitio Royalties\Desktop\Project - Apoorva\Python\Scraping\RRC\notebook\Well_List_11_1_2023.xlsx',
)
df_api = pd.read_excel(WELL_LIST_PATH)

In [4]:
# Preview the first rows to confirm the workbook loaded; the API numbers used below come from the API_10 column.
df_api.head()

Unnamed: 0,API_10
0,4212335324
1,4212335325
2,4212335326
3,4212335331
4,4212335332


In [5]:
# api_list = [4212335341,4238940705,4238940783,4217730498] # No amendments, amendment filed with no approval date, one amendment, more than 3 amendments

#### First Try

In [6]:
def get_headers():
    """
    Build the browser-like HTTP headers sent with each request to the RRC site.

    Returns
    -------
    Dict.
        A new dictionary on every call, keyed by header name.
    """
    # Content negotiation and connection headers.
    standard_fields = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'webapps2.rrc.texas.gov',
    }

    # Fetch-metadata headers.
    fetch_fields = {
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
    }

    # Browser identification (user agent and client hints).
    client_fields = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': "Google Chrome;v='119', 'Chromium';v='119', 'Not?A_Brand';v='24'",
        'sec-ch-ua-mobile': "?0",
        'sec-ch-ua-platform': "Windows",
    }

    # Merging in this order keeps the header order of the original single literal.
    return {**standard_fields, **fetch_fields, **client_fields}


def payload(api):
    """
    Build the form data for a drilling-permit search by API number.

    Parameters
    ----------
    api : TYPE, int
        DESCRIPTION. API number including the initial '42'. The first two
        characters are stripped unconditionally before the value is sent.

    Returns
    -------
    Dict.
        Form fields for the search POST. The API value is a string so that
        leading zeros in the remaining digits are kept (converting to int
        would turn '00312345' into 312345).

    Raises
    ------
    ValueError
        If `api` is not integer-like or nothing remains after the prefix.
    """
    # int() first so that inputs such as '4212335324' or 4212335324.0 are
    # normalised to plain digits before the prefix is sliced off.
    api_without_prefix = str(int(api))[2:]
    if not api_without_prefix:
        raise ValueError(f'API number {api!r} has no digits after the 2-character prefix')

    return {
        'methodToCall': 'search',
        'searchArgs.apiNoHndlr.inputValue': api_without_prefix,
        'searchArgs.operatorNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.leaseNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.fieldNameWildcardHndlr.inputValue': 'beginsWith',
        'searchArgs.surveyNameWildcardHndlr.inputValue': 'beginsWith'
    }


def get_data_DrillingPermitQuery(req_url,headers,payload):
    """
    Fetch the query page to start a session, then submit the permit search.

    Parameters
    ----------
    req_url : TYPE, Str
        DESCRIPTION. URL of the drilling-permit query page. It is requested
        with GET first and then receives the search POST.
    headers : TYPE, Dict
        DESCRIPTION. Request headers sent with both requests.
    payload : TYPE, Dict
        DESCRIPTION. Form fields for the search POST.

    Returns
    -------
    Request Response.
        Response to the POST. None when the initial GET does not return
        status 200 or when a request raises; the reason is printed.
    """
    try:
        with requests.session() as s:
            # NOTE(review): verify=False disables TLS certificate checking for this GET,
            # while the POST below uses the default; confirm which setting is intended.
            res_get = s.get(url=req_url,headers=headers,verify=False)

            if res_get.status_code != 200:
                print(f'GET {req_url} returned status {res_get.status_code}; search was not submitted.')
                return None

            # POST to the URL that was passed in rather than a module-level global.
            # The session already holds the cookies set by the GET, so they are not passed again.
            return s.post(url=req_url,data=payload,headers=headers)

    except Exception as e:
        print(f'Drilling permit query to {req_url} failed: {e}')
        return None


def parse_query_data(post_resp):
    """
    Parse the permit search results page and locate the latest approved permit.

    Parameters
    ----------
    post_resp : TYPE, Requests Response
        DESCRIPTION. Response from the get_data_DrillingPermitQuery()

    Returns
    -------
    Tuple (df_main, df_edit, href_link, max_apprvd_Dt, apprvd_Dt_max_idx)
        df_main : DataFrame. Results table with its header row promoted.
        df_edit : DataFrame. Copy of df_main with 'API NO.' as int and the
            added columns 'HREF', 'Submitted_Dt' and 'Approved_Dt'.
        href_link : Str. Detail-page URL of the row with the latest approval date.
        max_apprvd_Dt : Timestamp. The latest approval date.
        apprvd_Dt_max_idx : int. Row index of that permit in df_edit
            (the first such row when several share the date).

    Raises
    ------
    ValueError
        If no row carries an approval date.
    """
    soup = BeautifulSoup(post_resp.content,'html.parser')
    # 'a[href]' skips anchors that have no href attribute, which would raise KeyError below.
    href = [base_url + web_link['href'] for web_link in soup.select('a[href]') if 'drillingPermitDetailAction' in web_link['href']]

    # NOTE(review): the results grid is taken to be the 10th table on the page;
    # this index has to be revisited if the page layout changes.
    df_main = pd.read_html(StringIO(post_resp.text))[9]
    # Row 1 holds the column titles; rows 0 and 1 are not data.
    df_main.columns = df_main.iloc[1,:]
    df_main = df_main.drop([0,1]).reset_index(drop=True)

    df_edit = df_main.copy()
    # Every row gets the first whitespace-separated token of the first row's 'API NO.' cell.
    df_edit['API NO.'] = int(df_main['API NO.'].str.split()[0][0])
    df_edit['HREF'] = href

    # Tokens 1 and 3 of 'Status Date' are used as the submitted and approved dates.
    # reindex() yields an all-missing column when no row has an approval token,
    # where .loc[:, [1, 3]] would raise KeyError.
    status_tokens = df_edit['Status Date'].str.split(expand=True).reindex(columns=[1,3])
    df_edit['Submitted_Dt'] = pd.to_datetime(status_tokens[1])
    df_edit['Approved_Dt'] = pd.to_datetime(status_tokens[3])

    approved = df_edit['Approved_Dt']
    if approved.isna().all():
        raise ValueError('No approved permit found in the query results.')

    # idxmax() returns the first row holding the maximum, so permits that share
    # the latest approval date no longer make .item() fail.
    apprvd_Dt_max_idx = int(approved.idxmax())
    max_apprvd_Dt = approved.max()
    href_link = df_edit.loc[apprvd_Dt_max_idx,'HREF']

    return df_main, df_edit, href_link, max_apprvd_Dt, apprvd_Dt_max_idx


def get_data_W1_Form(url_href):
    """
    Download the permit detail page behind a link from the search results.

    Parameters
    ----------
    url_href : TYPE, Str
        DESCRIPTION. URL from parse_query_data()

    Returns
    -------
    Response.
    """
    # requests.get opens and closes a throwaway session for this single request.
    # verify=False: TLS certificate verification is switched off for the call.
    return requests.get(url=url_href,headers=get_headers(),verify=False)


def parse_W1_Form(get_resp):
    """
    Pull the 'Horizontal Wellbore' and 'Acres' fields out of a detail page.

    Parameters
    ----------
    get_resp : TYPE, Requests Response
        DESCRIPTION. get response from get_data_W1_Form()

    Returns
    -------
    DataFrame
        One row whose columns are the labels that were found; an empty
        DataFrame when neither label is present. If a label occurs more
        than once, the last occurrence wins.
    """
    wanted_labels = ('Horizontal Wellbore', 'Acres')

    soup = BeautifulSoup(get_resp.text,'html.parser')

    data_dict = {}

    for table in soup.find_all('table',class_='GroupBox1'):
        # find_all() always returns a list (possibly empty), so no None check is needed.
        # Header and data cells are paired by their position within the table.
        for th, td in zip(table.find_all('th'), table.find_all('td')):
            if th.text in wanted_labels:
                data_dict[th.text] = td.text

    return pd.DataFrame.from_dict(data_dict, orient='index').T

#### Main

In [15]:
api_list = df_api['API_10'].tolist()

# Seconds to wait after each well before the next request is sent.
REQUEST_DELAY_SECONDS = 2

dfs_main = []
dfs_edit = []
# APIs whose permit query returned no response; review or re-run these separately.
failed_apis = []

# Define Headers
headers = get_headers()

for api in api_list:

    payload_api = payload(api=api)

    response_from_DrillingPermit_Query = get_data_DrillingPermitQuery(req_url=query_url,headers=headers,payload=payload_api)

    # The query helper returns None when the request fails. Parsing None would
    # raise and abort the whole run, so record the API and carry on.
    if response_from_DrillingPermit_Query is None:
        failed_apis.append(api)
        time.sleep(REQUEST_DELAY_SECONDS)
        continue

    df_mainPage, df_edit, w1_link, Approved_Dt, idx_Max_Aprvd_Dt = parse_query_data(post_resp=response_from_DrillingPermit_Query)

    response_from_W1_query = get_data_W1_Form(url_href=w1_link)

    df_w1 = parse_W1_Form(get_resp=response_from_W1_query)
    # Tag the detail fields with the latest approval date so the merge below
    # attaches them only to the permit row(s) carrying that date.
    df_w1['Approved_Dt'] = Approved_Dt

    df_edit_merge_with_w1 = df_edit.merge(df_w1,on='Approved_Dt',how='left').reset_index(drop=True)

    dfs_edit.append(df_edit_merge_with_w1)
    dfs_main.append(df_mainPage)

    time.sleep(REQUEST_DELAY_SECONDS)

if failed_apis:
    print(f'{len(failed_apis)} API(s) returned no query response: {failed_apis}')



In [23]:
# Stack the per-well permit tables (with the merged detail fields) and write them to one workbook.
results_w1 = pd.concat(dfs_edit, ignore_index=True)
results_w1.to_excel("Results_W1.xlsx", index=False)

In [22]:
# Stack the search-result tables collected for every well and write them to a second workbook.
results_main = pd.concat(dfs_main)
results_main.to_excel("Results_mainWebpage.xlsx", index=False)