In [None]:
import logging
import pandas as pd
import tkinter as tk
from tkinter import filedialog
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time
import os
import openpyxl
import random

# The file dialogs need a Tk root window; it is brought to the top and then
# hidden so that only the dialogs themselves are visible.
root = tk.Tk()
try:
    root.attributes("-topmost", True)
    root.withdraw()

    # Ask for the input Excel workbook.
    excel_file = filedialog.askopenfilename(title="Select the input Excel file",
                                            filetypes=[("Excel files", "*.xlsx")])
    if not excel_file:
        print("No file selected, exiting.")
        # SystemExit rather than exit(): exit() is a convenience object the
        # site module installs for interactive sessions, not for programs.
        raise SystemExit

    # Ask for the folder where the downloaded pages should be saved.
    output_folder = filedialog.askdirectory(title="Select the folder to save the data")
    if not output_folder:
        print("No folder selected, exiting.")
        raise SystemExit
finally:
    # Tear the hidden root window down on every path, including cancellation.
    root.destroy()

def perform_search(search_query, page_headers):
    """Search Patent Center for one query and save five result pages as HTML.

    Opens the Patent Center landing page in the module-level ``driver``,
    selects the "Application #" search type, submits ``search_query`` and
    saves the resulting page through ``save_page``. It then clicks four
    left-navigation links in turn and saves each page that opens.

    Args:
        search_query: Value typed into the search box.
        page_headers: Sequence of five file-name stems. Element 0 names the
            first result page; elements 1-4 name the pages opened by the four
            navigation links, in order.

    Returns:
        True when the search was submitted and the first page was saved;
        False when one of the search controls did not become available
        within 20 seconds and the query was abandoned.
    """
    # One random pause length per query, reused for every sleep below.
    delay = random.uniform(12, 20)

    # open the URL in the browser
    driver.get('https://patentcenter.uspto.gov/')

    search_type = "Application #"  # Change this to either "Application #" or "Patents #"

    wait = WebDriverWait(driver, 20)

    # `step` tracks which control is currently being waited for so that a
    # timeout can be reported with the matching message.
    step = "Search dropdown button"
    try:
        wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[@class="btn btn-primary dropdown-toggle"]'))
        ).click()

        step = "Search type element"
        wait.until(
            EC.element_to_be_clickable((By.XPATH, f'//a[contains(@class, "dropdown-item") and text()="{search_type}"]'))
        ).click()

        step = "Search input element"
        search_input = wait.until(
            EC.presence_of_element_located((By.ID, 'TxtBox_bibData_search_input'))
        )
        search_input.clear()
        search_input.send_keys(search_query)

        step = "Search button"
        wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'showSearch-btn'))
        ).click()
    except TimeoutException:
        print(f"Timeout: {step} not found")
        # Stop working on this query, but leave the browser running: the
        # driver is shared with the caller, and carrying on after quitting it
        # would only fail on the next WebDriver call.
        return False

    time.sleep(delay)

    # Save the first page as an HTML file
    save_page(page_headers[0])

    time.sleep(delay)

    # XPath for the next four pages
    xpaths = [
        '//*[@id="link_app-data-continuity"]',
        '//*[@id="link_app-data-foriegn-priority"]',
        '//*[@id="LeftNavLinks"]/li[8]',
        '//*[@id="link_app-data-assignments"]'
    ]

    # Save the next four pages; a link that never becomes clickable is
    # reported and skipped so the remaining pages are still attempted.
    for idx, xpath in enumerate(xpaths):
        try:
            next_page_element = wait.until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            next_page_element.click()
            time.sleep(delay)
            save_page(page_headers[idx + 1])
            time.sleep(delay)
        except TimeoutException:
            print(f"Timeout: Page {idx + 2} element not found")

    return True
    
def save_page(page_name):
    """Dump the browser's current page source to ``<page_name>.html``.

    The file is created (or overwritten) inside the module-level
    ``output_folder`` and written as UTF-8.

    Args:
        page_name: File-name stem for the saved page, without extension.
    """
    destination = os.path.join(output_folder, f"{page_name}.html")

    with open(destination, "w", encoding="utf-8") as html_file:
        html_file.write(driver.page_source)

# Hard-coded location of the Chrome executable handed to Selenium.
chrome_path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.binary_location = chrome_path

# create a new Chrome browser instance
driver = webdriver.Chrome(options=chrome_options)

try:
    # Read the Excel sheet
    wb = openpyxl.load_workbook(excel_file)
    sheet = wb.active

    # Loop through the rows in the Excel sheet (skipping the header row).
    # Column 1 holds the search query, columns 2-6 the five output file names.
    for row in range(2, sheet.max_row + 1):
        search_query = sheet.cell(row=row, column=1).value
        if search_query is None:
            # Empty cell (e.g. a blank row counted in max_row): there is
            # nothing to type into the search box, so skip the row.
            continue
        page_headers = [sheet.cell(row=row, column=i).value for i in range(2, 7)]

        # Perform the search and save the webpage for the current search query
        perform_search(search_query, page_headers)
finally:
    # Close the browser even if loading the workbook or a search raised.
    driver.quit()

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog

# The file dialogs need a Tk root window; it is brought to the top and then
# hidden so that only the dialogs themselves are visible.
root = tk.Tk()
try:
    root.attributes("-topmost", True)
    root.withdraw()

    # Ask for the input Excel workbook.
    url_file_path = filedialog.askopenfilename(title="Select the input Excel file",
                                               filetypes=[("Excel files", "*.xlsx")])
    if not url_file_path:
        print("No file selected, exiting.")
        # SystemExit rather than exit(): exit() is a convenience object the
        # site module installs for interactive sessions, not for programs.
        raise SystemExit

    # Ask for the folder that holds the previously saved HTML files.
    html_folder_path = filedialog.askdirectory(title="Select the folder containing the HTML files")
    if not html_folder_path:
        print("No folder selected, exiting.")
        raise SystemExit
finally:
    # Tear the hidden root window down on every path, including cancellation.
    root.destroy()

# The output is written next to the input workbook.
output_folder = os.path.dirname(url_file_path)

# Output columns: for each page type a file-name stem, the extracted text
# and a Success/Error flag.
column_names = [
    f"{page_name}_{suffix}"
    for page_name in ['Application', 'Continuity', 'Priority', 'Attorney', 'Assignment']
    for suffix in ['Name', 'Data', 'Status']
]

# Input workbook columns that hold the HTML file-name stems.
columns = ['{Application}', '{Continuity}', '{Priority}', '{Attorney}', '{Assignment}']

# Collect one dict per input row and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
parsed_rows = []

# Loop through the rows of the input workbook and parse the referenced HTML
for _, row in pd.read_excel(url_file_path).iterrows():
    data_row = {}
    for column in columns:
        name = row[column]
        file_path = os.path.join(html_folder_path, f'{name}.html')
        status = 'Success'  # set default status as success

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html = f.read()

            soup = BeautifulSoup(html, 'html.parser')

            # Extract the visible text, one text node per line.
            text = soup.get_text(separator='\n')
            # NOTE(review): the text returned by get_text() presumably no
            # longer contains raw tags, so this replace likely never
            # matches; kept to preserve the existing output.
            text = text.replace('<div>', '<div> ')
            text = text.replace('\n', ' \n')
        except Exception as e:
            # Deliberate best-effort: a missing or unreadable file is recorded
            # as an error for this page and processing continues.
            print(f"Error occurred while processing the file for {name}: {e}")
            text = ''
            status = 'Error'  # set status as error if there's an exception

        column_prefix = column.strip('{}')
        data_row[f"{column_prefix}_Name"] = name
        data_row[f"{column_prefix}_Data"] = text
        data_row[f"{column_prefix}_Status"] = status

    parsed_rows.append(data_row)

# Passing the columns explicitly keeps the column order and still yields the
# expected (empty) frame when the input workbook has no data rows.
df_data = pd.DataFrame(parsed_rows, columns=column_names)

# Write the parsed data to an Excel file in the same directory as the input file
output_file_path = os.path.join(output_folder, 'parsed_data.xlsx')
df_data.to_excel(output_file_path, index=False)
