In [15]:
# import packages that we need in action
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, WebDriverException, StaleElementReferenceException
import time
from selenium.webdriver.common.by import By
import math
import csv
import os

In [17]:
# --- scraper configuration (shared by the cells below) ---

# Page that contains the query table to scrape.
url = "https://www.tjk.org/TR/YarisSever/Query/Page/AtIstatistikleri"

# Rows the site adds per page load; treated as (almost) constant.
number_of_rows_in_page = 50

# Number of "show-more" presses performed inside a single epoch.
number_of_clicks = 50

# Prefix of the CSV file written at the end of every epoch.
csv_filename = "all_data_in_epoch_no_"

# Value written into the PageNumber input when an epoch starts; begins at "1".
last_page_value = "1"

In [19]:
def create_selenium_driver(
    firefox_binary='C:\\Program Files\\Mozilla Firefox\\firefox.exe',
    geckodriver_path='C:\\Program Files\\GeckoDriver\\geckodriver.exe',
    headless=False,
):
    """
    Build and return a Firefox WebDriver instance.

    Parameters:
    - firefox_binary: Path to the Firefox executable. Defaults to the
      standard Windows install location; override it on other machines.
    - geckodriver_path: Path to the geckodriver executable.
    - headless: When True, Firefox is started with the "-headless" argument.
      The default (False) opens a visible browser window.

    Returns:
    - The newly created selenium Firefox WebDriver. The caller is
      responsible for calling quit() on it.
    """
    # Set up Firefox options
    options = Options()
    if headless:
        # The Options.headless setter is deprecated in recent Selenium 4
        # releases; passing the command-line argument is the supported way.
        options.add_argument("-headless")
    options.binary_location = firefox_binary
    # NOTE(review): "--start-maximized" is a Chromium switch; confirm that
    # Firefox honours it, otherwise call driver.maximize_window() instead.
    options.add_argument("--start-maximized")  # To start browser maximized

    # Point Selenium at the geckodriver executable
    service = Service(geckodriver_path)

    # Create the WebDriver instance and return it
    return webdriver.Firefox(service=service, options=options)


In [21]:
def get_number_of_epochs(driver,url,number_of_clicks,rows_per_page):
    """
    Open the page, read the total result count and derive the epoch plan.

    The count is taken from the second whitespace-separated word of the
    text found in the 'hidable' row of the query table (for example
    "Toplam 64614 ..."). Thousands separators ('.' or ',') inside that
    word are ignored.

    Parameters:
    - driver: The Selenium WebDriver instance.
    - url: Address of the page that holds the query table.
    - number_of_clicks: Pages loaded per epoch; must be positive.
    - rows_per_page: Rows the site shows per page; must be positive.

    Returns:
    - (epoch_size, last_page): epoch_size is the number of full epochs
      (total_pages // number_of_clicks); last_page is 1 when pages remain
      after those full epochs and 0 otherwise.
    - None when the arguments are not positive or the counter text cannot
      be parsed; a message describing the problem is printed.
    """
    # Reject values that would otherwise end in a ZeroDivisionError below
    if number_of_clicks <= 0 or rows_per_page <= 0:
        print("number_of_clicks and rows_per_page must be positive.")
        return None

    # Open the web page
    driver.get(url)

    # Let element lookups retry for up to 10 seconds before failing
    driver.implicitly_wait(10)

    # Fixed 5 second pause so the table can finish rendering
    time.sleep(5)

    # Use XPath to find the innermost div inside the td in tr with class 'hidable'
    xpath = "//table[@id='queryTable'][@class='tablesorter']//tbody//tr[@class='hidable']//td//div//div"
    inner_div = driver.find_element(By.XPATH, xpath)

    # Get and print the text content of the div
    text_content = inner_div.text
    print("Original text content:", text_content)

    # The count is expected to be the second word of the text
    words = text_content.split()
    if len(words) < 2:
        print("The text does not contain enough words.")
        return None  # Early return if text format is incorrect

    # Drop thousands separators so "64.614" / "64,614" parse as 64614
    count_token = words[1].replace(".", "").replace(",", "")
    try:
        total_tuples = int(count_token)
    except ValueError:
        print("The second word is not a valid integer.")
        return None  # Early return in case of error
    print("Total number of tuples:", total_tuples)

    # Calculate the total number of pages
    total_pages = math.ceil(total_tuples / rows_per_page)
    print("Total number of pages:", total_pages)

    # Full epochs, plus a flag telling whether a partial batch is left over
    epoch_size, leftover_pages = divmod(total_pages, number_of_clicks)
    last_page = 1 if leftover_pages != 0 else 0

    return (epoch_size, last_page)

In [23]:
def click_show_more(driver, number_of_clicks, pause_seconds=10, timeout=10):
    """
    Clicks the 'Show More' button repeatedly to load more rows.

    The loop stops early, after printing a message, as soon as any
    WebDriverException is raised while waiting for or clicking the button.
    Selenium's TimeoutException derives from WebDriverException, so a button
    that never becomes clickable also ends the loop.

    Parameters:
    - driver: The Selenium WebDriver instance.
    - number_of_clicks: Number of times to click the 'Show More' button.
    - pause_seconds: Seconds to sleep after each click so the new rows can
      load (default 10).
    - timeout: Seconds to wait for the button to become clickable before
      giving up (default 10).

    Returns:
    - The number of clicks that were actually performed.
    """
    wait = WebDriverWait(driver, timeout)

    # The locator never changes, so build it once outside the loop.
    # It targets the button inside the last tbody of the query table.
    show_more_button_xpath = "//table[@id='queryTable'][@class='tablesorter']//tbody[last()]//tr[@class='hidable']//td//div//form//button[@class='show-more']"

    successful_clicks = 0
    for i in range(number_of_clicks):
        try:
            # Wait until the button is clickable
            show_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)))

            # Click the button
            show_more_button.click()
            successful_clicks += 1

            # Wait a short while for the new rows to load
            time.sleep(pause_seconds)

            print(f"Clicked {i+1} times")

        # The specific handlers must precede WebDriverException, their base class
        except ElementClickInterceptedException:
            print("The last 'Show More' button was intercepted by another element. Perhaps an overlay is blocking it?")
            break
        except NoSuchElementException:
            print("Could not find the last 'Show More' button after it became clickable. It may have been removed from the DOM.")
            break
        except StaleElementReferenceException:
            print("The 'Show More' button reference became stale. The page might have been refreshed or updated.")
            break
        except WebDriverException as e:
            print(f"WebDriver encountered an error while trying to click the last 'Show More' button: {e}")
            break

    return successful_clicks

In [25]:
def extract_and_save_table_data(driver, filename):
    """
    Extracts data from the query table and saves it to a CSV file.

    Each data row starts with two derived columns: the href of the link in
    the row's first cell ("at_href") and the part of that href after its
    last '=' ("at_id"). The text of every cell of the row follows.

    Extraction is best-effort: a row that cannot be processed is reported
    and skipped, and errors while reading headers or writing the file are
    printed rather than raised.

    Parameters:
    - driver: The Selenium WebDriver instance.
    - filename: The name of the file where data will be saved.
    """
    table_xpath = "//table[@id='queryTable'][@class='tablesorter']"

    # The two derived columns are always present, so they stay in the header
    # row even when the table's own headers cannot be read.
    headers = ["at_href", "at_id"]
    try:
        header_elements = driver.find_elements(By.XPATH, f"{table_xpath}//thead//tr//th")
        # Collect first, then append, so a failure midway adds nothing partial
        column_titles = [header.text for header in header_elements]
        headers += column_titles
    except Exception as e:
        print(f"Error extracting headers: {e}")

    # Extract href values and contents from rows
    data = []
    try:
        tbody_elements = driver.find_elements(By.XPATH, f"{table_xpath}//tbody")
        for tbody in tbody_elements:
            # Only rows whose class contains 'even' or 'odd' are data rows
            rows = tbody.find_elements(By.XPATH, ".//tr[contains(@class, 'even') or contains(@class, 'odd')]")
            for row in rows:
                try:
                    # Extract the href and the part after its last '='
                    anchor_tag = row.find_element(By.XPATH, ".//td[1]//a")
                    href_value = anchor_tag.get_attribute("href")
                    last_part = href_value.split('=')[-1]

                    # Derived columns first, then the text of every cell
                    row_data = [href_value, last_part]
                    row_data.extend(td.text for td in row.find_elements(By.XPATH, ".//td"))

                    data.append(row_data)
                except Exception as e:
                    print(f"Error processing row: {e}")
    except Exception as e:
        print(f"Error during extraction: {e}")

    # Save data to CSV file; utf-8-sig writes a BOM at the start of the file
    try:
        with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            writer.writerow(headers)  # Write headers
            writer.writerows(data)    # Write data rows
    except Exception as e:
        print(f"Error saving data to CSV: {e}")
    else:
        # Report success only when the file was actually written
        print(f"Data has been saved to '{filename}'.")

In [None]:
# main function

# Find the epoch size with a short-lived browser session. The session is
# always closed afterwards; every epoch below opens its own browser.
driver = create_selenium_driver()
try:
    epoch_info = get_number_of_epochs(driver,url,number_of_clicks,number_of_rows_in_page)
finally:
    driver.quit()

# get_number_of_epochs returns None when the result counter cannot be parsed;
# stop with a clear message instead of failing on the tuple unpacking.
if epoch_info is None:
    raise RuntimeError("Could not determine the number of epochs; see the messages printed above.")

epoch_size, last_page = epoch_info
print(f'{epoch_size}, {last_page}')

# last_page is 1 when pages remain after the full epochs, so one extra epoch
# is run to collect them; click_show_more stops by itself once the button
# can no longer be clicked.
total_epochs = epoch_size + last_page

# NOTE: last_page_value is module-level state that carries the position from
# one epoch to the next. Re-run the configuration cell to start again from "1".
for epoch in range(total_epochs): # replace total_epochs with a constant for a quick test

    # Record the start time
    start_time = time.time()

    driver = create_selenium_driver()
    try:
        driver.get(url)
        # Let element lookups retry for up to 5 seconds before failing
        driver.implicitly_wait(5)
        # Fixed 5 second pause so the table can finish rendering
        time.sleep(5)

        # Click the header (anchor tag) with name="AtAdi" to sort the data
        try:
            # Use XPath to locate the "AtAdi" header anchor tag
            header_xpath = "//table[@id='queryTable'][@class='tablesorter']//thead//tr//th//a[@name='AtAdi']"
            header_link = driver.find_element(By.XPATH, header_xpath)

            # Click the header link to trigger the sorting mechanism
            header_link.click()

            # Wait for a short time for the page to refresh and sort the data
            time.sleep(5)  # Adjust this time if needed based on page load speed
            print("Clicked the 'AtAdi' header for sorting.")
        except Exception as e:
            # Best effort: the epoch continues with the current ordering
            print(f"Error clicking the header for sorting: {e}")

        # Locate the input element with name="PageNumber"
        page_number_input = driver.find_element(By.XPATH, "//table[@id='queryTable'][@class='tablesorter']//tbody//tr[@class='hidable']//td//div//form//input[@name='PageNumber']")

        # Update its value attribute. The value is handed over as a script
        # argument rather than spliced into the JavaScript source text.
        driver.execute_script("arguments[0].setAttribute('value', arguments[1])", page_number_input, str(last_page_value))
        time.sleep(5)  # Adjust this time if needed based on page load speed

        # Click the show-more button several times to load the batch of data in the same page
        click_show_more(driver, number_of_clicks)
        time.sleep(5)

        # Find the 'PageNumber' input inside the last tbody of the table
        page_number_input_xpath = "//table[@id='queryTable'][@class='tablesorter']//tbody[last()]//tr[@class='hidable']//td//div//form//input[@name='PageNumber']"
        page_number_input = driver.find_element(By.XPATH, page_number_input_xpath)

        # Remember its value; the next epoch writes it back into the form
        last_page_value = page_number_input.get_attribute("value")

        # Print the last page value for debugging
        print(f"Last page value: {last_page_value}")

        # Define a unique filename for each batch; one CSV file is created and kept per epoch.
        # (csv_filename already ends with '_', so the name contains a double underscore.)
        filename = f'{csv_filename}_{epoch + 1}.csv'

        # Extract and save the table data for this batch
        extract_and_save_table_data(driver, filename)
        print(f"Epoch Number : {epoch + 1} of {total_epochs} done, check out the csv file @ Users_emerg")
    finally:
        # Close the WebDriver even when the epoch fails part-way
        driver.quit()

    # Record the stop time
    stop_time = time.time()

    # Compute the elapsed time in seconds
    elapsed_time = stop_time - start_time

    # Print the elapsed time for debugging
    print(f"Elapsed time for this epoch: {elapsed_time} seconds")

Original text content: Toplam 64614 sonuçtan 50 tanesi gösteriliyor
Total number of tuples: 64614
Total number of pages: 1293
25, 1
Clicked the 'AtAdi' header for sorting.
Clicked 1 times
Clicked 2 times
Clicked 3 times
Clicked 4 times
Clicked 5 times
Clicked 6 times
Clicked 7 times
Clicked 8 times
Clicked 9 times
Clicked 10 times
Clicked 11 times
Clicked 12 times
Clicked 13 times
Clicked 14 times
Clicked 15 times
Clicked 16 times
Clicked 17 times
Clicked 18 times
Clicked 19 times
Clicked 20 times
Clicked 21 times
Clicked 22 times
Clicked 23 times
Clicked 24 times
Clicked 25 times
Clicked 26 times
Clicked 27 times
Clicked 28 times
Clicked 29 times
Clicked 30 times
Clicked 31 times
Clicked 32 times
Clicked 33 times
Clicked 34 times
Clicked 35 times
Clicked 36 times
Clicked 37 times
Clicked 38 times
Clicked 39 times
Clicked 40 times
Clicked 41 times
Clicked 42 times
Clicked 43 times
Clicked 44 times
Clicked 45 times
Clicked 46 times
Clicked 47 times
Clicked 48 times
Clicked 49 times
Clic