In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

def setup_driver(silent=False):
    """Start a Chrome WebDriver session.

    Args:
        silent: When truthy, Chrome is launched with the ``--headless`` flag.

    Returns:
        The ``webdriver.Chrome`` instance, built with a default ``Service()``.
    """
    chrome_options = Options()
    if silent:
        chrome_options.add_argument("--headless")
    return webdriver.Chrome(service=Service(), options=chrome_options)

In [2]:
import requests
from bs4 import BeautifulSoup
import asyncio

In [3]:
def submission_url(id :int):
    """Return the programming.in.th page URL for the submission numbered ``id``."""
    return "https://programming.in.th/submissions/{}".format(id)

In [4]:
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By

def code_elementline_to_codeline(element: WebElement) -> str:
    """Rebuild one line of source text from a rendered code-line element.

    Collects every descendant ``<span>`` of ``element`` (XPath ``.//span``) and
    concatenates their ``.text`` values in the order Selenium returns them.
    """
    pieces = [span.text for span in element.find_elements(By.XPATH, './/span')]
    return ''.join(pieces)

def combined_code_element(elements: list[WebElement]) -> str:
    """Convert each code-line element to text and join the lines with newlines."""
    lines = [code_elementline_to_codeline(element) for element in elements]
    return '\n'.join(lines)

In [5]:
def extract_data_from_html(html: str, submission_id: int) -> dict:
    """Parse the HTML of a submission page into one flat record.

    Each field is located by the exact ``class`` string of its ``<p>`` element.
    A field whose element is absent is returned as ``""`` instead of raising,
    so a page that lacks one element still yields a (partial) row.

    Args:
        html: Page source of a submission page.
        submission_id: Stored unchanged under ``'SubmissionID'``.

    Returns:
        A dict with the keys ``'User'``, ``'SubmissionID'``, ``'ProblemURL'``,
        ``'Score'``, ``'Language'`` and ``'Source'``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def text_of(tag) -> str:
        # ``soup.find`` returns None when nothing matches.
        return tag.text.strip() if tag is not None else ""

    score = text_of(soup.find('p', class_='mt-2 text-xs text-gray-500 dark:text-gray-200'))

    # Several <p> elements share this class string; the second one is taken as
    # the language (the first is not part of the returned record).
    info_tags = soup.find_all('p', class_='text-center text-sm font-medium text-gray-500 dark:text-white')
    language = text_of(info_tags[1]) if len(info_tags) > 1 else ""

    # NOTE: stored under the key 'ProblemURL', but the value is the element's
    # text, not an href.
    problem_id = text_of(soup.find('p', class_='mt-1 h-5 w-full font-light dark:text-white'))

    # Guarded and stripped like the other fields: a page without the username
    # element produces "" rather than an IndexError.
    username = text_of(
        soup.find('p', class_="truncate text-center text-sm font-medium text-gray-500 dark:text-white")
    )

    # One <tr class="token-line"> per source line; its text is the
    # concatenation of every <span> found inside it.
    code_text = '\n'.join(
        ''.join(span.get_text() for span in line.find_all('span'))
        for line in soup.find_all('tr', class_='token-line')
    )

    return {
        'User': username,
        'SubmissionID': submission_id,
        'ProblemURL': problem_id,
        'Score': score,
        'Language': language,
        'Source': code_text
    }

In [6]:
from rich import print

async def read_submission(ids: list[int]):
    """Scrape the given submission IDs with one headless Chrome session.

    The blocking Selenium work runs in a worker thread via
    ``asyncio.to_thread`` so several calls can be gathered concurrently.

    Args:
        ids: Submission IDs to visit, in order. May be empty.

    Returns:
        A DataFrame with one row per successfully parsed submission. The
        columns 'SubmissionID', 'ProblemURL', 'Score', 'Language', 'Source'
        come first, followed by any other keys returned by
        ``extract_data_from_html``. IDs whose page fails to load or parse are
        reported and skipped.
    """
    base_columns = ['SubmissionID', 'ProblemURL', 'Score', 'Language', 'Source']

    def blocking_scrape():
        if not ids:
            # Nothing to do: avoid launching a browser and indexing ids[0].
            return pd.DataFrame(columns=base_columns)

        records = []
        # Created inside the worker thread so browser start-up does not block
        # the event loop, and so the same thread owns and closes the driver.
        driver = setup_driver(True)
        try:
            print(f"Start from {ids[0]} to {ids[-1]}")
            for submission_id in ids:
                try:
                    driver.get(submission_url(submission_id))
                    time.sleep(0.5)
                    # The code table is the last thing needed from the page;
                    # wait up to 5 s for its first row to be present.
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//tr[@class="token-line"]'))
                    )
                    records.append(extract_data_from_html(driver.page_source, submission_id))

                    if submission_id % 100 == 0:
                        print(submission_id, end=' ')

                except Exception as e:
                    # Best-effort scrape: report which ID failed and why, then
                    # move on. Only the exception class name is printed so the
                    # message cannot interfere with the [red] markup.
                    print(f"[red]Error at ID {submission_id}:[/red] {type(e).__name__}", end=' ')
        finally:
            # Always release the browser, even if something unexpected escapes.
            driver.quit()

        # Build the frame once instead of concatenating per row.
        df = pd.DataFrame(records)
        extra_columns = [col for col in df.columns if col not in base_columns]
        return df.reindex(columns=base_columns + extra_columns)

    return await asyncio.to_thread(blocking_scrape)

In [11]:
def get_batches(start_id, end_id, max_id=344951, num_batches=5):
    """Split the inclusive ID range ``[start_id, end_id]`` into consecutive batches.

    ``end_id`` is first capped at ``max_id``. The range is then cut into
    ``num_batches`` pieces of ``total // num_batches`` IDs each, with the last
    piece also taking the remainder. Pieces that would be empty (range shorter
    than ``num_batches``, or empty after capping) are omitted, so every
    returned batch contains at least one ID.

    Args:
        start_id: First ID of the range (inclusive).
        end_id: Last ID of the range (inclusive, before capping).
        max_id: Upper bound applied to ``end_id``.
        num_batches: Maximum number of batches to produce; must be >= 1.

    Returns:
        ``(batches, next_id)`` where ``batches`` is a list of non-empty lists
        of IDs and ``next_id`` is the capped ``end_id`` plus one.

    Raises:
        ValueError: If ``num_batches`` is less than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    end_id = min(max_id, end_id)
    total = end_id - start_id + 1
    if total <= 0:
        # The whole range lies above the cap (or was given reversed).
        return [], end_id + 1

    batch_size = total // num_batches
    batches = [
        list(range(start_id + i * batch_size, start_id + (i + 1) * batch_size))
        for i in range(num_batches - 1)
    ]
    # The final batch runs to end_id so it absorbs the division remainder.
    batches.append(list(range(start_id + (num_batches - 1) * batch_size, end_id + 1)))

    return [batch for batch in batches if batch], end_id + 1

In [None]:
from rich import print
from IPython.display import clear_output

# start_batches = [[130000,133000],[133001,136000],[165000,167000],[167001,169000],[170000,173000],[173001,175000],[175001,177500],[208000,210000]]
# start_batches = [[210001,213000],[213001,216000],[216001,219001],[50000,53000],[295000,298000],[53001,56001],[300000,330000],[220001,223000],[223001,226001]]
# [1000,5000],[5001,8000],[8001,10000],[10001,12000],[335000,339000],
start_batches = [[339001,341000],[344000,344951],[100000,103000],[103001,106000],[177501,177950],[266000,269000],[275000,769000]]
cnt = 23
print("test")
batch_size = 2000
for nbatch in start_batches[0:]:
    clear_output(True)
    print(f"Start new batches : {nbatch[0]} to {nbatch[1]}")
    batches, cur_id = get_batches(nbatch[0],nbatch[1])
    tasks = [read_submission(batch) for batch in batches]
    results = await asyncio.gather(*tasks)
    final_df = pd.concat(results, ignore_index=True)
    final_df.to_csv(f'./Raw/Programming/programmingin_{cnt}.csv',index=False)
    clear_output(True)
    print(f"Saved to ./Raw/Programming/programmingin_{cnt}.csv")
    time.sleep(120)
    cnt += 1
    if cnt % 2 == 0:
        print("Sleep for 3 minutes")
        time.sleep(180)