In [17]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import json

# Setup ChromeDriver
chrome_options = Options()
# Additional options can be added if needed
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Define the URL to scrape
url = "https://www.readtangle.com/archive/"

# Navigate to the URL
driver.get(url)

# Wait for the page to load
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

# Click "Load more posts" button until all articles are loaded
while True:
    try:
        load_more_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.js-load-more'))
        )
        driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
        time.sleep(1)  # Let any animations finish
        load_more_button.click()
        time.sleep(2)  # Wait for more articles to load
    except (TimeoutException, NoSuchElementException, ElementNotInteractableException):
        print("No more 'Load more posts' button found or timeout reached.")
        break

articles_data = []

# Find all article links on the page
article_links = driver.find_elements(By.CSS_SELECTOR, 'article a')

# Function to scrape article data
def scrape_article_data(article_url):
    # Open new tab
    driver.execute_script("window.open('');")
    # Switch to the new tab
    driver.switch_to.window(driver.window_handles[1])
    driver.get(article_url)

    # Wait for the article to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

    # Scrape the article title and text sections here
    title = driver.find_element(By.CSS_SELECTOR, 'h1').text.strip()
    
    # Your existing function to get text after h3 until hr
    # Ensure get_texts_after_h3_until_hr function is defined outside the loop
    texts_by_section = {h3_id: get_texts_after_h3_until_hr(h3_id) for h3_id in h3_ids}

    # Close the current tab
    driver.close()
    # Switch back to the first tab
    driver.switch_to.window(driver.window_handles[0])

    return {"title": title, "texts_by_section": texts_by_section}

# IDs of the h3 elements to scrape texts for
h3_ids = [
    "todays-topic",
    "what-the-right-is-saying",
    "what-the-left-is-saying"
]

for link in article_links:
    article_url = link.get_attribute('href')
    article_data = scrape_article_data(article_url)
    articles_data.append(article_data)

    # No need to go back since we are not leaving the main page

# Process and save the scraped data
json_data = json.dumps(articles_data, indent=4)
print(json_data)

# Optionally, save to a file
# with open('articles_data.json', 'w') as f:
#     f.write(json_data)

# Close the driver
driver.quit()


No more 'Load more posts' button found or timeout reached.


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [22]:
with open('articles_data.json', 'w') as f:
    f.write(json_data)

In [52]:
# check we have all (most) of the expected titles (most recent & oldest few match up)
parsed_json = json.loads(json_data)

for e in parsed_json:
    print(e['title'])

Should we have a 32-hour workweek?
The SCOTUS social media case.
Chuck Schumer's rebuke of Netanyahu.
The Georgia election interference case.
The Sunday — March 17
How we plan to cover the 2024 election.
Robert Hur testifies to Congress.
Biden's 2025 budget proposal.
A "TikTok ban" is moving forward.
President Biden's State of the Union.
The Sunday — March 10
Your responses to 'The Zionist case for a ceasefire.'
Kyrsten Sinema retires.
Special edition: Super Tuesday results.
The Supreme Court reinstates Trump on Colorado's ballot.
SCOTUS will hear Trump's immunity case.
The Sunday — March 3
The Zionist case for a ceasefire.
The Michigan primary results.
Two years of war in Ukraine.
Trump wins South Carolina primary.
Alabama's IVF ruling.
The Sunday — February 25
How dangerous is porn, really?
The arrest of the Biden informant.
The Trump fraud ruling.
The Fani Willis testimony.
The Sunday — February 18
Joe Biden and Donald Trump are not the same
Democrats win New York's 3rd Congressiona

In [37]:
articles_data = json.loads(articles_data)

In [45]:
# Assuming `articles_data` is loaded from your JSON file
# with open('your_file.json', 'r') as f:
#     articles_data = json.load(f)

labeled_data = []

for article in articles_data:
    for section_key, paragraphs in article["texts_by_section"].items():
        # Combine all paragraphs into a single text, ensuring it reads smoothly.
        combined_text = " ".join(paragraphs).replace("\\u2019", "'").replace("\\u201c", "\"").replace("\\u201d", "\"")
        
        # Assign labels based on section_key
        if section_key == "todays-topic":
            label = "neutral"
        elif section_key == "what-the-right-is-saying":
            label = "right-leaning"
        elif section_key == "what-the-left-is-saying":
            label = "left-leaning"
        
        # Append combined text and label to labeled_data
        labeled_data.append({"text": combined_text, "label": label})

# Now, labeled_data is ready to be used for training. It's also clean and consolidated.
# Optionally, you can write this out to a file for inspection or further processing.
with open('cleaned_labeled_data.json', 'w') as file:
    json.dump(labeled_data, file, indent=4)


### Further Steps for Neural Network Training:

*   **Tokenization and Encoding**: Convert the text into a format understandable by the network, usually involving converting text to sequences of integers representing tokens or words.
    
*   **Splitting Data**: Divide your data into training, validation, and test sets to evaluate the performance of your model accurately.
    
*   **Neural Network Architecture**: Design your neural network architecture. For sentiment analysis, recurrent neural networks (RNNs) or transformers are common choices due to their effectiveness in handling sequential data like text.
    
*   **Training**: Train your neural network on the processed and labeled data.