In [None]:
# noema urls

import requests
from bs4 import BeautifulSoup

# Step 1: Get the sitemap content
url = 'https://www.noemamag.com/wpm-article-sitemap.xml'

# Add headers to make the request look like it's coming from a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    # Step 2: Parse the XML using BeautifulSoup with the 'xml' parser
    soup = BeautifulSoup(response.content, 'xml')
    
    # Step 3: Find all <loc> tags in the XML and get their text
    links = [loc.text for loc in soup.find_all('loc')]
    
    # Step 4: Write the URLs to a CSV file
    with open('noema_all_urls.csv', 'w') as f:
        for link in links:
            f.write(link + '\n')

    print(f"{len(links)} URLs successfully written to 'noema_all_urls.csv'")
else:
    print("Failed to retrieve the sitemap. Status Code:", response.status_code)


In [None]:
## zora urls

import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Set up the Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run Chrome in headless mode for better performance
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

try:
    # Open the website
    driver.get('https://zine.zora.co/')

    # Wait for the main feed posts container to load completely
    wait = WebDriverWait(driver, 10)
    feed_posts_locator = (By.CLASS_NAME, 'Feed_posts__X6dC4')  # Use the class name for the container
    wait.until(EC.presence_of_element_located(feed_posts_locator))

    # Click the "more articles" button until it's no longer available
    while True:
        try:
            # Wait for and click the "more articles" button
            more_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/div/div[2]/section[3]/footer/button')))
            more_button.click()
            time.sleep(2)  # Short delay to allow the articles to load
        except:
            # Break the loop if the button is no longer found or clickable
            break

    # Get the full page source after loading all articles
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Extract all article URLs from the feed posts container
    article_links = []
    feed_posts_section = soup.find('div', class_='Feed_posts__X6dC4')

    if feed_posts_section:
        for article in feed_posts_section.find_all('a', href=True):
            href = article['href']
            # Convert relative URLs to absolute URLs if necessary
            if href.startswith('/'):
                href = f'https://zine.zora.co{href}'
            if href not in article_links:
                article_links.append(href)

    # Save the extracted URLs to a CSV file
    with open('zora_zine_urls.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Article URL'])
        for link in article_links:
            writer.writerow([link])

    # Print the extracted URLs
    for link in article_links:
        print(link)

finally:
    # Close the driver
    driver.quit()


In [None]:
# anarchist library urls

import requests
from bs4 import BeautifulSoup
import csv
import random

# Number of pages to scrape and total pages available
total_pages = 1007
pages_to_scrape = 60

# Generate 60 random unique page numbers from 1 to 1007
random_pages = random.sample(range(1, total_pages + 1), pages_to_scrape)

# List to store all article URLs
article_links = []

# CSV file to save the URLs
csv_filename = 'anarchist_library_urls.csv'

# Initialize the CSV file
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL'])

# Function to extract article links from a given page URL
def extract_article_links(page_url):
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {page_url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all article links using the provided CSS selector
    article_anchors = soup.select(
        '#widepage > div.amw-listing-page-body > div > div > div > div:nth-child(1) > a'
    )

    # Save extracted URLs to the CSV file
    with open(csv_filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for anchor in article_anchors:
            href = anchor['href']
            # Convert relative URLs to absolute URLs if necessary
            if href.startswith('/'):
                href = f"https://theanarchistlibrary.org{href}"
            if href not in article_links:
                article_links.append(href)
                writer.writerow([href])

# Loop through each randomly selected page and scrape the URLs
for page_number in random_pages:
    page_url = f"https://theanarchistlibrary.org/latest/{page_number}"
    print(f"Scraping page: {page_url}")
    extract_article_links(page_url)

# Print a summary of the results
print(f"\nSaved {len(article_links)} article URLs to '{csv_filename}'.")


In [None]:
## baffler urls

import requests
from bs4 import BeautifulSoup
import lxml.etree as ET
import csv

# Base URL for the articles
base_url = "https://thebaffler.com/latest"
article_links = []

def extract_article_links(page_url):
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {page_url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all article links using the provided CSS selector
    article_anchors = soup.select(
        '#main > div > div > article > div.sm\\:mt-4.md\\:mt-0.sm\\:w-1\\/2.md\\:w-2\\/5.flex.flex-col.items-center.sm\\:items-start.justify-around.sm\\:justify-start.text-center.sm\\:text-left > a'
    )

    for anchor in article_anchors:
        href = anchor['href']
        # Convert relative URLs to absolute URLs if necessary
        if href.startswith('/'):
            href = f"https://thebaffler.com{href}"
        if href not in article_links:
            article_links.append(href)

    # Convert BeautifulSoup object to lxml for XPath usage
    dom = ET.HTML(str(soup))
    
    # Use the provided XPath to find the "Older Posts" button
    next_page_button = dom.xpath('//*[@id="main"]/div/section[2]/nav/div/div/a')
    if next_page_button:
        next_page_url = next_page_button[0].get('href')
        if next_page_url.startswith('/'):
            return f"https://thebaffler.com{next_page_url}"
        return next_page_url
    return None

try:
    # Loop through all pages until no "Older Posts" button is found or manually stopped
    current_page = base_url
    while current_page:
        current_page = extract_article_links(current_page)

except KeyboardInterrupt:
    print("\nProcess interrupted! Saving collected data...")

finally:
    # Save the extracted URLs to a CSV file
    with open('baffler.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Article URL'])
        for link in article_links:
            writer.writerow([link])

    print(f"\nSaved {len(article_links)} article URLs to 'baffler.csv'.")



In [None]:
## emergence magazine urls

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(options=options)

try:
    # Open Emergence Magazine
    driver.get('https://emergencemagazine.org/')

    # Increase wait time to allow all elements to load
    wait = WebDriverWait(driver, 40)

    # Wait for the "Stories" button to be present in the DOM
    stories_button = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'c-header-menu__story-button')))
    
    # Scroll to the button to make sure it's visible
    driver.execute_script("arguments[0].scrollIntoView(true);", stories_button)
    time.sleep(2)  # Give a moment for scrolling to complete

    # Click the "Stories" button using JavaScript for reliability
    driver.execute_script("arguments[0].click();", stories_button)

    # Allow time for the stories to load after clicking the button
    time.sleep(5)

    # Extract all article links using a more general selector
    article_links_selector = 'div.c-story-overview__stories > a'
    article_links = driver.find_elements(By.CSS_SELECTOR, article_links_selector)

    # Initialize CSV file with header
    csv_filename = 'emergence_magazine_urls.csv'
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Article URL'])

        # Save the extracted URLs to the CSV file
        for link in article_links:
            url = link.get_attribute('href')
            writer.writerow([url])

    # Print the summary
    print(f"\nSaved {len(article_links)} article URLs to '{csv_filename}'.")

finally:
    # Close the WebDriver
    driver.quit()


In [None]:
## sapiens urls

import requests
from bs4 import BeautifulSoup
import csv

# URL of the Sapiens "All Articles" page
base_url = "https://www.sapiens.org/all-articles/"

# List to store all article URLs
article_links = []

# CSV file to save the URLs
csv_filename = 'sapiens_urls.csv'

# Initialize the CSV file with a header row
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL'])

# Function to extract article links from the page
def extract_article_links(page_url):
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {page_url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all article links using the revised CSS selector
    article_anchors = soup.select('a.c-article')

    # Save extracted URLs to the CSV file
    with open(csv_filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for anchor in article_anchors:
            href = anchor['href']
            # Convert relative URLs to absolute URLs if necessary
            if href.startswith('/'):
                href = f"https://www.sapiens.org{href}"
            if href not in article_links:
                article_links.append(href)
                writer.writerow([href])

# Extract article links from the main page
print(f"Scraping page: {base_url}")
extract_article_links(base_url)

# Print a summary of the results
print(f"\nSaved {len(article_links)} article URLs to '{csv_filename}'.")


In [None]:
## futuress magazine urls (had to save as html and then use beautiful soup to extract the links)

from bs4 import BeautifulSoup
import csv

# Load the saved HTML file
html_filename = 'futuress_stories.html'

# CSV file to save the URLs
csv_filename = 'futuress_urls.csv'

try:
    # Initialize CSV file with a header
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Article URL'])
    print(f"CSV file '{csv_filename}' initialized successfully.")
except Exception as e:
    print(f"Failed to initialize CSV file: {e}")
    exit()

try:
    # Read the HTML content from the saved file
    with open(html_filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Extract all article links
    article_links = soup.select('a[href*="/stories/"]')
    print(f"Found {len(article_links)} articles on the page.")

    # Collect URLs and save them to the CSV file
    new_urls = []
    for link in article_links:
        href = link.get('href')
        # Debugging: Output the href being processed
        print(f"Processing href: {href}")

        if href and href not in new_urls:
            # Adjust URL to include the full domain if needed
            full_url = f"https://futuress.org{href}" if href.startswith('/') else href
            new_urls.append(full_url)
            print(f"Collected URL: {full_url}")

    # Save the newly found URLs in bulk to the CSV file
    if new_urls:
        with open(csv_filename, mode='a', newline='') as file:
            writer = csv.writer(file)
            for url in new_urls:
                writer.writerow([url])
        print(f"Saved {len(new_urls)} new URLs to '{csv_filename}'.")
    else:
        print("No new URLs were collected.")

except Exception as e:
    print(f"Failed to extract article URLs: {e}")

print(f"\nSaved all article URLs to '{csv_filename}'.")


In [None]:
## mold urls

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import csv

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(options=options)

# CSV file to save the URLs
csv_filename = 'mold_urls.csv'
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Category', 'Article URL'])

# Step 1: Extract category links from the index page
index_url = 'https://thisismold.com/index'
driver.get(index_url)
time.sleep(2)  # Wait for page to load

# Get the page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
category_links = [a['href'] for a in soup.select('body > main > section > div > ul > li > a')]

# Step 2: Visit each category link and extract article links
for category_link in category_links:
    # Make sure to create full URL if the link is relative
    full_category_url = f"https://thisismold.com{category_link}" if category_link.startswith('/') else category_link
    
    # Open the category page
    driver.get(full_category_url)
    time.sleep(2)  # Wait for page to load

    # Get the page source and parse with BeautifulSoup
    category_soup = BeautifulSoup(driver.page_source, 'html.parser')
    article_links = [a['href'] for a in category_soup.select('div.archive-grid a')]

    # Save the category and article URLs to the CSV file
    with open(csv_filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for article_link in article_links:
            # Make sure to create full URL if the link is relative
            full_article_url = f"https://thisismold.com{article_link}" if article_link.startswith('/') else article_link
            writer.writerow([full_category_url, full_article_url])
            print(f"Saved URL: {full_article_url}")

# Close the WebDriver
driver.quit()
print(f"\nSaved all article URLs to '{csv_filename}'.")


In [None]:
## aeon urls

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from bs4 import BeautifulSoup

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(options=options)

# CSV file to save the URLs
csv_filename = 'aeon_pop_essay_urls.csv'
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL'])

try:
    # Open the Aeon essays page
    url = 'https://aeon.co/essays/popular'
    driver.get(url)
    print(f"Opened URL: {url}")

    # Wait for the page to load
    wait = WebDriverWait(driver, 20)

    previous_article_count = 0

    while True:
        # Give time for articles to load
        time.sleep(3)

        # Extract all article links currently visible on the page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_links = [a['href'] for a in soup.select('div.sc-c83e4c92-0.cFULRC a')]

        # Remove duplicates and save articles to CSV
        new_links = list(set(article_links))  # Remove duplicate links
        new_links = [f"https://aeon.co{link}" if link.startswith('/') else link for link in new_links]
        
        print(f"Found {len(new_links)} articles on page.")

        if len(new_links) == previous_article_count:
            print("No new articles loaded. Ending loop.")
            break

        # Save the newly found URLs in bulk to the CSV file
        with open(csv_filename, mode='a', newline='') as file:
            writer = csv.writer(file)
            for link in new_links:
                writer.writerow([link])
        print(f"Saved {len(new_links)} new URLs to '{csv_filename}'.")
        
        previous_article_count = len(new_links)

        # Click the "More" button to load more articles
        try:
            more_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.sc-70f865be-0.dfFAmq > button'))
            )
            driver.execute_script("arguments[0].click();", more_button)
            print("Clicked 'More' button.")
            time.sleep(2)  # Wait briefly for new content to load
        except Exception as e:
            print(f"No more 'More' button found or failed to click: {e}")
            break

finally:
    # Close the WebDriver
    driver.quit()
    print("WebDriver closed.")

print(f"\nSaved all article URLs to '{csv_filename}'.")


In [None]:
## asterisk mag urls

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
from bs4 import BeautifulSoup

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(options=options)

# CSV file to save the URLs
csv_filename = 'asterisk_mag.csv'
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL', 'Article Title'])

try:
    # Open the Asterisk articles page
    url = 'https://asteriskmag.com/articles'
    driver.get(url)
    print(f"Opened URL: {url}")

    # Wait for the page to load
    wait = WebDriverWait(driver, 20)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#articles > main > section.archive-index > ul > li')))

    # Extract all article links currently visible on the page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    article_links = soup.select('#articles > main > section.archive-index > ul > li > a')

    # Save the links and titles in the CSV
    with open(csv_filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for link in article_links:
            href = link['href']
            full_url = f"https://asteriskmag.com{href}" if href.startswith('/') else href
            # Clean title to remove extra whitespace and line breaks
            title = " ".join(link.text.split())
            writer.writerow([full_url, title])
            print(f"Saved URL: {full_url}")

finally:
    # Close the WebDriver
    driver.quit()
    print("WebDriver closed.")

print(f"\nSaved all article URLs to '{csv_filename}'.")


In [None]:
## palladium urls

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)

# CSV file to save the URLs
csv_filename = 'palladium_essays.csv'
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL', 'Article Title'])

try:
    url = 'https://www.palladiummag.com/'
    driver.get(url)
    print(f"Opened URL: {url}")

    wait = WebDriverWait(driver, 20)

    while True:
        # Wait for the articles to load
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'article.post')))

        # Extract all article links currently visible on the page
        articles = driver.find_elements(By.CSS_SELECTOR, 'article.post header.entry-header a')

        # Open CSV file to save URLs
        with open(csv_filename, mode='a', newline='') as file:
            writer = csv.writer(file)
            for article in articles:
                article_url = article.get_attribute('href')
                article_title = article.text

                # Skip URLs that are related to "Palladium" issues
                if "palladium" in article_title.lower():
                    continue

                writer.writerow([article_url, article_title])
                print(f"Saved URL: {article_url}")

        # Check if there is a "Next" button and click it
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers')
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)  # Wait for new page content to load
        except:
            print("No more pages to load. Ending loop.")
            break

finally:
    driver.quit()
    print("WebDriver closed.")

print(f"\nSaved all article URLs to '{csv_filename}'.")


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode to avoid opening the browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)

# CSV file to save the URLs
csv_filename = 'slatestarcodex_filtered.csv'
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Article URL', 'Article Title'])

try:
    url = 'https://slatestarcodex.com/archives/'
    driver.get(url)
    print(f"Opened URL: {url}")

    time.sleep(3)  # Give the page some time to load

    # Find all article links in the list
    articles = driver.find_elements(By.CSS_SELECTOR, '#pjgm-content ul li a')

    # Open CSV file to save filtered URLs
    with open(csv_filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for article in articles:
            article_url = article.get_attribute('href')
            article_title = article.text

            # Filter out unwanted articles
            if any(substring in article_title for substring in ["Open Thread", "Links", "Book Review"]):
                continue

            # Save the filtered article URL and title
            writer.writerow([article_url, article_title])
            print(f"Saved URL: {article_url}")

finally:
    driver.quit()
    print("WebDriver closed.")

print(f"\nSaved all filtered article URLs to '{csv_filename}'.")


In [1]:
from supabase import create_client, Client

# Initialize Supabase client
url = "https://hyxoojvfuuvjcukjohyi.supabase.co"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Imh5eG9vanZmdXV2amN1a2pvaHlpIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjgzMTU4ODMsImV4cCI6MjA0Mzg5MTg4M30.eBQ3JLM9ddCmPeVq_cMIE4qmm9hqr_HaSwR88wDK8w0"
supabase: Client = create_client(url, key)

# Fetch rows where title is exactly "Age of Invention"
response = supabase.table("urls_table").select("id").eq("title", "Age of Invention").execute()

# Extract the IDs of the rows to update
ids_to_update = [row['id'] for row in response.data]

# Update these rows, setting title to NULL
if ids_to_update:
    result = supabase.table("urls_table").update({"title": None}).in_("id", ids_to_update).execute()
    print(f"Updated {len(ids_to_update)} rows, setting 'Age of Invention' titles to NULL")
else:
    print("No rows found with title 'Age of Invention'")

# Optionally, you can print the IDs of updated rows
print("Updated row IDs:", ids_to_update)

Updated 22 rows, setting 'Age of Invention' titles to NULL
Updated row IDs: [1628, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1629, 1630, 1631, 1632, 1633]


In [4]:
import re
from supabase import create_client, Client
# Initialize Supabase client
url = "https://hyxoojvfuuvjcukjohyi.supabase.co"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Imh5eG9vanZmdXV2amN1a2pvaHlpIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjgzMTU4ODMsImV4cCI6MjA0Mzg5MTg4M30.eBQ3JLM9ddCmPeVq_cMIE4qmm9hqr_HaSwR88wDK8w0"
supabase: Client = create_client(url, key)

# Function to clean title and extract author names
def clean_title_and_author(title):
    author = None
    cleaned_title = title.strip()

    # Case 1: [Blog Name]: Essay Title
    if ": " in cleaned_title:
        parts = cleaned_title.split(": ", 1)
        cleaned_title = parts[1]  # Keep the part after the colon

    # Case 2: Essay Title - Author Name
    if " - " in cleaned_title:
        parts = cleaned_title.rsplit(" - ", 1)
        cleaned_title = parts[0].strip()
        author = parts[1].strip()

    # Case 3: Handling edge cases with "by" (Example: Essay Title by Author Name)
    if " by " in cleaned_title.lower():
        parts = re.split(r' by ', cleaned_title, flags=re.IGNORECASE)
        cleaned_title = parts[0].strip()
        author = parts[1].strip()

    # Handle other edge cases or formats if needed
    cleaned_title = cleaned_title.strip(' .-')  # Strip unwanted chars

    return cleaned_title, author

# Fetch rows from the Supabase table
def fetch_data():
    response = supabase.table('urls_table').select('*').execute()
    return response.data  # Correct way to access the data

# Update the table with cleaned titles and author names
def update_table(row_id, title, author):
    update_data = {'title': title}
    if author:
        update_data['author'] = author
    supabase.table('urls_table').update(update_data).eq('id', row_id).execute()

# Main function to clean and update the table
def clean_and_update_titles():
    rows = fetch_data()
    for row in rows:
        row_id = row['id']
        title = row['title']
        author = row.get('author', None)

        # Clean the title and extract the author
        cleaned_title, extracted_author = clean_title_and_author(title)

        # If an author was extracted and the author column is empty, use the extracted author
        if not author and extracted_author:
            author = extracted_author

        # Update the row with the cleaned data
        update_table(row_id, cleaned_title, author)
        print(f"Updated row {row_id}: title -> {cleaned_title}, author -> {author}")

# Run the cleaning process
clean_and_update_titles()

Updated row 1318: title -> Walking Through Beirut’s Emotions, author -> None
Updated row 1567: title -> Jailing Jihadists in the West, author -> None
Updated row 1569: title -> Maronite Christians and the Third Way, author -> None
Updated row 1572: title -> Qassem Soleimani, author -> None
Updated row 1573: title -> After Soleimani, author -> None
Updated row 1574: title -> Syria’s Famished Victory, author -> None
Updated row 1577: title -> Revolutionary Wit, author -> None
Updated row 1578: title -> New Voices of Rebellion Rise in Cuba, author -> None
Updated row 1581: title -> Requiem for a Revolution, author -> None
Updated row 1583: title -> It Felt Like Love, author -> None
Updated row 1584: title -> How Godless Arabs Changed Europe, author -> None
Updated row 1101: title -> ‘A Round of Applause’ Finds the Funny Side of Despondency, author -> None
Updated row 11: title -> The New Robber Barons, author -> None
Updated row 1565: title -> A Notorious Prison and Libya’s War of Memory,

In [None]:
import re
from supabase import create_client, Client

# Initialize Supabase client
url = "https://hyxoojvfuuvjcukjohyi.supabase.co"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Imh5eG9vanZmdXV2amN1a2pvaHlpIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjgzMTU4ODMsImV4cCI6MjA0Mzg5MTg4M30.eBQ3JLM9ddCmPeVq_cMIE4qmm9hqr_HaSwR88wDK8w0"
supabase: Client = create_client(url, key)

# Function to clean title and extract author names
def clean_title_and_author(title):
    author = None
    cleaned_title = title.strip()

    # Case 1: [Blog Name]: Essay Title
    if ": " in cleaned_title:
        parts = cleaned_title.split(": ", 1)
        cleaned_title = parts[1].strip()  # Keep the part after the colon

    # Case 2: Essay Title - Author Name
    if " - " in cleaned_title:
        parts = cleaned_title.rsplit(" - ", 1)
        cleaned_title = parts[0].strip()
        author = parts[1].strip()

    # Case 3: Handling edge cases with "by" (Example: Essay Title by Author Name)
    if " by " in cleaned_title.lower():
        parts = re.split(r' by ', cleaned_title, flags=re.IGNORECASE)
        cleaned_title = parts[0].strip()
        author = parts[1].strip()

    # Handle other edge cases or formats if needed
    cleaned_title = cleaned_title.strip(' .-')  # Strip unwanted chars

    return cleaned_title, author

# Fetch rows from the Supabase table
def fetch_data():
    response = supabase.table('urls_table').select('*').execute()
    return response.data  # Correct way to access the data

# Update the table with cleaned titles and author names
def update_table(row_id, title, author):
    update_data = {}
    if title:
        update_data['title'] = title
    if author:
        update_data['author'] = author
    if update_data:
        supabase.table('urls_table').update(update_data).eq('id', row_id).execute()

# Main function to clean and update the table
def clean_and_update_titles():
    rows = fetch_data()
    for row in rows:
        row_id = row['id']
        title = row['title']
        author = row.get('author', None)

        # Only clean rows where title exists and isn't NULL
        if title and title.strip() != "":
            cleaned_title, extracted_author = clean_title_and_author(title)

            # Update only if there's something to update
            if cleaned_title != title or (extracted_author and not author):
                # If the cleaned title is different or we extracted an author
                if not author and extracted_author:
                    author = extracted_author

                update_table(row_id, cleaned_title, author)
                print(f"Updated row {row_id}: title -> {cleaned_title}, author -> {author}")
            else:
                print(f"Skipped row {row_id}: No change needed")
        else:
            print(f"Skipped row {row_id}: Empty or NULL title")

# Run the cleaning process
clean_and_update_titles()
