In [48]:
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def setup_driver(url, load_wait=0.3):
    """Start a Chrome webdriver, navigate to *url* and return the driver.

    Args:
        url: Address handed to ``driver.get``.
        load_wait: Seconds to sleep after navigation so the initial content
            can render. Defaults to 0.3.

    Returns:
        The Chrome webdriver instance, positioned on *url*. The caller owns
        it and is responsible for calling ``quit()``.

    Raises:
        Whatever ``driver.get`` raises; the browser is shut down first so a
        failed navigation does not leave a Chrome process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except BaseException:
        # The caller never receives the driver on failure, so nobody else
        # can close it; clean up here and let the error propagate.
        driver.quit()
        raise
    time.sleep(load_wait)  # Wait for initial load
    return driver

def get_title(card):
    """Extract the title text from a result card element.

    Args:
        card: Element exposing ``find_element`` (a result card).

    Returns:
        The title text with surrounding whitespace stripped, or ``None`` when
        the title element is missing from the card or the card has gone stale.
    """
    try:
        title_element = card.find_element(By.CSS_SELECTOR, ".Result_resultTitle__Lt8Y6 a span")
        return title_element.text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        # Only "element not there (any more)" counts as "no title"; anything
        # else (Ctrl-C, a dead browser session, a typo) must surface.
        return None

def get_link(card):
    """Extract the technology link from a result card element.

    The anchor's ``href`` attribute is read through the webdriver, e.g.
    https://inventions.techventures.columbia.edu/technologies/exogenous-rac1--CU23206

    Args:
        card: Element exposing ``find_element`` (a result card).

    Returns:
        The value of the anchor's ``href`` attribute, or ``None`` when the
        anchor is missing from the card or the card has gone stale.
    """
    try:
        link_element = card.find_element(By.CSS_SELECTOR, ".Result_resultTitle__Lt8Y6 a")
        return link_element.get_attribute("href")
    except (NoSuchElementException, StaleElementReferenceException):
        # Only "element not there (any more)" counts as "no link"; anything
        # else (Ctrl-C, a dead browser session, a typo) must surface.
        return None
    
def get_id(card):
    """Extract the technology ID from a result card element.

    The ID is the text of the card's ``.md-up`` element, e.g.
    ``<div class="md-up">CU23375</div>`` yields ``"CU23375"``.

    Args:
        card: Element exposing ``find_element`` (a result card).

    Returns:
        The ID text with surrounding whitespace stripped, or ``None`` when
        the element is missing from the card or the card has gone stale.
    """
    try:
        id_element = card.find_element(By.CSS_SELECTOR, ".md-up")
        return id_element.text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        # Only "element not there (any more)" counts as "no ID"; anything
        # else (Ctrl-C, a dead browser session, a typo) must surface.
        return None

def get_items_from_page(driver, max_scrolls=50, scroll_pause=0.1):
    """Scroll through the page and collect every unique result card.

    The page is scrolled in steps of three PAGE_DOWN key presses. After each
    step all result cards currently in the DOM are read. Scrolling stops once
    a step leaves the number of cards unchanged, or after *max_scrolls* steps.

    Args:
        driver: Webdriver already positioned on the listing page.
        max_scrolls: Upper bound on scroll steps. Defaults to 50.
        scroll_pause: Seconds to wait after each scroll step so newly loaded
            content can appear. Defaults to 0.1.

    Returns:
        A list of ``(title, link, id)`` tuples in first-seen order, without
        repeats. Cards without a title are skipped; ``link`` and ``id`` may
        be ``None`` when the corresponding element is absent.
    """
    body = driver.find_element(By.TAG_NAME, "body")
    prev_count = 0
    # Every scroll step re-reads *all* cards in the DOM, so the same card is
    # seen on several passes; track what has been recorded to keep one copy.
    seen = set()
    items = []

    for _ in range(max_scrolls):
        # Scroll down
        for _ in range(3):
            body.send_keys(Keys.PAGE_DOWN)
        time.sleep(scroll_pause)  # Wait for content to load after scrolling

        current_cards = driver.find_elements(By.CSS_SELECTOR, ".Result_resultCard__iJcI0")

        for card in current_cards:
            title = get_title(card)
            if not title:
                continue
            item = (title, get_link(card), get_id(card))
            if item not in seen:
                seen.add(item)
                items.append(item)

        # No new cards appeared after this step: assume the end was reached.
        current_count = len(current_cards)
        if current_count == prev_count:
            break
        prev_count = current_count

    return items

def print_results(results):
    """Print a count line followed by one line per collected record.

    Args:
        results: Sequence of ``(title, link, id)`` triples.
    """
    total = len(results)
    print(f"Total unique titles collected: {total}")
    for record in results:
        title, link, item_id = record
        print(title, link, item_id)

def main(url):
    """Scrape the listing page at *url* and print the collected records.

    A browser is opened on *url*, the page is scrolled and harvested, and the
    results are printed. The browser is always shut down afterwards, even if
    harvesting or printing fails.

    Args:
        url: Address of the listing page to scrape.
    """
    browser = setup_driver(url)

    try:
        print_results(get_items_from_page(browser))
    finally:
        browser.quit()

if __name__ == "__main__":
    # Entry point: scrape the categories listing page.
    main(url="https://inventions.techventures.columbia.edu/categories")

Total unique titles collected: 20
Exogenous Rac1 analogs to treat acute lung injury https://inventions.techventures.columbia.edu/technologies/exogenous-rac1--CU23206 CU23206
Orally administered drug with extended-release for obesity treatment https://inventions.techventures.columbia.edu/technologies/orally-administered--CU23375 CU23375
Targeting glial cells for treating myelofibrosis https://inventions.techventures.columbia.edu/technologies/targeting-glial--CU23243 CU23243
Graphene-based optical modulator for low-power, high-sensitivity cryogenic data transmission https://inventions.techventures.columbia.edu/technologies/graphene-based--CU16109 CU16109
Annexin A1 monoclonal antibodies for colorectal and chemo-resistant cancer therapy https://inventions.techventures.columbia.edu/technologies/annexin-a1--CU21239 CU21239
Robotic traction head-neck brace for improved mobility https://inventions.techventures.columbia.edu/technologies/robotic-traction--CU22116 CU22116
Gene silencing for the 

In [None]:
# Run the scraper once per listing page; the page count is hard-coded.
max_pages = 158
for page in range(1, max_pages + 1):
    print(f"Scraping page {page}")
    # NOTE(review): main() forwards its argument to driver.get(), which needs
    # a URL string, but `page` is an int here. The page number has to be
    # turned into a page URL first; the site's pagination URL format is not
    # shown in this file. TODO confirm and build the URL.
    # Each main() call also starts and quits its own Chrome instance.
    main(page)