In [12]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Setup Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run headless Chrome, optional
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

In [None]:
!pip3 install selenium

In [None]:
"""
TimeOut LA
Discover LA
WE Like LA
Secret Los Angeles
KCRW Events
"""

"""
1. Calendar of Events
2. Evergreen Activities (Museums, hikes, etc)
"""

In [2]:
def pull_timeout_calendar(driver):

    # Function to scroll down the page
    def scroll_down(driver):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         driver.execute_script("window.scrollBy(0, 500);")
         new_height = driver.execute_script("return document.body.scrollHeight")
         return(new_height)
        
    def collect_titles(article_urls, driver):
        # Get articles on the page
        title_divs = driver.find_elements(By.XPATH, '//*[contains(@class, "title")]')
        print(len(title_divs),"title divs in DOM")
        added = 0
    
        # Collect URLs from within these divs
        for title in title_divs:
            links = title.find_elements(By.TAG_NAME, 'a')
            for link in links:
                href = link.get_attribute('href')
                if href and 'timeout.com/los-angeles/' in href:
                    if href not in article_urls:
                        article_urls.append(href)
                        added += 1
        print(added, "new articles")
    
        return(added)

    def pull_tout_event_info(driver, article_url):
        """
        takes in articule_url, outputs pandas DF
        """
        import re
        
        # c = 0
        # BREAK_AT = len(article_urls)
        # driver = webdriver.Chrome(options=chrome_options)
        
        event_title = []
        event_description = []
        event_time = []
        event_location = []
        event_price = []
        event_urls = []
        
        for event_url in article_urls:
            print(event_url)
            driver.get(event_url)
            time.sleep(2)
        
            title = event_url.split('/')[-1].replace("-"," ").title()
            # event_title.append(title)
            
            try:
                details_sections = driver.find_element(By.CSS_SELECTOR, 'section[data-section-name="review"]')
                descr_txt = details_sections.text
            except:
                descr_txt = "None."
            try:
                occurrence_section = driver.find_element(By.CSS_SELECTOR, 'section[data-section-name="occurrences"]')
                zones = occurrence_section.find_element(By.CLASS_NAME, 'zoneItems')
                child_divs = zones.find_elements(By.XPATH, './div')
                timestamps = []
                venues = []
                prices = []
                for entry in child_divs:
                    time_element = entry.find_element(By.TAG_NAME, 'time')
                    timestamp = time_element.get_attribute('datetime') 
                    timestamps.append(timestamp)
        
                    try:
                        venue = entry.find_element(By.XPATH, './/span[contains(@class, "venueName")]/a').text
                        venues.append(venue)
                    except:
                        venues.append("unknown")
        
                    try:
                        price = entry.find_element(By.XPATH, './/div[contains(@class, "price")]').text
                        prices.append(price)
                    except:
                        prices.append("unknown")
            except:
                timestamps = ["unknown"]; venues = ["unknown"]; prices = ["unknown"]
                
                
            for i in range(len(timestamps)):
                if len(timestamps) > 1:
                    event_title.append(title+f" (Day {i+1}")
                else:
                    event_title.append(title)
                event_time.append(timestamps[i])
                event_location.append(venues[i])
                event_price.append(prices[i])
                event_description.append(descr_txt)
                event_urls.append(event_url)
        
        driver.quit()
    
        p = pd.DataFrame({"Event Name":event_title, 
                          "Description":event_description,
                          "Time":event_time,
                          "Location":event_location,
                          "Price": event_price,
                          "Event URL":event_urls,
                          "Description":event_description})
        return(p)
    
    # Create a new instance of the Chrome driver
    driver = webdriver.Chrome(options=chrome_options)

    months = ['july','august','september','october','november','december']

    events_pd = pd.DataFrame({'Event Name':[],
                              'Description':[],
                              'Time':[],
                              'Location':[],
                              'Price':[],
                              'Event URL':[]})
    for month in months:
        print(month)
    
        # Navigate to the Time Out Los Angeles page
        url = f"https://www.timeout.com/los-angeles/things-to-do/{month}-events-calendar"
        try:
            driver.get(url)
        except:
            continue
        
        # Wait for the page to load
        wait = WebDriverWait(driver, 20)  # Increase the wait time if needed
        
        # Initialize a list to store the article URLs
        article_urls = list()
        
        # Scroll and collect article URLs
        SCROLL_PAUSE_TIME = 1
        max_attempts = 50  # Maximum number of scrolls
        attempts = 0
        num_hrefs = 0
        
        
        # Scroll 60% of the way to start
        total_height = driver.execute_script("return document.body.scrollHeight")
        scroll_height = total_height * 0.40
        driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        time.sleep(3)
        height = total_height
        
        # Get all the events 
        while True:
            # Scroll down the page
            """
            This strange scrolling + checking issue
            Is because I can't find a good way to get the 
            feed to continually refresh as the bot
            scrolls down the page -- if I scroll too fast
            it doesn't update and we get stuck on the footer
            and don't refresh, too slow and it takes forever
            to run.
        
            Currently it moves down 500px, and checks 
            every 50 scrolls if new content has emerged.
            I'm sure there is a way to simple add a conditional
            but MOVING ON.
            """
            new_height = scroll_down(driver)
            # print("#", attempts)
            
            # Wait to load the page
            time.sleep(SCROLL_PAUSE_TIME)
            
            # Only check every 50 scrolls
            attempts += 1
            if attempts % 50 != 0:
                continue
            # Collect new titles
            else:
                added = collect_titles(article_urls, driver)
                if added == 0:
                    break
        
        print(len(article_urls), 'total articles found.')
        p = pull_tout_event_info(driver, article_urls)

        events_pd = pd.concat([events_pd,p])
    
    # Close the browser
    driver.quit()

In [20]:
## Discover Los Angeles
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://www.discoverlosangeles.com/events'
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)
for _ in range(5):
    time.sleep(5)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//*[contains(@id, 'load-more')]"))
    )
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    element.click
elements = driver.find_elements("xpath", "//*[@title='See all content']")
for elem in elements:
    time.sleep(2)
    link = elem.get_attribute('href')
    driver.get(link)
driver.quit()

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=118.0.5993.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x00000001026ebe08 chromedriver + 5025288
1   chromedriver                        0x00000001026e2c23 chromedriver + 4987939
2   chromedriver                        0x0000000102284e67 chromedriver + 409191
3   chromedriver                        0x000000010229663c chromedriver + 480828
4   chromedriver                        0x000000010228bc84 chromedriver + 437380
5   chromedriver                        0x000000010228bd61 chromedriver + 437601
6   chromedriver                        0x0000000102289c7f chromedriver + 429183
7   chromedriver                        0x000000010228dac0 chromedriver + 445120
8   chromedriver                        0x0000000102317f8b chromedriver + 1011595
9   chromedriver                        0x00000001022fb4e2 chromedriver + 894178
10  chromedriver                        0x0000000102317571 chromedriver + 1009009
11  chromedriver                        0x00000001022fb2b3 chromedriver + 893619
12  chromedriver                        0x00000001022c5eb9 chromedriver + 675513
13  chromedriver                        0x00000001022c70ee chromedriver + 680174
14  chromedriver                        0x00000001026ad819 chromedriver + 4769817
15  chromedriver                        0x00000001026b2893 chromedriver + 4790419
16  chromedriver                        0x00000001026b966e chromedriver + 4818542
17  chromedriver                        0x00000001026b35bd chromedriver + 4793789
18  chromedriver                        0x000000010268598c chromedriver + 4606348
19  chromedriver                        0x00000001026d1b78 chromedriver + 4918136
20  chromedriver                        0x00000001026d1d30 chromedriver + 4918576
21  chromedriver                        0x00000001026e285e chromedriver + 4986974
22  libsystem_pthread.dylib             0x00007ff818f2818b _pthread_start + 99
23  libsystem_pthread.dylib             0x00007ff818f23ae3 thread_start + 15


In [87]:
# # discover_keywords
# driver = webdriver.Chrome(options=chrome_options)
# for url in all_links:
#     driver.get(url)
    
#     if 'event/' in url:
#         print(url)
        
#         # article = driver.find_element(By.XPATH, '//div[contains(@class, "page-content")]')
#         # if article:
#         #    content = article.text
#         #    print(content)
#         # break
#     elif 'things-to-do' in url:
#         article = driver.find_element(By.XPATH, '//div[contains(@class, "page-content")]')
#         links = article.find_elements(By.TAG_NAME, 'a')
        
#         for link in links:
#         # Extract the href attribute
#             href = link.get_attribute('href')
#             print(href)
#         break
           
           
# driver.quit()

https://www.discoverlosangeles.com/event/2024/07/27/strong-words-at-socalo
https://www.discoverlosangeles.com/event/2024/08/10/jon-batiste
https://www.discoverlosangeles.com/event/2024/08/23/ringling-bros-and-barnum-bailey-the-greatest-show-on-earth
https://www.discoverlosangeles.com/event/2024/07/27/arsenal-vs-manchester-united
None
https://www.discoverlosangeles.com/things-to-do
https://www.discoverlosangeles.com/things-to-do/attractions-tours
https://www.discoverlosangeles.com/things-to-do/go-behind-the-scenes-at-the-best-studio-tours-in-la
https://www.discoverlosangeles.com/node/4779
https://www.discoverlosangeles.com/things-to-do
https://www.discoverlosangeles.com/things-to-do/arts
https://www.discoverlosangeles.com/things-to-do/the-academy-museum-of-motion-pictures-10-things-you-cant-miss
https://www.discoverlosangeles.com/things-to-do/the-academy-museum-of-motion-pictures-10-things-you-cant-miss
https://www.discoverlosangeles.com/node/81151
https://www.discoverlosangeles.com/thi