In [41]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Build the Chrome options object used when the scrapers below start a browser.
chrome_options = Options()
# Headless mode is optional; uncomment to run without a visible browser window.
# chrome_options.add_argument("--headless")
for _chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)

In [2]:
# !pip3 install selenium

In [None]:
"""
TimeOut LA
Discover LA
WE Like LA
Secret Los Angeles
KCRW Events
"""

In [2]:
def pull_timeout_calendar(driver=None, months=None):
    """Scrape Time Out Los Angeles monthly event calendars into one DataFrame.

    For each month the calendar page is opened, scrolled so that more
    articles are rendered, the article links are collected, and then every
    article page is visited to read its description and occurrences.

    Parameters
    ----------
    driver : selenium WebDriver, optional
        Browser session to use. When omitted (or None) a Chrome session is
        created from the notebook-level ``chrome_options`` and closed before
        returning. A session supplied by the caller is left open.
    months : iterable of str, optional
        Lower-case month names used to build the calendar URLs. Defaults to
        July through December.

    Returns
    -------
    pandas.DataFrame
        One row per event occurrence with the columns ``Event Name``,
        ``Description``, ``Time``, ``Location``, ``Price`` and ``Event URL``.
        Values that could not be read are the string ``"unknown"``
        (``"None."`` for a missing description).
    """
    columns = ["Event Name", "Description", "Time", "Location", "Price", "Event URL"]
    scroll_pause_time = 1   # seconds to wait after each scroll step
    scrolls_per_check = 50  # look for new articles only every N scroll steps

    def scroll_down(driver):
        """Scroll 500px further down and return the current document height."""
        driver.execute_script("window.scrollBy(0, 500);")
        return driver.execute_script("return document.body.scrollHeight")

    def collect_titles(article_urls, driver):
        """Append unseen Time Out LA links found under "title" elements.

        Mutates ``article_urls`` in place and returns how many links were added.
        """
        title_divs = driver.find_elements(By.XPATH, '//*[contains(@class, "title")]')
        print(len(title_divs), "title divs in DOM")
        added = 0

        for title in title_divs:
            for link in title.find_elements(By.TAG_NAME, 'a'):
                href = link.get_attribute('href')
                if href and 'timeout.com/los-angeles/' in href and href not in article_urls:
                    article_urls.append(href)
                    added += 1
        print(added, "new articles")

        return added

    def pull_tout_event_info(driver, article_urls):
        """Visit every article URL and return its occurrences as a DataFrame.

        The driver is deliberately NOT closed here: the caller keeps using it
        for the remaining months.
        """
        records = []

        for event_url in article_urls:
            print(event_url)
            driver.get(event_url)
            time.sleep(2)

            # The event name is derived from the URL slug, not from the page.
            title = event_url.rstrip('/').split('/')[-1].replace("-", " ").title()

            try:
                review = driver.find_element(By.CSS_SELECTOR, 'section[data-section-name="review"]')
                descr_txt = review.text
            except Exception:
                descr_txt = "None."

            timestamps, venues, prices = [], [], []
            try:
                occurrence_section = driver.find_element(
                    By.CSS_SELECTOR, 'section[data-section-name="occurrences"]')
                zones = occurrence_section.find_element(By.CLASS_NAME, 'zoneItems')
                for entry in zones.find_elements(By.XPATH, './div'):
                    time_element = entry.find_element(By.TAG_NAME, 'time')
                    timestamps.append(time_element.get_attribute('datetime'))

                    try:
                        venue = entry.find_element(
                            By.XPATH, './/span[contains(@class, "venueName")]/a').text
                    except Exception:
                        venue = "unknown"
                    venues.append(venue)

                    try:
                        price = entry.find_element(
                            By.XPATH, './/div[contains(@class, "price")]').text
                    except Exception:
                        price = "unknown"
                    prices.append(price)
            except Exception:
                # Any failure while reading occurrences discards partial results.
                timestamps, venues, prices = [], [], []

            if not timestamps:
                # Keep the event even when no occurrence could be read.
                timestamps, venues, prices = ["unknown"], ["unknown"], ["unknown"]

            multi_day = len(timestamps) > 1
            for day, (timestamp, venue, price) in enumerate(zip(timestamps, venues, prices), start=1):
                records.append({
                    "Event Name": f"{title} (Day {day})" if multi_day else title,
                    "Description": descr_txt,
                    "Time": timestamp,
                    "Location": venue,
                    "Price": price,
                    "Event URL": event_url,
                })

        return pd.DataFrame(records, columns=columns)

    # Only close the browser at the end if this function opened it.
    owns_driver = driver is None
    if owns_driver:
        driver = webdriver.Chrome(options=chrome_options)

    if months is None:
        months = ['july', 'august', 'september', 'october', 'november', 'december']

    frames = []
    try:
        for month in months:
            print(month)

            url = f"https://www.timeout.com/los-angeles/things-to-do/{month}-events-calendar"
            try:
                driver.get(url)
            except Exception as exc:
                print(f"could not load {url}: {exc}")
                continue

            article_urls = []
            attempts = 0

            # Jump 40% of the initial page height down before the slow scroll.
            total_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script(f"window.scrollBy(0, {total_height * 0.40});")
            time.sleep(3)

            # The feed only refreshes while scrolling slowly: too fast and the
            # bot gets stuck on the footer, too slow and the run takes forever.
            # So scroll in 500px steps and only look for new articles every
            # `scrolls_per_check` steps; stop once a check finds nothing new.
            while True:
                scroll_down(driver)
                time.sleep(scroll_pause_time)

                attempts += 1
                if attempts % scrolls_per_check != 0:
                    continue
                if collect_titles(article_urls, driver) == 0:
                    break

            print(len(article_urls), 'total articles found.')
            frames.append(pull_tout_event_info(driver, article_urls))
    finally:
        if owns_driver:
            driver.quit()

    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [None]:
## Discover Los Angeles
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_links(driver):
    """Return all <a> elements whose inline style does not hide them.

    An anchor is excluded when its ``style`` attribute contains
    ``display:none`` or ``visibility:hidden``.
    """
    visible_anchor_xpath = (
        "//a[not(contains(@style, 'display:none'))"
        " and not(contains(@style, 'visibility:hidden'))]"
    )
    visible_anchors = driver.find_elements(By.XPATH, visible_anchor_xpath)
    return visible_anchors
def load_more_days(driver, max_clicks=5, loader_timeout=1000):
    """Click the page's "load more" control repeatedly to reveal more days.

    Each round waits for an element whose id contains ``load-more`` to become
    clickable, scrolls it into view, clicks it, then waits for the
    ``ajax-loader`` image to appear and disappear again. The loop stops early
    as soon as either step fails.

    Parameters
    ----------
    driver : selenium WebDriver
        Session already pointed at the events page.
    max_clicks : int, optional
        Maximum number of "load more" clicks (default 5).
    loader_timeout : int or float, optional
        Seconds to wait for the loading image to disappear (default 1000,
        because loading can take a long time).

    Returns
    -------
    bool
        Always True.
    """
    for _ in range(max_clicks):
        try:
            time.sleep(3)
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(@id, 'load-more')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            # Nudge back up 100px after scrollIntoView — presumably so the
            # control is not covered by page chrome; confirm against the site.
            driver.execute_script("window.scrollBy(0, -100);")
            element.click()
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still aborts the cell instead of being reported as "done".
            print("can't load anymore.")
            break
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "img[name^='ajax-loader']")))
            # Takes a long time to load...
            WebDriverWait(driver, loader_timeout).until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, "img[name^='ajax-loader']")))
            print("The loading GIF has disappeared.")
        except Exception:
            print("The loading GIF is still visible or the timeout was reached.")
            break
    return True

def load_more_daily_events(driver):
    """Expand per-day event lists by clicking their "... item ..." controls.

    Every ``<ul>`` whose text contains ``item`` is searched for a descendant
    whose own text contains ``item``; that element is clicked via JavaScript.
    The number of visible anchors is printed before and after so the effect
    can be checked.

    Parameters
    ----------
    driver : selenium WebDriver
        Session already pointed at the events page.

    Returns
    -------
    bool
        Always True.
    """
    anchors = get_links(driver)
    print("links before:", len(anchors))
    time.sleep(3)

    for candidate in driver.find_elements(By.CSS_SELECTOR, "ul"):
        if 'item' not in candidate.text:
            continue
        try:
            item_element = candidate.find_element(By.XPATH, ".//*[contains(text(), 'item')]")
            print(item_element.text)
            driver.execute_script("arguments[0].click();", item_element)
        except Exception as exc:
            # One list without a matching element must not abort the others.
            print("could not expand list:", exc)
            continue
        time.sleep(1)

    anchors = get_links(driver)
    print("links after:", len(anchors))

    return True

def collect_links(driver):
    """Return the unique Discover LA event URLs linked from the current page.

    Anchors come from ``get_links``; hrefs that are empty, that do not contain
    the event URL prefix, or that were already collected are skipped. Order of
    first appearance is preserved.
    """
    event_prefix = 'https://www.discoverlosangeles.com/event/'
    visited_links = []

    hrefs = (anchor.get_attribute('href') for anchor in get_links(driver))
    for href in hrefs:
        is_new_event = bool(href) and event_prefix in href and href not in visited_links
        if is_new_event:
            visited_links.append(href)

    print(len(visited_links)," events found.")
    return visited_links

def _parse_event_facts(info_text):
    """Split a '|'-separated info banner into ``(price, date, time)`` strings.

    Each part is classified in this order: contains ``$`` -> price, contains
    ``AM``/``PM`` -> time, contains a four-digit year -> date. Fields that
    never match stay None.
    """
    import re

    year = r"\b(?:19|20)\d{2}\b"
    price = date = time_ = None

    for fact in info_text.split("|"):
        if '$' in fact:
            # Keep only the last amount when several are listed.
            price = ('$' + fact.split("$")[-1]).strip()
        elif 'PM' in fact or 'AM' in fact:
            time_ = ' '.join(fact.split())
        elif re.search(year, fact):
            date = ' '.join(fact.split())
            # TODO: date ranges are reduced to their start date.
            if "-" in date:
                date = date.split("-")[0].strip()
            # Drop the ", YYYY" suffix for single dates and range starts alike.
            trimmed = re.split(r",?\s*" + year, date)[0].strip()
            date = trimmed or date

    return price, date, time_


def collect_event_info(driver,visited_links):
    """Visit each Discover LA event page and collect its details.

    Parameters
    ----------
    driver : selenium WebDriver
        Session used to open every link.
    visited_links : iterable of str
        Event page URLs; the event name is derived from the last URL segment.

    Returns
    -------
    pandas.DataFrame
        One row per link with the columns ``Name``, ``Price``, ``Date``,
        ``Time``, ``Location`` and ``Description``. A field is None when the
        corresponding element was not found or could not be parsed.
    """
    columns = ["Name", "Price", "Date", "Time", "Location", "Description"]
    records = []

    for link in visited_links:
        name = link.split('/')[-1].replace('-',' ').strip().title()
        driver.get(link)

        price = date = time_ = None

        ## Get event details
        try:
            info = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='dla-information']"))
            )
            price, date, time_ = _parse_event_facts(' '.join(info.text.split()))
        except Exception:
            # Best effort: a page without the info banner keeps None fields.
            pass

        ## Get location
        try:
            venue = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "span[class*='dla-venue']"))
            )
            location = ' '.join(venue.text.split())
        except Exception:
            location = None

        ## Get description
        try:
            description = driver.find_element(By.XPATH, "//div[@class='dla-txt']").text
        except Exception:
            description = None

        records.append({
            "Name": name,
            "Price": price,
            "Date": date,
            "Time": time_,
            "Location": location,
            "Description": description,
        })

    return pd.DataFrame(records, columns=columns)

# Scrape the Discover Los Angeles events listing: expand the page, collect the
# event links, then visit each one. The result is kept in `p` for later cells.
url = 'https://www.discoverlosangeles.com/events'
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)

    load_more_days(driver)
    load_more_daily_events(driver)
    visited_links = collect_links(driver)
    p = collect_event_info(driver, visited_links)
finally:
    # Always close the browser, even when a scraping step raises.
    driver.quit()

Unnamed: 0,Name,Price,Date,Time,Location,Description
0,Tchaikovsky Spectacular With Fireworks,$18.00,Aug 3,8:00PM - 10:00PM,Hollywood Bowl,"Cymbals crash, horns blare, and fireworks blas..."
1,John Legend,$299.50,Aug 2,8:00PM - 10:00PM,Greek Theatre,John Legend plays the Greek.
2,Reggae Night Xxii Jamrock Reggae Night At The ...,$450,Aug 4,7:00PM - 10:00PM,Hollywood Bowl Parking,Come experience world-renowned Jamaican reggae...
3,Norah Jones With Mavis Staples,$150,Aug 4,7:30PM - 10:00PM,Greek Theatre,NORAH JONES With Mavis Staples
4,Oar Summer Tour 24 With Special Guest Fitz And...,$100.50,Aug 8,6:55PM - 11:00PM,Greek Theatre,O.A.R. Summer Tour 24 with special guest Fitz ...
...,...,...,...,...,...,...
275,Las Shakespeare In The Park As You Like It,,Aug 11,6:00PM - 10:00PM,Old Los Angeles Zoo,"Pack your picnic basket, shake out your blanke..."
276,Underground Worlds Centering Sound In Art,,Aug 11,6:00PM - 9:00PM,Armory Center for the Arts,"Join us for an unforgettable evening of music,..."
277,Khruangbin,,Aug 11,7:00PM - 11:00PM,Hollywood Bowl,"Over the last decade and four albums, Khruangb..."
278,Moneyball In 4K,$10,Aug 11,7:30PM - 10:15PM,Academy Museum of Motion Pictures,Guest speaker: Pre-screening conversation with...


In [51]:
from collections import Counter

# Tally scraped events per parsed date; a None key means no date was extracted.
Counter(p["Date"].values)

Counter({'Aug 3': 50,
         'Aug 10': 42,
         'Aug 4': 28,
         'Aug 2': 20,
         'Aug 11': 20,
         'Aug 9': 18,
         'Aug 8': 17,
         'Aug 7': 10,
         'Aug 5': 7,
         'Jun 22': 5,
         'Aug 6': 5,
         'Jul 11': 4,
         'Sep 4': 3,
         'Jun 8': 3,
         'Jul 13': 3,
         'May 26': 3,
         None: 3,
         'Jul 19': 3,
         'Aug 18': 2,
         'Aug 22': 2,
         'Jun 29': 2,
         'Jun 1': 2,
         'Aug 13': 1,
         'Aug 14': 1,
         'Aug 16': 1,
         'Aug 25': 1,
         'Aug 23': 1,
         'Aug 28': 1,
         'Aug 31': 1,
         'Sep 6': 1,
         'Sep 7': 1,
         'Sep 8': 1,
         'Sep 14': 1,
         'Sep 22': 1,
         'Jul 9': 1,
         'Jun 10': 1,
         'Jul 1': 1,
         'Jun 28': 1,
         'Jun 14': 1,
         'Apr 11': 1,
         'May 30': 1,
         'Jun 21': 1,
         'May 4': 1,
         'Jun 7': 1,
         'Jun 2': 1,
         'Aug 1': 1,
    

In [28]:
# Scratch check: calling .split on None raises AttributeError before the loop
# (or its else branch) ever runs. The error is caught and printed so the cell
# still demonstrates it without breaking "Run All".
d = None
try:
    for fact in d.split("|"):
        print('y')
    else:
        print('n')
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'NoneType' object has no attribute 'split'

In [6]:
# Debug peek: print the href of at most the first 21 anchors.
# NOTE(review): relies on `anchors` (and a still-open browser session) left
# over from an earlier interactive run; no cell defines it at notebook level.
c = 0
for c, a in enumerate(anchors, start=1):
    link = a.get_attribute('href')
    print(link)
    if c > 20:
        break

MaxRetryError: HTTPConnectionPool(host='localhost', port=53092): Max retries exceeded with url: /session/7cf2ea506a327f2ca8df9b42e6b3b26a/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x125eeb860>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [87]:
# # discover_keywords
# driver = webdriver.Chrome(options=chrome_options)
# for url in all_links:
#     driver.get(url)
    
#     if 'event/' in url:
#         print(url)
        
#         # article = driver.find_element(By.XPATH, '//div[contains(@class, "page-content")]')
#         # if article:
#         #    content = article.text
#         #    print(content)
#         # break
#     elif 'things-to-do' in url:
#         article = driver.find_element(By.XPATH, '//div[contains(@class, "page-content")]')
#         links = article.find_elements(By.TAG_NAME, 'a')
        
#         for link in links:
#         # Extract the href attribute
#             href = link.get_attribute('href')
#             print(href)
#         break
           
           
# driver.quit()

https://www.discoverlosangeles.com/event/2024/07/27/strong-words-at-socalo
https://www.discoverlosangeles.com/event/2024/08/10/jon-batiste
https://www.discoverlosangeles.com/event/2024/08/23/ringling-bros-and-barnum-bailey-the-greatest-show-on-earth
https://www.discoverlosangeles.com/event/2024/07/27/arsenal-vs-manchester-united
None
https://www.discoverlosangeles.com/things-to-do
https://www.discoverlosangeles.com/things-to-do/attractions-tours
https://www.discoverlosangeles.com/things-to-do/go-behind-the-scenes-at-the-best-studio-tours-in-la
https://www.discoverlosangeles.com/node/4779
https://www.discoverlosangeles.com/things-to-do
https://www.discoverlosangeles.com/things-to-do/arts
https://www.discoverlosangeles.com/things-to-do/the-academy-museum-of-motion-pictures-10-things-you-cant-miss
https://www.discoverlosangeles.com/things-to-do/the-academy-museum-of-motion-pictures-10-things-you-cant-miss
https://www.discoverlosangeles.com/node/81151
https://www.discoverlosangeles.com/thi