<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/Events_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
def clean_string(string):
    """Normalise the whitespace of ``string``.

    Tabs, newlines and carriage returns are deleted outright (they are not
    replaced by a space), every remaining run of two or more whitespace
    characters is collapsed to a single space, and the result is stripped of
    leading and trailing whitespace.
    """
    return re.sub(r'\s{2,}', ' ', re.sub(r'[\t\n\r]+', '', string)).strip()

In [None]:
# Fetch the first page of the visitBerlin event calendar and collect the
# <li> teasers, each of which describes one event.
# A timeout is passed explicitly: without one, requests waits indefinitely
# on an unresponsive server and the cell never finishes.
res = requests.get('https://www.visitberlin.de/en/event-calendar-berlin?page=0', timeout=30)
soup = BeautifulSoup(res.text, 'html.parser')
berlinEvents = soup.find_all('li', class_='l-list__item')
# Cell output: number of teasers found on the page
len(berlinEvents)

21

In [None]:
# @title
# Turn the teasers of the first visitBerlin page (`berlinEvents`) into a list
# of dictionaries, one per event. Missing page elements yield None (or an
# empty string for the opening hours) instead of raising.
events = []
base_url = "https://www.visitberlin.de"

for event in berlinEvents:
    # Event name is the title attribute of the main teaser link
    name_link = event.find('a', class_='teaser-search__mainlink')
    event_name = name_link.get('title', '').strip() if name_link is not None else None

    # Picture link: only built when the <img> actually carries a src,
    # because base_url + None would raise a TypeError
    picture_tag = event.find('img', class_='teaser-search__img fluid-img')
    picture_src = picture_tag.get('src') if picture_tag is not None else None
    picture_link = base_url + picture_src if picture_src else None

    # Category
    category_tag = event.find('div', class_='teaser-search__category')
    category = category_tag.get_text(strip=True) if category_tag is not None else None

    # Address
    address_tag = event.find('span', class_='teaser-search__print-address teaser-search__print-info')
    address = address_tag.get_text(strip=True) if address_tag is not None else None

    # Website (stored as the ticket link)
    website_tag = event.find('p', class_='teaser-search__print-link teaser-search__print-info')
    website = website_tag.get_text(strip=True) if website_tag is not None else None

    # Opening hours: the inner span may be absent even when the <p> exists,
    # so both lookups are checked before reading the text
    time = ""
    time_tag = event.find('p', class_='teaser-search__time me')
    time_content = time_tag.select_one('span.me__content') if time_tag is not None else None
    if time_content is not None:
        time = time_content.get_text(strip=True)

    # Start and end dates: first and second <time> element of the date line
    date_tag = event.find('p', class_='teaser-search__date')
    times = date_tag.find_all('time') if date_tag is not None else []
    start_date = times[0].get_text(strip=True) if times else None
    end_date = times[1].get_text(strip=True) if len(times) > 1 else None

    # Create the event info dictionary ('address' appears exactly once; the
    # original literal listed the key twice with the same value)
    event_info = {
        'event_name': event_name,
        'picture_link': picture_link,
        'category': category,
        'address': address,
        'city': 'Berlin',
        'country': 'Germany',
        'event_date': f"{start_date} - {end_date}" if start_date and end_date else start_date or None,
        'opening_hours': clean_string(time),
        'ticket_price': "",
        'ticket_link': website,
        'source_website': base_url
    }

    # Add the event info to the events list
    events.append(event_info)

events

In [None]:
# Walk through the visitBerlin event calendar page by page until a page
# contains no event teasers, collecting one dictionary per event.
base_url = "https://www.visitberlin.de"
page = 0
events = []

while True:
    # Explicit timeout: requests otherwise waits indefinitely on a stalled server
    res = requests.get(f'{base_url}/en/event-calendar-berlin?page={page}', timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')
    berlinEvents = soup.find_all('li', class_='l-list__item')

    if not berlinEvents:  # If no events are found, break the loop
        break

    for event in berlinEvents:
        # Event name is the title attribute of the main teaser link
        name_link = event.find('a', class_='teaser-search__mainlink')
        event_name = name_link.get('title', '').strip() if name_link is not None else None

        # Picture link: only built when the <img> actually carries a src,
        # because base_url + None would raise a TypeError
        picture_tag = event.find('img', class_='teaser-search__img fluid-img')
        picture_src = picture_tag.get('src') if picture_tag is not None else None
        picture_link = base_url + picture_src if picture_src else None

        # Category
        category_tag = event.find('div', class_='teaser-search__category')
        category = category_tag.get_text(strip=True) if category_tag is not None else None

        # Address
        address_tag = event.find('span', class_='teaser-search__print-address teaser-search__print-info')
        address = address_tag.get_text(strip=True) if address_tag is not None else None

        # Website (stored as the ticket link)
        website_tag = event.find('p', class_='teaser-search__print-link teaser-search__print-info')
        website = website_tag.get_text(strip=True) if website_tag is not None else None

        # Opening hours: the inner span may be absent even when the <p>
        # exists, so both lookups are checked before reading the text
        time = ""
        time_tag = event.find('p', class_='teaser-search__time me')
        time_content = time_tag.select_one('span.me__content') if time_tag is not None else None
        if time_content is not None:
            time = time_content.get_text(strip=True)

        # Start and end dates: first and second <time> element of the date line
        date_tag = event.find('p', class_='teaser-search__date')
        times = date_tag.find_all('time') if date_tag is not None else []
        start_date = times[0].get_text(strip=True) if times else None
        end_date = times[1].get_text(strip=True) if len(times) > 1 else None

        # Create the event info dictionary ('address' appears exactly once;
        # the original literal listed the key twice with the same value)
        event_info = {
            'event_name': event_name,
            'picture_link': picture_link,
            'category': category,
            'address': address,
            'city': 'Berlin',
            'country': 'Germany',
            'event_date': f"{start_date} - {end_date}" if start_date and end_date else start_date or None,
            'opening_hours': clean_string(time),
            'ticket_price': "",
            'ticket_link': website,
            'source_website': base_url
        }

        # Add the event info to the events list
        events.append(event_info)
    print(page)  # progress indicator: index of the page just processed
    page += 1  # Increment page for the next iteration

# Convert events list to Pandas DataFrame
df = pd.DataFrame(events)
df

In [None]:
# Rebuild the DataFrame from the current `events` list and write it to an
# Excel workbook in the working directory, leaving out the index column.
df = pd.DataFrame(events)
df.to_excel('berlin_events.xlsx', index=False)

In [None]:
# Fetch the first result page of the Hamburg events calendar for one date and
# collect the <article> teasers, each of which describes one event.
hamburgUrl = "https://www.hamburg-travel.com/see-explore/events/events-calendar/js.api?filter[date]=05.10.2024&filter[district]=all&page=0&filter[distance]=50"
# Explicit timeout: requests otherwise waits indefinitely on a stalled server
res = requests.get(hamburgUrl, timeout=30)
soup = BeautifulSoup(res.text, 'html.parser')
hamburgEvents = soup.find_all('article', class_='listTeaser-event')
# Cell output: number of teasers found on the page
len(hamburgEvents)

3

In [None]:
# Turn the Hamburg teasers in `hamburgEvents` into a list of dictionaries,
# one per event. Missing page elements yield None instead of raising.
events = []
base_url = "https://www.hamburg-travel.com/"
for event in hamburgEvents:
    # Headline and event type (each looked up once)
    heading = event.find('h3')
    event_name = heading.text if heading is not None else None
    profiling = event.find('ul', class_='listTeaser-event__text__profiling')
    event_type = profiling.text.strip() if profiling is not None else None

    # Date, time and location are the text nodes that directly follow their
    # icon <span>. The sibling can be missing or be another tag, so it is only
    # used when it really is a string.
    icon_texts = {}
    for icon_class in ('icon-calendar', 'icon-clock', 'icon-located'):
        icon = event.find('span', class_=icon_class)
        sibling = icon.next_sibling if icon is not None else None
        icon_texts[icon_class] = sibling.strip() if isinstance(sibling, str) else None
    date = icon_texts['icon-calendar']
    time = icon_texts['icon-clock']
    location = icon_texts['icon-located']

    # Picture link: .get() avoids a KeyError on an <img> without src
    image = event.find('img')
    picture_link = image.get('src') if image is not None else None

    # Departure note: when any paragraph mentions 'Departure', take the first
    # <li> of the first <ul> in the teaser (if there is one)
    departure_info = None
    if any('Departure' in p.text for p in event.find_all('p')):
        first_list = event.find('ul')
        first_item = first_list.find('li') if first_list is not None else None
        if first_item is not None:
            departure_info = first_item.text.strip()

    # 'address' is spelled like in the other cities' records so the columns
    # line up when the tables are combined
    event_info = {
        'event_name': event_name,
        'picture_link': picture_link,
        'event_type': event_type,
        'date': date,
        'time': time,
        'address': location,
        'city': 'Hamburg',
        'country': 'Germany',
        'ticket_price': "",
        'ticket_link': "",
        'source_website': base_url,
        'note': departure_info
    }
    events.append(event_info)

events

In [None]:
# @title
# Walk through the Hamburg events calendar for one date, page by page, until
# a page contains no event teasers, collecting one dictionary per event.
# The {} placeholder in hamburgUrl receives the page number.
base_url = "https://www.hamburg-travel.com/"
hamburgUrl = "https://www.hamburg-travel.com/see-explore/events/events-calendar/js.api?filter[district]=all&pageDate=05.10.2024&page={}&filter[distance]=50"

events = []
page = 0

while True:
    # Fetch page content
    url = hamburgUrl.format(page)
    print(url)
    # Explicit timeout: requests otherwise waits indefinitely on a stalled server
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')
    hamburgEvents = soup.find_all('article', class_='listTeaser-event')

    if not hamburgEvents:  # Break loop if no more events are found
        break

    for event in hamburgEvents:
        # Headline and event type (each looked up once)
        heading = event.find('h3')
        event_name = heading.text if heading is not None else None
        profiling = event.find('ul', class_='listTeaser-event__text__profiling')
        event_type = profiling.text.strip() if profiling is not None else None

        # Date, time and location are the text nodes that directly follow
        # their icon <span>. The sibling can be missing or be another tag, so
        # it is only used when it really is a string.
        icon_texts = {}
        for icon_class in ('icon-calendar', 'icon-clock', 'icon-located'):
            icon = event.find('span', class_=icon_class)
            sibling = icon.next_sibling if icon is not None else None
            icon_texts[icon_class] = sibling.strip() if isinstance(sibling, str) else None
        date = icon_texts['icon-calendar']
        time = icon_texts['icon-clock']
        location = icon_texts['icon-located']

        # Picture link: .get() avoids a KeyError on an <img> without src
        image = event.find('img')
        picture_link = image.get('src') if image is not None else None

        # Departure note: when any paragraph mentions 'Departure', take the
        # first <li> of the first <ul> in the teaser (if there is one)
        departure_info = None
        if any('Departure' in p.text for p in event.find_all('p')):
            first_list = event.find('ul')
            first_item = first_list.find('li') if first_list is not None else None
            if first_item is not None:
                departure_info = first_item.text.strip()

        # Create event info dictionary ('address' is spelled like in the
        # other cities' records so the columns line up when combined)
        event_info = {
            'event_name': event_name,
            'picture_link': picture_link,
            'event_type': event_type,
            'date': date,
            'time': time,
            'address': location,
            'city': 'Hamburg',
            'country': 'Germany',
            'ticket_price': "",
            'ticket_link': "",
            'source_website': base_url,
            'note': departure_info
        }

        events.append(event_info)

    # Move to the next page
    print(page)
    page += 1

# All events have been scraped
print(f"Total events scraped: {len(events)}")

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def generate_date_range(start_date, end_date):
    """Return every day from start_date to end_date as 'dd.mm.YYYY' strings.

    Both arguments are 'dd.mm.YYYY' strings and both end points are included.
    An empty list is returned when end_date lies before start_date.
    """
    fmt = "%d.%m.%Y"
    first_day = datetime.strptime(start_date, fmt)
    last_day = datetime.strptime(end_date, fmt)
    day_count = (last_day - first_day).days + 1
    return [
        (first_day + timedelta(days=offset)).strftime(fmt)
        for offset in range(day_count)
    ]

def extract_event_details(event, base_url):
    """Extract details of a single event and return them as a dictionary.

    Args:
        event: Parsed teaser element of one event; it must offer
            BeautifulSoup-style ``find`` and ``find_all`` methods.
        base_url: Value stored unchanged under 'source_website'.

    Returns:
        dict with the keys 'event_name', 'picture_link', 'event_type',
        'date', 'time', 'address', 'city', 'country', 'ticket_price',
        'ticket_link', 'source_website' and 'note'. A detail that is not
        present in the teaser is None.
    """
    def text_after_icon(icon_class):
        # The value is the text node directly after the icon <span>. The
        # sibling can be missing or be another tag, so it is only used when
        # it really is a string.
        icon = event.find('span', class_=icon_class)
        sibling = icon.next_sibling if icon is not None else None
        return sibling.strip() if isinstance(sibling, str) else None

    heading = event.find('h3')
    profiling = event.find('ul', class_='listTeaser-event__text__profiling')
    image = event.find('img')

    # Departure note: when any paragraph mentions 'Departure', take the first
    # <li> of the first <ul> in the teaser (if there is one)
    departure_info = None
    if any('Departure' in p.text for p in event.find_all('p')):
        first_list = event.find('ul')
        first_item = first_list.find('li') if first_list is not None else None
        if first_item is not None:
            departure_info = first_item.text.strip()

    return {
        'event_name': heading.text if heading is not None else None,
        # .get() avoids a KeyError on an <img> without src
        'picture_link': image.get('src') if image is not None else None,
        'event_type': profiling.text.strip() if profiling is not None else None,
        'date': text_after_icon('icon-calendar'),
        'time': text_after_icon('icon-clock'),
        'address': text_after_icon('icon-located'),
        'city': 'Hamburg',
        'country': 'Germany',
        'ticket_price': "",
        'ticket_link': "",
        'source_website': base_url,
        'note': departure_info
    }

def scrape_events_for_date(date, base_url, timeout=30):
    """Scrape all events for a specific date and return them as a list.

    Result pages are requested one after another, starting at page 0, until a
    page contains no event teasers.

    Args:
        date: Day to query, inserted verbatim into the ``pageDate`` query
            parameter (the usage in this notebook passes 'dd.mm.YYYY').
        base_url: Passed on to extract_event_details; the request URL itself
            is fixed and does not depend on it.
        timeout: Seconds to wait for each HTTP response. Without a timeout,
            requests would wait indefinitely on a stalled server.

    Returns:
        list of event dictionaries as produced by extract_event_details.
    """
    page = 0
    events = []
    while True:
        url = f"https://www.hamburg-travel.com/see-explore/events/events-calendar/js.api?filter[district]=all&pageDate={date}&page={page}&filter[distance]=50"
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        hamburgEvents = soup.find_all('article', class_='listTeaser-event')

        # An empty page marks the end of the listing for this date
        if not hamburgEvents:
            break

        events.extend(extract_event_details(event, base_url) for event in hamburgEvents)
        page += 1
    return events

def scrape_events_between_dates(start_date, end_date, base_url):
    """Collect the events of every day from start_date to end_date.

    The days come from generate_date_range, each day is scraped with
    scrape_events_for_date, and the per-day results are concatenated in
    chronological order into one list.
    """
    collected = []
    for day in generate_date_range(start_date, end_date):
        collected.extend(scrape_events_for_date(day, base_url))
    return collected

# Usage
# Dates are 'dd.mm.YYYY' strings, the format generate_date_range parses;
# both the start and the end day are included in the scrape.
start_date = "06.10.2024"
end_date = "07.10.2024"
# base_url ends up in every record under 'source_website'
base_url = "https://www.hamburg-travel.com"

# Scrape the whole range, then show the result as a DataFrame (cell output)
events = scrape_events_between_dates(start_date, end_date, base_url)
df = pd.DataFrame(events)
df


In [None]:
# Optional export, currently disabled: uncomment the two lines below to
# rebuild the DataFrame from `events` and save it as an Excel workbook.
# df = pd.DataFrame(events)
# df.to_excel('hamburg_events.xlsx', index=False)
# Display the DataFrame created by an earlier cell
df

In [None]:
# Fetch the first page of the muenchen.de event listing and collect the <li>
# items, each of which describes one event.
muenchenUrl = "https://www.muenchen.de/en/veranstaltungen/event?page=0"
# Explicit timeout: requests otherwise waits indefinitely on a stalled server
res = requests.get(muenchenUrl, timeout=30)
soup = BeautifulSoup(res.text, 'html.parser')
muenchenEvents = soup.find_all('li', class_='m-listing__list-item')
# Cell output: number of list items found on the page
len(muenchenEvents)

30

In [None]:
# @title
# Turn the list items in `muenchenEvents` into a list of dictionaries, one
# per event. Every lookup is checked for None so that a list item without a
# headline or date range yields None instead of raising.
events = []
for event in muenchenEvents:
    # Extract event name
    headline = event.find('h3', class_='m-event-list-item__headline')
    event_name = headline.get_text(strip=True) if headline is not None else None

    # Extract start and end dates from the datetime attribute of the range items
    start_tag = event.find('time', class_='m-date-range__item', itemprop='startDate')
    start_date = start_tag.get('datetime') if start_tag is not None else None
    end_tag = event.find('time', class_='m-date-range__item', itemprop='endDate')
    end_date = end_tag.get('datetime') if end_tag is not None else None

    # Extract event day and time: first <time> element that has a datetime attribute
    date_time_tag = event.find('time', datetime=True)
    event_datetime = date_time_tag['datetime'] if date_time_tag is not None else None

    # Extract location
    location_tag = event.find('p', class_='m-event-list-item__detail', itemprop='location')
    location = location_tag.get_text(strip=True) if location_tag is not None else None

    # Extract ticket link: first button-styled anchor that has an href
    ticket_link_tag = event.find('a', class_='m-button', href=True)
    ticket_link = ticket_link_tag['href'] if ticket_link_tag is not None else None

    event_info = {
        'event_name': event_name,
        'start_date': start_date,
        'end_date': end_date,
        'event_datetime': event_datetime,
        'location': location,
        'ticket_link': ticket_link
    }
    events.append(event_info)

events

In [2]:
# Walk through the muenchen.de event listing page by page until a page
# contains no list items, collecting one dictionary per event.
# The {} placeholder in muenchenUrl receives the page number.
base_url = "https://www.muenchen.de"
muenchenUrl = "https://www.muenchen.de/en/veranstaltungen/event?page={}"
page = 0
events = []

while True:
    # Fetch page content
    url = muenchenUrl.format(page)
    print(f"Fetching URL: {url}")
    # Explicit timeout: requests otherwise waits indefinitely on a stalled server
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')
    muenchenEvents = soup.find_all('li', class_='m-listing__list-item')

    if not muenchenEvents:  # Break loop if no more events are found
        print("No more events found.")
        break

    # Process the events on the current page
    for event in muenchenEvents:
        # Extract event name (each element is looked up once)
        headline = event.find('h3', class_='m-event-list-item__headline')
        event_name = headline.get_text(strip=True) if headline is not None else None

        # Extract start and end dates from the datetime attribute of the range items
        start_tag = event.find('time', class_='m-date-range__item', itemprop='startDate')
        start_date = start_tag.get('datetime') if start_tag is not None else None
        end_tag = event.find('time', class_='m-date-range__item', itemprop='endDate')
        end_date = end_tag.get('datetime') if end_tag is not None else None

        # Only format a range when both ends exist; otherwise keep the single
        # known date (or None) instead of writing the text "None - None"
        if start_date and end_date:
            date_text = f'{start_date} - {end_date}'
        else:
            date_text = start_date or end_date

        # Extract event day and time: first <time> element that has a datetime attribute
        date_time_tag = event.find('time', datetime=True)
        event_datetime = date_time_tag['datetime'] if date_time_tag is not None else None

        # Extract location
        location_tag = event.find('p', class_='m-event-list-item__detail', itemprop='location')
        location = location_tag.get_text(strip=True) if location_tag is not None else None

        # Extract ticket link: first button-styled anchor that has an href
        ticket_link_tag = event.find('a', class_='m-button', href=True)
        ticket_link = ticket_link_tag['href'] if ticket_link_tag is not None else None

        event_info = {
            'event_name': event_name,
            'picture_link': "",
            'event_type': "",
            'date': date_text,
            'time': event_datetime,
            'address': location,
            'city': "München",
            'country': "Germany",
            'ticket_price': "",
            'ticket_link': ticket_link,
            'source_website': base_url
        }
        events.append(event_info)

    # Move to the next page
    page += 1

# Output the final list of events
print(f"Total events found: {len(events)}")

Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=0
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=1
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=2
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=3
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=4
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=5
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=6
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=7
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=8
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=9
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=10
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=11
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=12
Fetching URL: https://www.muenchen.de/en/veranstaltungen/event?page=13
Fetching URL: ht

In [3]:
# Build a DataFrame from the current `events` list, write it to an Excel
# workbook in the working directory (without the index column) and display it.
df = pd.DataFrame(events)
df.to_excel('muenchen_events.xlsx', index=False)
df

Unnamed: 0,event_name,picture_link,event_type,date,time,address,city,country,ticket_price,ticket_link,source_website
0,Sonderausstellung Erntedank: Von Erbsenzählern...,,,2024-09-28T12:00:00Z - 2024-10-06T12:00:00Z,2024-09-28T12:00:00Z,Botanischer Garten,München,Germany,,,https://www.muenchen.de
1,"Tintenfische, Teufelsfinger und Tentakel - die...",,,2024-08-13T12:00:00Z - 2024-12-30T12:00:00Z,2024-08-13T12:00:00Z,Paläontologisches Museum,München,Germany,,,https://www.muenchen.de
2,Münchner Outdoorsportfestival,,,2024-10-06T12:00:00Z - 2024-10-06T12:00:00Z,2024-10-06T12:00:00Z,Olympiapark,München,Germany,,,https://www.muenchen.de
3,Viktor&Rolf. Fashion Statements,,,2024-09-18T12:00:00Z - 2024-10-06T12:00:00Z,2024-09-18T12:00:00Z,Kunsthalle München,München,Germany,,,https://www.muenchen.de
4,Ägypten 1983 – Fotografien von Dirk Altenkirch,,,2024-09-13T12:00:00Z - 2024-10-20T12:00:00Z,2024-09-13T12:00:00Z,Museum Ägyptischer Kunst,München,Germany,,,https://www.muenchen.de
...,...,...,...,...,...,...,...,...,...,...,...
4652,Sonderkonzerte zu Silvester,,,2024-12-01T12:00:00Z - 2025-12-31T12:00:00Z,2024-12-01T12:00:00Z,Hofkapelle der Residenz,München,Germany,,https://www.muenchenticket.de/tickets/performa...,https://www.muenchen.de
4653,Sonderkonzerte zu Silvester,,,2024-12-01T12:00:00Z - 2025-12-31T12:00:00Z,2024-12-01T12:00:00Z,Hofkapelle der Residenz,München,Germany,,https://www.muenchenticket.de/tickets/performa...,https://www.muenchen.de
4654,BUSHIDO Alles wird gut - Tour 2026,,,2026-01-22T12:00:00Z - 2026-01-22T12:00:00Z,2026-01-22T12:00:00Z,Olympiahalle,München,Germany,,https://www.muenchenticket.de/tickets/performa...,https://www.muenchen.de
4655,HELENE FISCHER,,,2026-07-17T12:00:00Z - 2026-07-17T12:00:00Z,2026-07-17T12:00:00Z,Allianz Arena,München,Germany,,https://www.muenchenticket.de/tickets/performa...,https://www.muenchen.de
