# Webscraping
In this step, we scrape the relevant information from the website so that it can be provided to our RAG (retrieval-augmented generation) pipeline.

In [1]:
import requests
from bs4 import BeautifulSoup


In [27]:
from tokenize import group


def scrape_website_for_offers(url, timeout=10):
    """Download ``url`` and return the text of all ``div.zmitem`` elements.

    For every ``<div class="zmitem">`` on the page, the stripped text of each
    ``<p>`` and ``<span>`` inside it is joined with newlines. The resulting
    per-item strings are then joined with ``"\\n\\n---NEW ITEM---"``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One string containing all items, or ``""`` when the page has no
        ``div.zmitem`` elements.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: The server did not respond within ``timeout``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    grouped_content = []
    for item in soup.find_all("div", class_="zmitem"):
        # NOTE(review): the saved notebook output shows some lines twice in a
        # row; possibly a <span> nested inside another matched tag is being
        # collected once per tag — confirm whether that is intended.
        paragraphs = item.find_all(["p", "span"])
        grouped_content.append(
            "\n".join(p.get_text().strip() for p in paragraphs)
        )

    # str.join on an empty list already yields "", so no special case is needed.
    return "\n\n---NEW ITEM---".join(grouped_content)

# Define the URLs of the website pages to scrape
offer_url_page1 = 'https://www.besigheim.de/kultur-und-tourismus/besigheimer-winzerfest-2025/staende+datenbank'
# NOTE(review): the `zm.sid` query parameter on pages 2 and 3 looks like a
# session id — confirm these URLs stay valid across runs.
offer_url_page2 = 'https://www.besigheim.de/site/Besigheim-2023/node/23823496/page2/page2?zm.sid=zmc27esqnht1'
offer_url_page3 = 'https://www.besigheim.de/site/Besigheim-2023/node/23823496/page3/page3?zm.sid=zmc27esqnht1'

# Scrape every page separately first.
offer_pages = [
    scrape_website_for_offers(page_url)
    for page_url in (offer_url_page1, offer_url_page2, offer_url_page3)
]

# scrape_website_for_offers only puts the item separator *between* the items
# of one page. Plain concatenation would therefore fuse the last item of a
# page with the first item of the next one, so the pages are joined with the
# same separator. Empty pages are skipped to avoid stray separators.
offers = "\n\n---NEW ITEM---".join(page for page in offer_pages if page)


In [28]:
# Bundle the scraped text with a label for its source.
offers_document = {
    "content": offers,
    "source": "Stände",
}

In [29]:
# Display the assembled document to inspect the scraped content.
offers_document

{'content': '01\nGenusskonzept GmbH & Co.KG\nA Freitag, B Samstag, C Sonntag, D Montag\nA Freitag, B Samstag, C Sonntag, D Montag\nAuf dem Kelterplatz\n\nAngebot\nImbissgerichte, schwäbisch\n\n---NEW ITEM---02\nGenusskonzept GmbH & Co.KG\nA Freitag, B Samstag, C Sonntag, D Montag, Gewölbekeller, Musik\nA Freitag, B Samstag, C Sonntag, D Montag, Gewölbekeller, Musik\nFASSKELLER\nAngebot\nGetränke\nMusik\nBands und DJ\n\n---NEW ITEM---03\nChorgemeinschaft Besigheim\nC Sonntag\nC Sonntag\nVereinszimmer Alte Kelter\nAngebot\nKaffee + Kuchen von 14 bis 17 Uhr\n\n---NEW ITEM---04\nTSV Ottmarsheim, EK Besigheim Handball e.V., RSV Besigheim, HHC Walheim-Besigheim, Spvgg Besigheim, Chorgemeinschaft Besigheim\nA Freitag, B Samstag, Bühne, C Sonntag, D Montag, Musik, Weinprobierstand\nA Freitag, B Samstag, Bühne, C Sonntag, D Montag, Musik, Weinprobierstand\nWeindorf unterm Schirm\nKelterplatz\nAngebot\n6 Weinstände mit unterschiedlichen Weinen der Felsengartenkellerei + Champagner aus Ay\nMusik\

In [30]:
import json

# Persist the scraped offers twice: wrapped with metadata (JSON) and raw (text).
json_path = 'data/scraped_offers.json'
text_path = 'data/scraped_offers.txt'

# ensure_ascii=False keeps non-ASCII characters (e.g. umlauts) readable in the file.
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(offers_document, json_file, ensure_ascii=False, indent=2)

# The text file holds only the scraped string itself.
with open(text_path, 'w', encoding='utf-8') as text_file:
    text_file.write(offers)

# Report where the data went and how much was scraped.
summary_lines = [
    "Data saved to:",
    "- data/scraped_offers.json (structured format with metadata)",
    "- data/scraped_offers.txt (plain text format)",
    f"Total characters scraped: {len(offers)}",
]
print("\n".join(summary_lines))

Data saved to:
- data/scraped_offers.json (structured format with metadata)
- data/scraped_offers.txt (plain text format)
Total characters scraped: 11614


# Loading PDF files and converting them to text

In [31]:
from pypdf import PdfReader

In [32]:
# Read the program PDF and collect the extracted text page by page.
reader = PdfReader("data/programm_focused.pdf")

page_texts = []
for page in reader.pages:
    text = page.extract_text()
    # Skip pages for which nothing was extracted.
    if text:
        page_texts.append(text)

# Join pages with a newline so the last line of one page does not fuse with
# the first line of the next.
program = "\n".join(page_texts)


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)


In [35]:
# Bundle the extracted program text with a label for its source.
program_document = {
    "content": program,
    "source": "Programm",
}

In [None]:
# Save program as JSON (structured format).
# Written to data/ rather than ../data/: the PDF above is read from data/ and
# the scraped offers are saved to data/, so all artefacts of this notebook end
# up in the same directory.
# NOTE(review): if the notebook has since moved into a subfolder, the other
# cells' data/ paths are the stale ones instead — confirm.
with open('data/program.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(program_document, f, ensure_ascii=False, indent=2)

# Save program as plain text (easy to read)
with open('data/program.txt', 'w', encoding='utf-8') as f:
    f.write(program)