# Parser

In [2]:
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

# Path of the HTML file that is parsed below.
input_file = 'input_data/bookings.html'

# Read the whole document as UTF-8 text; the context manager closes the file.
with open(input_file, 'r', encoding='utf-8') as infile:
    input_text = infile.read()

# Build the parse tree using the lxml backend.
soup = BeautifulSoup(input_text, 'lxml')

# Collect every <div> carrying this class (presumably one per booking; the
# class token looks auto-generated, so verify it if the page markup changes).
# `find_all` is the current method name; `findAll` is a deprecated alias.
bookings = soup.find_all('div', {'class': 'e83c0a1d59'})

## Extract Bookings

In [3]:
# One row per booking, in the order:
# [place, hotel, date_from, date_to, rooms, status, price, link]
bookings_overview = []
for booking in bookings:
    # Hotel name: text of the first <h3> inside the booking element.
    hotel = booking.find('h3').text

    # Container whose <span> children hold the date range, optionally the
    # number of rooms, and the place.
    date_rooms_place = booking.find('div', {'class': 'f660aace8b a914c714f2'})
    spans = date_rooms_place.find_all('span')

    date = spans[0].text
    place = spans[-1].text  # always the last span
    # A room-count span is only present when there are more than two spans;
    # its first word is taken as the count. Otherwise a single room is
    # assumed. Both cases yield text so the column has one consistent type.
    rooms = spans[1].text.split()[0] if len(spans) > 2 else '1'

    # Drop every whitespace character (including non-breaking spaces) and
    # split the range on the en dash, so each half matches '%d%b%Y'.
    # NOTE: %b matches month abbreviations of the current locale.
    date_from_str, date_to_str = ''.join(date.split()).split('–')
    date_from = datetime.strptime(date_from_str, '%d%b%Y')
    date_to = datetime.strptime(date_to_str, '%d%b%Y')

    # The status is read from the node directly following the container;
    # the link is the href of the first <a> in the booking element.
    status = date_rooms_place.next_sibling.text
    link = booking.find('a').get('href')

    # In the test environment the currency designation follows the number;
    # in the real application it is the other way round. Taking the first
    # token that contains a digit handles both orders without having to
    # switch an index by hand; if no token has a digit, fall back to the
    # first token.
    price_tokens = booking.find('div', {'class': 'f6431b446c f660aace8b'}).text.split()
    price = next(
        (token for token in price_tokens if any(ch.isdigit() for ch in token)),
        price_tokens[0],
    )

    # Collect the row; both dates are stored as ISO 'YYYY-MM-DD' strings.
    bookings_overview.append([
        place,
        hotel,
        date_from.strftime('%Y-%m-%d'),
        date_to.strftime('%Y-%m-%d'),
        rooms,
        status,
        price,
        link,
    ])

## Store Bookings to CSV

In [4]:
# Turn the collected rows into a table (column headers are German) and write
# it to CSV without the index column.
bookings_df = pd.DataFrame(
    data=bookings_overview,
    columns=['Ort', 'Hotel', 'Von', 'Bis', 'Zimmer', 'Status', 'Preis', 'Link'],
)
bookings_df.to_csv(
    path_or_buf='output_data/bookings_overview.csv',
    index=False,
)