Skip to content

Commit

Permalink
Merge pull request #29 from auchtetraborat/main
Browse files Browse the repository at this point in the history
Integration of VERENA-scraper & remove misplaced comment in Lebensmittelwarnung
  • Loading branch information
LilithWittmann committed Oct 7, 2021
2 parents cac5953 + 91f05e9 commit 9846d23
Show file tree
Hide file tree
Showing 14 changed files with 635 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runtests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ jobs:
pip install .
- name: Test with pytest
run: |
pytest
pytest
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ print(data)
# [{'id': 19601, 'guid': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/detail/lebensmittel/19601', 'pubDate': 'Fri, 10 Feb 2017 12:28:45 +0000', 'imgSrc': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/opensaga/attachment/979f8cd3-969e-4a6c-9a8e-4bdd61586cd4/data.jpg', 'title': 'Sidroga Bio Säuglings- und Kindertee', 'manufacturer': 'Lebensmittel', 'warning': 'Pyrrolizidinalkaloide', 'affectedStates': ['Baden-Württemberg', '...']}]
```

## Federal Job Openings

### NRW

#### VERENA
Get open substitute teaching positions in NRW from https://www.schulministerium.nrw.de/BiPo/Verena/angebote
```python
from deutschland import Verena
v = Verena()
data = v.get()
print(data)
# a full example of the data can be found at deutschland/verena/example.md
# [{ "school_id": "99999", "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen", "replacement_job_title": "Lehrkraft", "subjects": [ "Fach 1", "Fach 2" ], "comments": "Bemerkung zur Stelle: Testbemerkung", "duration": "01.01.2021 - 01.01.2022", ...} ...]


## Autobahn

Get data from the Autobahn.
Expand Down
1 change: 1 addition & 0 deletions deutschland/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
from .bundesanzeiger.bundesanzeiger import Bundesanzeiger
from .handelsregister.handelsregister import Handelsregister
from .lebensmittelwarnung.lebensmittelwarnung import Lebensmittelwarnung
from .verena.verena import Verena
from .bundesnetzagentur import *
1 change: 0 additions & 1 deletion deutschland/lebensmittelwarnung/lebensmittelwarnung.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,5 @@ def get(

if __name__ == "__main__":
lw = Lebensmittelwarnung()
# res = hr.search(keywords="Deutsche Bahn Aktiengesellschaft", keyword_match_option=3)
res = lw.get()
print(res)
Empty file added deutschland/verena/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions deutschland/verena/example.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
### Scraper for https://www.schulministerium.nrw.de/BiPo/Verena/online

```json
{
"school_id": "99999",
"desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen",
"replacement_job_title": "Lehrkraft",
"subjects": [
"Fach 1",
"Fach 2"
],
"replacement_job_type_raw": "Vertretung für",
"replacement_job_type": "Vertretung",
"comments": "Bemerkung zur Stelle: Testbemerkung",
"duration": "01.01.2021 - 01.01.2022",
"hours_per_week": "13,5",
"contact": {
"phone": "0172 1111 1111",
"fax": "0172 2222 2222",
"homepage": "http://www.eine-schule.de",
"mail": {
"raw": "mailto:bewerbung@eineschule.de?subject=Stellenausschreibung in VERENA",
"adress": "bewerbung@eineschule.de",
"subject": "Stellenausschreibung in VERENA"
}
},
"deadline": "17.09.2021",
"geolocation": {
"coord_system": "epsg:25832",
"coordinates": [1111111, 1111111],
"post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen"
}
}
```
28 changes: 28 additions & 0 deletions deutschland/verena/verena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from deutschland.verena.verenadownloader import VerenaDownloader
from deutschland.verena.verenaextractor import VerenaExtractor
import json


class Verena:
    """
    Downloads and extracts the current job listings from the VERENA portal.
    """

    def get(self):
        """
        Download and extract the current job listings from the VERENA portal.

        Returns:
            list[dict]: One dict per job offering. An example of the format
            can be found at ./example.md.
        """
        result = []
        # Each scraped page holds up to 100 offerings; flatten all pages
        # into a single list (extend avoids rebuilding the list each round).
        for page in VerenaDownloader().scrape():
            result.extend(VerenaExtractor(page).extract())
        return result


if __name__ == "__main__":
    # Manual smoke test: dump all current job offerings as JSON.
    print(json.dumps(Verena().get()))
142 changes: 142 additions & 0 deletions deutschland/verena/verenadownloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import requests
import math
from bs4 import BeautifulSoup
from typing import Tuple, List


class VerenaDownloader:
    """
    Downloads all pages (each containing up to 100 job offerings) of the
    VERENA portal and returns their raw HTML source.
    """

    BASE_URL = "https://www.schulministerium.nrw.de"
    # The portal serves at most 100 offerings per listing page ("blocksize" b100).
    BLOCK_SIZE = 100

    def __init__(self):
        # A single session reuses the backend's session cookie, which is
        # required for the generated search id (suchid) to stay valid.
        self.session = requests.Session()

    def __scrape_landing_page(self) -> Tuple[int, str, str]:
        """Returns (job_openings_count: int, access_listing_url_part: str, access_listing_action_id: str)
        Example: (513, "/BiPo/Verena/angebote?action=595.1764087184088", "595.1764087184088")
        Scrapes the VERENA landing page to get a session cookie, the matching
        action id to access the listing view and the count of job offerings.

        Raises:
            RuntimeError: if the expected link is not present on the landing
                page (e.g. the portal's markup changed).
        """
        landing_url = self.BASE_URL + "/BiPo/Verena"
        landing_request = self.session.get(landing_url)
        landing_soup = BeautifulSoup(landing_request.text, "html.parser")
        # find_all is the current bs4 spelling; findAll is a deprecated alias.
        links = landing_soup.find_all("a", {"title": "Zu den Stellenausschreibungen"})
        for link in links:
            if "Derzeit im Netz veröffentlichte Ausschreibungen:" in link.text:
                job_openings_count = link.find_next("strong").text
                access_listing_url_part = link["href"]
                # split action_id from listing_url_part
                access_listing_action_id = access_listing_url_part.replace(
                    "/BiPo/Verena/angebote?action=", ""
                )
                return (
                    int(job_openings_count),
                    access_listing_url_part,
                    access_listing_action_id,
                )
        # Previously this fell through and implicitly returned None, which made
        # the caller fail with an opaque tuple-unpacking TypeError.
        raise RuntimeError(
            "VERENA landing page did not contain the expected listing link"
        )

    def __scrape_listing_page_initial(
        self, access_listing_url_part: str
    ) -> Tuple[str, str, str]:
        """Returns (select_blocksize_url_part, search_id, select_blocksize_action_id)
        Example: ("/BiPo/Verena/angebote?action=509.9848906326322&block=b100&suchid=188736", "188736", "509.9848906326322")
        Scrapes the VERENA listing page to get a listing url with blocksize = 100
        and a valid suchid (search_id). suchid is generated by the backend and
        stores your search preferences.
        """
        listing_url = self.BASE_URL + access_listing_url_part
        listing_request = self.session.get(listing_url)
        listing_soup = BeautifulSoup(listing_request.text, "html.parser")
        blocksize_selector = listing_soup.find("div", id="blockauswahl")
        # The last anchor selects blocksize 100 and carries a fresh suchid.
        select_blocksize_url_part = blocksize_selector.find_all("a")[-1]["href"]
        search_id = select_blocksize_url_part.split("=")[-1]
        select_blocksize_action_id = select_blocksize_url_part.replace(
            "/BiPo/Verena/angebote?action=", ""
        ).split("&")[0]
        return select_blocksize_url_part, search_id, select_blocksize_action_id

    def __set_block_size(self, select_blocksize_url_part: str) -> None:
        """
        Run GET on the search id url so the backend stores blocksize = 100
        for all future requests of this session.
        """
        self.session.get(self.BASE_URL + select_blocksize_url_part)

    def __generate_all_listing_urls(
        self, action_id: str, search_id: str, opening_count: int
    ) -> List[str]:
        """Based on action_id, search_id and opening_count, generates a list of all listing urls.
        Example: [
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a1&suchid=188265",
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a2&suchid=188265"
            ...
        ]
        """
        # Template is loop-invariant, so build it once.
        listing_format_string = (
            self.BASE_URL + "/BiPo/Verena/angebote?action={0}&seite=a{1}&suchid={2}"
        )
        site_count = math.ceil(opening_count / self.BLOCK_SIZE)
        # Pages are 1-indexed (seite=a1 .. aN).
        return [
            listing_format_string.format(action_id, page, search_id)
            for page in range(1, site_count + 1)
        ]

    def __scrape_actual_listing(self, urls: List[str]) -> List[str]:
        """Downloads the job listing pages provided by 'urls' and returns their
        content as a list of sourcecodes.
        Example: [
            <html>...</html>
            <html>...</html>
        ]
        """
        return [self.session.get(url).text for url in urls]

    def scrape(self) -> List[str]:
        """Returns list of sourcecodes of all listing pages of the VERENA job listing portal.
        Example: [
            <html>...</html>
            <html>...</html>
        ]
        """
        (
            job_opening_count,
            access_listing_url_part,
            access_listing_action_id,
        ) = self.__scrape_landing_page()
        # select_blocksize_action_id is the action_id used to select the blocksize.
        # It is also reused to query the different pages of the job portal.
        (
            select_blocksize_url_part,
            search_id,
            select_blocksize_action_id,
        ) = self.__scrape_listing_page_initial(access_listing_url_part)
        self.__set_block_size(select_blocksize_url_part)
        all_listing_urls = self.__generate_all_listing_urls(
            select_blocksize_action_id, search_id, job_opening_count
        )
        return self.__scrape_actual_listing(all_listing_urls)


if __name__ == "__main__":
    # Manual smoke test: print the raw HTML of every listing page.
    print(VerenaDownloader().scrape())
Loading

0 comments on commit 9846d23

Please sign in to comment.