Merge pull request #11 from K0IN/main
Removed the need for Selenium in bundesanzeiger, shipping less and more portable code
LilithWittmann committed Aug 11, 2021
2 parents cf37b83 + 0e3c280 commit 067828c
Showing 6 changed files with 163 additions and 118 deletions.
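In short, the headless-Chrome flow is replaced by a plain `requests.Session` plus BeautifulSoup parsing, so the package no longer needs a browser at all. The public `get_reports` API is unchanged; a minimal usage sketch against the new implementation:

```python
from deutschland import Bundesanzeiger

# No browser or webdriver needed anymore: a requests.Session performs
# the search and a BeautifulSoup pass extracts each report.
ba = Bundesanzeiger()
reports = ba.get_reports("Deutsche Bahn AG")
for name, entry in reports.items():
    # each entry is a Report.to_dict(): date, name, company, report
    print(name, entry["date"], entry["company"])
```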
26 changes: 26 additions & 0 deletions .github/workflows/runtests.yml
@@ -0,0 +1,26 @@
name: Run Python 🐍 tests

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.7, 3.8, 3.9]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest
          pip install .
      - name: Test with pytest
        run: |
          pytest
3 changes: 0 additions & 3 deletions README.md
@@ -30,8 +30,6 @@ print(data["Adresse"][0])
### Bundesanzeiger
Get financial reports for all German companies that report to the Bundesanzeiger.

*Please note: For now [Google Chrome](https://www.google.com/chrome/) is required in order to use the Bundesanzeiger API. Please feel free to add [support for other browsers](https://github.com/SergeyPirogov/webdriver_manager).*

```python
from deutschland import Bundesanzeiger
ba = Bundesanzeiger()
@@ -42,4 +40,3 @@ print(data.keys())
# dict_keys(['Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020', 'Konzernabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020\nErgänzung der Veröffentlichung vom 04.06.2021',
```
*Big thanks to Nico Duldhardt and Friedrich Schöne, who [supported this implementation with their machine learning model](https://av.tib.eu/media/52366).*
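The new constructor also accepts an `on_captcha_callback` argument, so callers can plug in their own captcha solver and skip loading the bundled TensorFlow model entirely. A minimal sketch, where `my_captcha_solver` is a hypothetical placeholder that receives the raw captcha image bytes and must return the decoded string:

```python
from deutschland import Bundesanzeiger

def my_captcha_solver(image_data: bytes) -> str:
    # Hypothetical solver: hand the image bytes to manual entry,
    # an external service, or your own model, and return the text.
    ...

ba = Bundesanzeiger(on_captcha_callback=my_captcha_solver)
data = ba.get_reports("Deutsche Bahn AG")
```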

240 changes: 129 additions & 111 deletions deutschland/bundesanzeiger/bundesanzeiger.py
@@ -2,141 +2,159 @@
import requests

import dateparser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class Report:
__slots__ = ["date", "name", "content_url", "company", "report"]

def __init__(self, date, name, content_url, company, report=None):
self.date = date
self.name = name
self.content_url = content_url
self.company = company
self.report = report

from deutschland.bundesanzeiger.model import (
load_image_arr,
load_model,
prediction_to_str,
)
from webdriver_manager.chrome import ChromeDriverManager
def to_dict(self):
return {
"date": self.date,
"name": self.name,
"company": self.company,
"report": self.report,
}


class Bundesanzeiger:
requests_settings = {
"headers": {
"Connection": "keep-alive",
"sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
"sec-ch-ua-mobile": "?0",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
"Referer": "https://www.bundesanzeiger.de/",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
},
"timeout": 1,
}

def __init__(self):
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")

self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
self.model = load_model()

def __solve_captcha(self, image):
image = BytesIO(image)
image_arr = load_image_arr(image)
__slots__ = ["session", "model", "captcha_callback"]

def __init__(self, on_captcha_callback=None):
self.session = requests.Session()
if on_captcha_callback:
self.captcha_callback = on_captcha_callback
else:
import deutschland.bundesanzeiger.model

self.model = deutschland.bundesanzeiger.model.load_model()
self.captcha_callback = self.__solve_captcha

def __solve_captcha(self, image_data: bytes):
import deutschland.bundesanzeiger.model

image = BytesIO(image_data)
image_arr = deutschland.bundesanzeiger.model.load_image_arr(image)
image_arr = image_arr.reshape((1, 50, 250, 1))

prediction = self.model.predict(image_arr)[0]
prediction_str = prediction_to_str(prediction)
prediction_str = deutschland.bundesanzeiger.model.prediction_to_str(prediction)

return prediction_str

def __get_report(self, itm):
report = {
"date": dateparser.parse(
itm.find_element(By.CLASS_NAME, "date").text, languages=["de"]
),
"name": itm.find_element(By.CLASS_NAME, "info")
.find_element(By.XPATH, "a[1]")
.text,
"company": itm.find_element(By.CLASS_NAME, "first").text,
"report": None,
}
def __is_captcha_needed(self, entry_content: str):
soup = BeautifulSoup(entry_content, "html.parser")
return not bool(soup.find("div", {"class": "publication_container"}))

def __find_all_entries_on_page(self, page_content: str):
soup = BeautifulSoup(page_content, "html.parser")
wrapper = soup.find("div", {"class": "result_container"})
rows = wrapper.find_all("div", {"class": "row"})
for row in rows:
info_element = row.find("div", {"class": "info"})
if not info_element:
continue

itm.find_element(By.CLASS_NAME, "info").find_element(By.XPATH, "a[1]").click()
try:
image_src = self.driver.find_element(
By.CLASS_NAME, "captcha_wrapper"
).get_attribute("innerHTML")
bs_src = BeautifulSoup(image_src, "html.parser")
captcha_img_link = bs_src.find(lambda tag: tag.name == "img")["src"]
cookies = {}
for cookie in self.driver.get_cookies():
cookies[cookie["name"]] = cookie["value"]
captcha_img_resp = requests.get(
captcha_img_link, cookies=cookies, **self.requests_settings
)
captcha = self.__solve_captcha(captcha_img_resp.content)
# send captcha
captcha_field = self.driver.find_element(By.XPATH, '//*[@name="solution"]')
captcha_field.send_keys(captcha)
captcha_field.send_keys(Keys.ENTER)
except NoSuchElementException:
pass

try:
# extract html
report["report"] = self.driver.find_element(
By.CLASS_NAME, "publication_container"
).get_attribute("innerHTML")
except Exception:
return None
self.driver.find_element(
By.XPATH,
'//*[@id="content"]/section/div/div/div/div/div[1]/div[2]/div/div[1]/a',
).click()
return report

def __iterate_trough_search_results(self, found_reports=None):
link_element = info_element.find("a")
if not link_element:
continue

entry_link = link_element.get("href")
entry_name = link_element.contents[0].strip()

date_element = row.find("div", {"class": "date"})
if not date_element:
continue

date = dateparser.parse(date_element.contents[0], languages=["de"])

company_name_element = row.find("div", {"class": "first"})
if not company_name_element:
continue

company_name = company_name_element.contents[0].strip()

yield Report(date, entry_name, entry_link, company_name)

def __generate_result(self, content: str):
"""iterate trough all results and try to fetch single reports"""
if found_reports is None:
found_reports = {}
for itm in self.driver.find_elements(
By.XPATH, '//*[@id="content"]/section[2]/div/div/div/div/div[6]/div'
):
document_name = None
try:
document_name = itm.find_element(By.CLASS_NAME, "info").text
except NoSuchElementException:
result = {}
for element in self.__find_all_entries_on_page(content):
get_element_response = self.session.get(element.content_url)

if self.__is_captcha_needed(get_element_response.text):
soup = BeautifulSoup(get_element_response.text, "html.parser")
captcha_image_src = soup.find("div", {"class": "captcha_wrapper"}).find(
"img"
)["src"]
img_response = self.session.get(captcha_image_src)
captcha_result = self.captcha_callback(img_response.content)
captcha_endpoint_url = soup.find_all("form")[1]["action"]
get_element_response = self.session.post(
captcha_endpoint_url,
data={"solution": captcha_result, "confirm-button": "OK"},
)

content_soup = BeautifulSoup(get_element_response.text, "html.parser")
content_element = content_soup.find(
"div", {"class": "publication_container"}
)

if not content_element:
continue

if document_name and document_name not in found_reports:
report = self.__get_report(itm)
if report:
found_reports[document_name] = report
return self.__iterate_trough_search_results(found_reports)
return found_reports
element.report = content_element.text
result[element.name] = element.to_dict()

return result

def get_reports(self, company_name: str):
"""
fetch all reports for this company name
:param company_name:
:return: Dict of all reports
"""
self.driver.get("https://www.bundesanzeiger.de/ebanzwww/wexsservlet")
elem = self.driver.find_element_by_id("cc_all")
elem.click()
elem = self.driver.find_element_by_id("id3")
elem.send_keys(company_name)
elem.send_keys(Keys.ENTER)
return self.__iterate_trough_search_results()
self.session.cookies["cc"] = "1628606977-805e172265bfdbde-10"
self.session.headers.update(
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "www.bundesanzeiger.de",
"Pragma": "no-cache",
"Referer": "https://www.bundesanzeiger.de/",
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
}
)
# get the jsessionid cookie
response = self.session.get("https://www.bundesanzeiger.de")
# go to the start page
response = self.session.get("https://www.bundesanzeiger.de/pub/de/start?0")
# perform the search
response = self.session.get(
f"https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form=&fulltext={company_name}&area_select=&search_button=Suchen"
)
return self.__generate_result(response.text)


if __name__ == "__main__":
ba = Bundesanzeiger()
reports = ba.get_reports("Deutsche Bahn AG")
print(reports.keys(), len(reports))
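The captcha handling in `__generate_result` is the heart of the change: an entry page that lacks a `publication_container` div is a captcha page, so the code downloads the image from the `captcha_wrapper` div, runs the callback, and POSTs the solution to the second form's action URL. A standalone sketch of that handshake, with `fetch_entry` as a hypothetical helper and assuming a session that already carries the cookies and headers set up in `get_reports`:

```python
import requests
from bs4 import BeautifulSoup

def fetch_entry(session: requests.Session, entry_url: str, solve) -> str:
    """Hypothetical helper mirroring __generate_result's captcha round-trip."""
    response = session.get(entry_url)
    soup = BeautifulSoup(response.text, "html.parser")
    if not soup.find("div", {"class": "publication_container"}):
        # Captcha page: fetch the image, solve it, and POST the solution
        # (plus the confirm button value) to the second form's action URL.
        img_src = soup.find("div", {"class": "captcha_wrapper"}).find("img")["src"]
        solution = solve(session.get(img_src).content)
        response = session.post(
            soup.find_all("form")[1]["action"],
            data={"solution": solution, "confirm-button": "OK"},
        )
        soup = BeautifulSoup(response.text, "html.parser")
    container = soup.find("div", {"class": "publication_container"})
    return container.text if container else ""
```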
3 changes: 1 addition & 2 deletions deutschland/bundesanzeiger/model.py
@@ -5,8 +5,6 @@
import numpy as np
from PIL import Image

ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")


def load_image_arr(fp):
image = Image.open(fp).convert("L")
@@ -17,6 +15,7 @@ def load_image_arr(fp):


def character_indexes_to_str(character_indexes):
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")
characters = np.array(ALPHABET)[character_indexes]
return "".join(list(characters)).upper()

2 changes: 0 additions & 2 deletions pyproject.toml
@@ -15,12 +15,10 @@ mapbox-vector-tile = "^1.2.1"
requests = "^2.26.0"
dateparser = "^1.0.0"
gql = "^2.0.0"
selenium = "^3.141.0"
boto3 = "^1.18.9"
slugify = "^0.0.1"
tensorflow = "^2.5.0"
Pillow = "^8.3.1"
webdriver-manager = "^3.4.2"
beautifulsoup4 = "^4.9.3"

[tool.poetry.urls]
7 changes: 7 additions & 0 deletions tests/integration_test.py
@@ -0,0 +1,7 @@
from deutschland import Bundesanzeiger


def test_for_no_data_deutsche_bahn_ag():
ba = Bundesanzeiger()
data = ba.get_reports("Deutsche Bahn AG")
assert len(data.keys()) > 0, "Found no reports for Deutsche Bahn AG"
