Merge pull request #11 from K0IN/main
Removed the need for Selenium in bundesanzeiger, shipping less and more portable code
LilithWittmann committed Aug 11, 2021
2 parents cf37b83 + 0e3c280 commit 067828c
Showing 6 changed files with 163 additions and 118 deletions.
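In short, the headless-Chrome flow is replaced by a plain `requests.Session` plus BeautifulSoup parsing, so the package no longer needs a browser at all. The public `get_reports` API is unchanged; a minimal usage sketch against the new implementation:

```python
from deutschland import Bundesanzeiger

# No browser or webdriver needed anymore: a requests.Session performs
# the search and a BeautifulSoup pass extracts each report.
ba = Bundesanzeiger()
reports = ba.get_reports("Deutsche Bahn AG")
for name, entry in reports.items():
    # each entry is a Report.to_dict(): date, name, company, report
    print(name, entry["date"], entry["company"])
```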
26 changes: 26 additions & 0 deletions .github/workflows/runtests.yml
@@ -0,0 +1,26 @@
name: Run Python 🐍 tests

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.7, 3.8, 3.9]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest
          pip install .
      - name: Test with pytest
        run: |
          pytest
3 changes: 0 additions & 3 deletions README.md
@@ -30,8 +30,6 @@ print(data["Adresse"][0])
### Bundesanzeiger
Get financial reports for all German companies that report to the Bundesanzeiger.

*Please note: For now [Google Chrome](https://www.google.com/chrome/) is required in order to use the Bundesanzeiger API. Please feel free to add [support for other browsers](https://github.com/SergeyPirogov/webdriver_manager).*

```python
from deutschland import Bundesanzeiger
ba = Bundesanzeiger()
@@ -42,4 +40,3 @@ print(data.keys())
# dict_keys(['Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020', 'Konzernabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020\nErgänzung der Veröffentlichung vom 04.06.2021',
```
*Big thanks to Nico Duldhardt and Friedrich Schöne, who [supported this implementation with their machine learning model](https://av.tib.eu/media/52366).*
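The new constructor also accepts an `on_captcha_callback` argument, so callers can plug in their own captcha solver and skip loading the bundled TensorFlow model entirely. A minimal sketch, where `my_captcha_solver` is a hypothetical placeholder that receives the raw captcha image bytes and must return the decoded string:

```python
from deutschland import Bundesanzeiger

def my_captcha_solver(image_data: bytes) -> str:
    # Hypothetical solver: hand the image bytes to manual entry,
    # an external service, or your own model, and return the text.
    ...

ba = Bundesanzeiger(on_captcha_callback=my_captcha_solver)
data = ba.get_reports("Deutsche Bahn AG")
```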

240 changes: 129 additions & 111 deletions deutschland/bundesanzeiger/bundesanzeiger.py
@@ -2,141 +2,159 @@
import requests

import dateparser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class Report:
__slots__ = ["date", "name", "content_url", "company", "report"]

def __init__(self, date, name, content_url, company, report=None):
self.date = date
self.name = name
self.content_url = content_url
self.company = company
self.report = report

from deutschland.bundesanzeiger.model import (
load_image_arr,
load_model,
prediction_to_str,
)
from webdriver_manager.chrome import ChromeDriverManager
def to_dict(self):
return {
"date": self.date,
"name": self.name,
"company": self.company,
"report": self.report,
}


class Bundesanzeiger:
requests_settings = {
"headers": {
"Connection": "keep-alive",
"sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
"sec-ch-ua-mobile": "?0",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
"Referer": "https://www.bundesanzeiger.de/",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
},
"timeout": 1,
}

def __init__(self):
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")

self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
self.model = load_model()

def __solve_captcha(self, image):
image = BytesIO(image)
image_arr = load_image_arr(image)
__slots__ = ["session", "model", "captcha_callback"]

def __init__(self, on_captcha_callback=None):
self.session = requests.Session()
if on_captcha_callback:
self.captcha_callback = on_captcha_callback
else:
import deutschland.bundesanzeiger.model

self.model = deutschland.bundesanzeiger.model.load_model()
self.captcha_callback = self.__solve_captcha

def __solve_captcha(self, image_data: bytes):
import deutschland.bundesanzeiger.model

image = BytesIO(image_data)
image_arr = deutschland.bundesanzeiger.model.load_image_arr(image)
image_arr = image_arr.reshape((1, 50, 250, 1))

prediction = self.model.predict(image_arr)[0]
prediction_str = prediction_to_str(prediction)
prediction_str = deutschland.bundesanzeiger.model.prediction_to_str(prediction)

return prediction_str

def __get_report(self, itm):
report = {
"date": dateparser.parse(
itm.find_element(By.CLASS_NAME, "date").text, languages=["de"]
),
"name": itm.find_element(By.CLASS_NAME, "info")
.find_element(By.XPATH, "a[1]")
.text,
"company": itm.find_element(By.CLASS_NAME, "first").text,
"report": None,
}
def __is_captcha_needed(self, entry_content: str):
soup = BeautifulSoup(entry_content, "html.parser")
return not bool(soup.find("div", {"class": "publication_container"}))

def __find_all_entries_on_page(self, page_content: str):
soup = BeautifulSoup(page_content, "html.parser")
wrapper = soup.find("div", {"class": "result_container"})
rows = wrapper.find_all("div", {"class": "row"})
for row in rows:
info_element = row.find("div", {"class": "info"})
if not info_element:
continue

itm.find_element(By.CLASS_NAME, "info").find_element(By.XPATH, "a[1]").click()
try:
image_src = self.driver.find_element(
By.CLASS_NAME, "captcha_wrapper"
).get_attribute("innerHTML")
bs_src = BeautifulSoup(image_src, "html.parser")
captcha_img_link = bs_src.find(lambda tag: tag.name == "img")["src"]
cookies = {}
for cookie in self.driver.get_cookies():
cookies[cookie["name"]] = cookie["value"]
captcha_img_resp = requests.get(
captcha_img_link, cookies=cookies, **self.requests_settings
)
captcha = self.__solve_captcha(captcha_img_resp.content)
# send captcha
captcha_field = self.driver.find_element(By.XPATH, '//*[@name="solution"]')
captcha_field.send_keys(captcha)
captcha_field.send_keys(Keys.ENTER)
except NoSuchElementException:
pass

try:
# extract html
report["report"] = self.driver.find_element(
By.CLASS_NAME, "publication_container"
).get_attribute("innerHTML")
except Exception:
return None
self.driver.find_element(
By.XPATH,
'//*[@id="content"]/section/div/div/div/div/div[1]/div[2]/div/div[1]/a',
).click()
return report

def __iterate_trough_search_results(self, found_reports=None):
link_element = info_element.find("a")
if not link_element:
continue

entry_link = link_element.get("href")
entry_name = link_element.contents[0].strip()

date_element = row.find("div", {"class": "date"})
if not date_element:
continue

date = dateparser.parse(date_element.contents[0], languages=["de"])

company_name_element = row.find("div", {"class": "first"})
if not company_name_element:
continue

company_name = company_name_element.contents[0].strip()

yield Report(date, entry_name, entry_link, company_name)

def __generate_result(self, content: str):
"""iterate trough all results and try to fetch single reports"""
if found_reports is None:
found_reports = {}
for itm in self.driver.find_elements(
By.XPATH, '//*[@id="content"]/section[2]/div/div/div/div/div[6]/div'
):
document_name = None
try:
document_name = itm.find_element(By.CLASS_NAME, "info").text
except NoSuchElementException:
result = {}
for element in self.__find_all_entries_on_page(content):
get_element_response = self.session.get(element.content_url)

if self.__is_captcha_needed(get_element_response.text):
soup = BeautifulSoup(get_element_response.text, "html.parser")
captcha_image_src = soup.find("div", {"class": "captcha_wrapper"}).find(
"img"
)["src"]
img_response = self.session.get(captcha_image_src)
captcha_result = self.captcha_callback(img_response.content)
captcha_endpoint_url = soup.find_all("form")[1]["action"]
get_element_response = self.session.post(
captcha_endpoint_url,
data={"solution": captcha_result, "confirm-button": "OK"},
)

content_soup = BeautifulSoup(get_element_response.text, "html.parser")
content_element = content_soup.find(
"div", {"class": "publication_container"}
)

if not content_element:
continue

if document_name and document_name not in found_reports:
report = self.__get_report(itm)
if report:
found_reports[document_name] = report
return self.__iterate_trough_search_results(found_reports)
return found_reports
element.report = content_element.text
result[element.name] = element.to_dict()

return result

def get_reports(self, company_name: str):
"""
fetch all reports for this company name
:param company_name:
:return: Dict of all reports
"""
self.driver.get("https://www.bundesanzeiger.de/ebanzwww/wexsservlet")
elem = self.driver.find_element_by_id("cc_all")
elem.click()
elem = self.driver.find_element_by_id("id3")
elem.send_keys(company_name)
elem.send_keys(Keys.ENTER)
return self.__iterate_trough_search_results()
self.session.cookies["cc"] = "1628606977-805e172265bfdbde-10"
self.session.headers.update(
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "www.bundesanzeiger.de",
"Pragma": "no-cache",
"Referer": "https://www.bundesanzeiger.de/",
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
}
)
# get the jsessionid cookie
response = self.session.get("https://www.bundesanzeiger.de")
# go to the start page
response = self.session.get("https://www.bundesanzeiger.de/pub/de/start?0")
# perform the search
response = self.session.get(
f"https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form=&fulltext={company_name}&area_select=&search_button=Suchen"
)
return self.__generate_result(response.text)


if __name__ == "__main__":
ba = Bundesanzeiger()
reports = ba.get_reports("Deutsche Bahn AG")
print(reports.keys(), len(reports))
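The captcha handling in `__generate_result` is the heart of the change: an entry page that lacks a `publication_container` div is a captcha page, so the code downloads the image from the `captcha_wrapper` div, runs the callback, and POSTs the solution to the second form's action URL. A standalone sketch of that handshake, with `fetch_entry` as a hypothetical helper and assuming a session that already carries the cookies and headers set up in `get_reports`:

```python
import requests
from bs4 import BeautifulSoup

def fetch_entry(session: requests.Session, entry_url: str, solve) -> str:
    """Hypothetical helper mirroring __generate_result's captcha round-trip."""
    response = session.get(entry_url)
    soup = BeautifulSoup(response.text, "html.parser")
    if not soup.find("div", {"class": "publication_container"}):
        # Captcha page: fetch the image, solve it, and POST the solution
        # (plus the confirm button value) to the second form's action URL.
        img_src = soup.find("div", {"class": "captcha_wrapper"}).find("img")["src"]
        solution = solve(session.get(img_src).content)
        response = session.post(
            soup.find_all("form")[1]["action"],
            data={"solution": solution, "confirm-button": "OK"},
        )
        soup = BeautifulSoup(response.text, "html.parser")
    container = soup.find("div", {"class": "publication_container"})
    return container.text if container else ""
```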
3 changes: 1 addition & 2 deletions deutschland/bundesanzeiger/model.py
@@ -5,8 +5,6 @@
import numpy as np
from PIL import Image

ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")


def load_image_arr(fp):
image = Image.open(fp).convert("L")
@@ -17,6 +15,7 @@ def load_image_arr(fp):


def character_indexes_to_str(character_indexes):
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")
characters = np.array(ALPHABET)[character_indexes]
return "".join(list(characters)).upper()

2 changes: 0 additions & 2 deletions pyproject.toml
@@ -15,12 +15,10 @@ mapbox-vector-tile = "^1.2.1"
requests = "^2.26.0"
dateparser = "^1.0.0"
gql = "^2.0.0"
selenium = "^3.141.0"
boto3 = "^1.18.9"
slugify = "^0.0.1"
tensorflow = "^2.5.0"
Pillow = "^8.3.1"
webdriver-manager = "^3.4.2"
beautifulsoup4 = "^4.9.3"

[tool.poetry.urls]
7 changes: 7 additions & 0 deletions tests/integration_test.py
@@ -0,0 +1,7 @@
from deutschland import Bundesanzeiger


def test_for_no_data_deutsche_bahn_ag():
ba = Bundesanzeiger()
data = ba.get_reports("Deutsche Bahn AG")
assert len(data.keys()) > 0, "Found no reports for Deutsche Bahn AG"
