Removed the need for selenium in bundesanzeiger, to ship less and more portable code #11

Merged: 4 commits, Aug 11, 2021
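
This PR drops the Selenium/Chrome requirement from the Bundesanzeiger scraper in favour of a plain requests session parsed with BeautifulSoup, and it adds a CI workflow plus an integration test. A condensed sketch of the new request flow, simplified from the diff below (the real class additionally sets browser-like headers, a consent cookie, and a captcha-solving step):

```python
import requests
from bs4 import BeautifulSoup

session = requests.Session()
# establish the session (jsessionid cookie), then load the start page
session.get("https://www.bundesanzeiger.de")
session.get("https://www.bundesanzeiger.de/pub/de/start?0")
# perform the search and parse the result list
response = session.get(
    "https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form="
    "&fulltext=Deutsche+Bahn+AG&area_select=&search_button=Suchen"
)
soup = BeautifulSoup(response.text, "html.parser")
results = soup.find("div", {"class": "result_container"})
```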
26 changes: 26 additions & 0 deletions .github/workflows/runtests.yml
@@ -0,0 +1,26 @@
name: Run Python 🐍 tests

on: [push]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install .
- name: Test with pytest
run: |
pytest
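To reproduce the CI job locally, follow the same steps as the workflow: upgrade pip, install pytest and the package (`pip install pytest`, then `pip install .`), and run `pytest` from the repository root.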
3 changes: 0 additions & 3 deletions README.md
@@ -30,8 +30,6 @@ print(data["Adresse"][0])
### Bundesanzeiger
Get financial reports for all German companies that report to the Bundesanzeiger.

*Please note: For now [Google Chrome](https://www.google.com/chrome/) is required in order to use the Bundesanzeiger API. Please feel free to add [support for other browsers](https://github.com/SergeyPirogov/webdriver_manager).*

```python
from deutschland import Bundesanzeiger
ba = Bundesanzeiger()
@@ -42,4 +40,3 @@ print(data.keys())
# dict_keys(['Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020', 'Konzernabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020\nErgänzung der Veröffentlichung vom 04.06.2021',
```
*Big thanks to Nico Duldhardt and Friedrich Schöne, who [supported this implementation with their machine learning model](https://av.tib.eu/media/52366).*

240 changes: 129 additions & 111 deletions deutschland/bundesanzeiger/bundesanzeiger.py
@@ -2,141 +2,159 @@
import requests

import dateparser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class Report:
__slots__ = ["date", "name", "content_url", "company", "report"]

def __init__(self, date, name, content_url, company, report=None):
self.date = date
self.name = name
self.content_url = content_url
self.company = company
self.report = report

from deutschland.bundesanzeiger.model import (
load_image_arr,
load_model,
prediction_to_str,
)
from webdriver_manager.chrome import ChromeDriverManager
def to_dict(self):
return {
"date": self.date,
"name": self.name,
"company": self.company,
"report": self.report,
}


class Bundesanzeiger:
requests_settings = {
"headers": {
"Connection": "keep-alive",
"sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
"sec-ch-ua-mobile": "?0",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
"Referer": "https://www.bundesanzeiger.de/",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
},
"timeout": 1,
}

def __init__(self):
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")

self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
self.model = load_model()

def __solve_captcha(self, image):
image = BytesIO(image)
image_arr = load_image_arr(image)
__slots__ = ["session", "model", "captcha_callback"]

    def __init__(self, on_captcha_callback=None):
        self.session = requests.Session()
        if on_captcha_callback:
            self.captcha_callback = on_captcha_callback
else:
import deutschland.bundesanzeiger.model

self.model = deutschland.bundesanzeiger.model.load_model()
self.captcha_callback = self.__solve_captcha

def __solve_captcha(self, image_data: bytes):
import deutschland.bundesanzeiger.model

image = BytesIO(image_data)
image_arr = deutschland.bundesanzeiger.model.load_image_arr(image)
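        # reshape to (batch, height, width, channels): a single 50x250 grayscale captcha image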
image_arr = image_arr.reshape((1, 50, 250, 1))

prediction = self.model.predict(image_arr)[0]
prediction_str = prediction_to_str(prediction)
prediction_str = deutschland.bundesanzeiger.model.prediction_to_str(prediction)

return prediction_str

def __get_report(self, itm):
report = {
"date": dateparser.parse(
itm.find_element(By.CLASS_NAME, "date").text, languages=["de"]
),
"name": itm.find_element(By.CLASS_NAME, "info")
.find_element(By.XPATH, "a[1]")
.text,
"company": itm.find_element(By.CLASS_NAME, "first").text,
"report": None,
}
def __is_captcha_needed(self, entry_content: str):
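        # the publication_container div is only present once the captcha has been passed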
soup = BeautifulSoup(entry_content, "html.parser")
return not bool(soup.find("div", {"class": "publication_container"}))

def __find_all_entries_on_page(self, page_content: str):
soup = BeautifulSoup(page_content, "html.parser")
wrapper = soup.find("div", {"class": "result_container"})
rows = wrapper.find_all("div", {"class": "row"})
for row in rows:
info_element = row.find("div", {"class": "info"})
if not info_element:
continue

itm.find_element(By.CLASS_NAME, "info").find_element(By.XPATH, "a[1]").click()
try:
image_src = self.driver.find_element(
By.CLASS_NAME, "captcha_wrapper"
).get_attribute("innerHTML")
bs_src = BeautifulSoup(image_src, "html.parser")
captcha_img_link = bs_src.find(lambda tag: tag.name == "img")["src"]
cookies = {}
for cookie in self.driver.get_cookies():
cookies[cookie["name"]] = cookie["value"]
captcha_img_resp = requests.get(
captcha_img_link, cookies=cookies, **self.requests_settings
)
captcha = self.__solve_captcha(captcha_img_resp.content)
# send captcha
captcha_field = self.driver.find_element(By.XPATH, '//*[@name="solution"]')
captcha_field.send_keys(captcha)
captcha_field.send_keys(Keys.ENTER)
except NoSuchElementException:
pass

try:
# extract html
report["report"] = self.driver.find_element(
By.CLASS_NAME, "publication_container"
).get_attribute("innerHTML")
except Exception:
return None
self.driver.find_element(
By.XPATH,
'//*[@id="content"]/section/div/div/div/div/div[1]/div[2]/div/div[1]/a',
).click()
return report

def __iterate_trough_search_results(self, found_reports=None):
link_element = info_element.find("a")
if not link_element:
continue

entry_link = link_element.get("href")
entry_name = link_element.contents[0].strip()

date_element = row.find("div", {"class": "date"})
if not date_element:
continue

date = dateparser.parse(date_element.contents[0], languages=["de"])

company_name_element = row.find("div", {"class": "first"})
            if not company_name_element:
continue

company_name = company_name_element.contents[0].strip()

yield Report(date, entry_name, entry_link, company_name)

    def __generate_result(self, content: str):
        """iterate through all results and try to fetch single reports"""
if found_reports is None:
found_reports = {}
for itm in self.driver.find_elements(
By.XPATH, '//*[@id="content"]/section[2]/div/div/div/div/div[6]/div'
):
document_name = None
try:
document_name = itm.find_element(By.CLASS_NAME, "info").text
except NoSuchElementException:
result = {}
for element in self.__find_all_entries_on_page(content):
get_element_response = self.session.get(element.content_url)

if self.__is_captcha_needed(get_element_response.text):
soup = BeautifulSoup(get_element_response.text, "html.parser")
captcha_image_src = soup.find("div", {"class": "captcha_wrapper"}).find(
"img"
)["src"]
img_response = self.session.get(captcha_image_src)
captcha_result = self.captcha_callback(img_response.content)
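                # assume the captcha form is the second <form> on the page; post the solution to its action URL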
captcha_endpoint_url = soup.find_all("form")[1]["action"]
get_element_response = self.session.post(
captcha_endpoint_url,
data={"solution": captcha_result, "confirm-button": "OK"},
)

content_soup = BeautifulSoup(get_element_response.text, "html.parser")
content_element = content_soup.find(
"div", {"class": "publication_container"}
)

if not content_element:
continue

if document_name and document_name not in found_reports:
report = self.__get_report(itm)
if report:
found_reports[document_name] = report
return self.__iterate_trough_search_results(found_reports)
return found_reports
element.report = content_element.text
result[element.name] = element.to_dict()

return result

def get_reports(self, company_name: str):
"""
fetch all reports for this company name
:param company_name:
        :return: Dict of all reports
"""
self.driver.get("https://www.bundesanzeiger.de/ebanzwww/wexsservlet")
elem = self.driver.find_element_by_id("cc_all")
elem.click()
elem = self.driver.find_element_by_id("id3")
elem.send_keys(company_name)
elem.send_keys(Keys.ENTER)
return self.__iterate_trough_search_results()
self.session.cookies["cc"] = "1628606977-805e172265bfdbde-10"
self.session.headers.update(
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "www.bundesanzeiger.de",
"Pragma": "no-cache",
"Referer": "https://www.bundesanzeiger.de/",
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
}
)
# get the jsessionid cookie
response = self.session.get("https://www.bundesanzeiger.de")
# go to the start page
response = self.session.get("https://www.bundesanzeiger.de/pub/de/start?0")
# perform the search
response = self.session.get(
f"https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form=&fulltext={company_name}&area_select=&search_button=Suchen"
)
return self.__generate_result(response.text)


if __name__ == "__main__":
ba = Bundesanzeiger()
reports = ba.get_reports("Deutsche Bahn AG")
print(reports.keys())
print(reports.keys(), len(reports))
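
The new constructor also exposes a hook for swapping out the bundled captcha model. A minimal sketch of a custom solver, assuming the keyword argument keeps the name shown above and that the callback receives the raw captcha image bytes and returns the solution string:

```python
from deutschland import Bundesanzeiger

def my_captcha_solver(image_data: bytes) -> str:
    # hand the image bytes to a human or an external service and return the text
    return input("Captcha solution: ")

ba = Bundesanzeiger(on_captcha_callback=my_captcha_solver)
reports = ba.get_reports("Deutsche Bahn AG")
```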
3 changes: 1 addition & 2 deletions deutschland/bundesanzeiger/model.py
@@ -5,8 +5,6 @@
import numpy as np
from PIL import Image

ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")


def load_image_arr(fp):
image = Image.open(fp).convert("L")
@@ -17,6 +15,7 @@ def load_image_arr(fp):


def character_indexes_to_str(character_indexes):
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789")
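    # map each predicted class index (0-35) back to its character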
characters = np.array(ALPHABET)[character_indexes]
return "".join(list(characters)).upper()

2 changes: 0 additions & 2 deletions pyproject.toml
@@ -15,12 +15,10 @@ mapbox-vector-tile = "^1.2.1"
requests = "^2.26.0"
dateparser = "^1.0.0"
gql = "^2.0.0"
selenium = "^3.141.0"
boto3 = "^1.18.9"
slugify = "^0.0.1"
tensorflow = "^2.5.0"
Pillow = "^8.3.1"
webdriver-manager = "^3.4.2"
beautifulsoup4 = "^4.9.3"

[tool.poetry.urls]
7 changes: 7 additions & 0 deletions tests/integration_test.py
@@ -0,0 +1,7 @@
from deutschland import Bundesanzeiger


def test_for_no_data_deutsche_bahn_ag():
ba = Bundesanzeiger()
data = ba.get_reports("Deutsche Bahn AG")
assert len(data.keys()) > 0, "Found no reports for Deutsche Bahn AG"