Skip to content

Commit

Permalink
update to format and test workflow
Browse files Browse the repository at this point in the history
* fixed linting problem
* fixed wrong py version
  • Loading branch information
K0IN committed Aug 11, 2021
1 parent 8626e3a commit 0e3c280
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runtests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: [3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
Expand Down
86 changes: 46 additions & 40 deletions deutschland/bundesanzeiger/bundesanzeiger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


class Report:
__slots__ = ['date', 'name', 'content_url', 'company', 'report']
__slots__ = ["date", "name", "content_url", "company", "report"]

def __init__(self, date, name, content_url, company, report=None):
self.date = date
Expand All @@ -18,43 +18,44 @@ def __init__(self, date, name, content_url, company, report=None):

def to_dict(self):
    """Return the report's fields as a plain dictionary.

    The result contains the ``date``, ``name``, ``company`` and ``report``
    attributes, in that order. ``content_url`` is not part of the output.

    :return: dict mapping each exported attribute name to its current value
    """
    # Each key is listed exactly once; the earlier text repeated every entry
    # in both single- and double-quoted form.
    exported_fields = ("date", "name", "company", "report")
    return {field: getattr(self, field) for field in exported_fields}


class Bundesanzeiger:
__slots__ = ['session', 'model', 'captcha_callback']
__slots__ = ["session", "model", "captcha_callback"]

def __init__(self, on_captach_callback=None):
    """Create a client with its own HTTP session and a captcha solver.

    :param on_captach_callback: optional callable that receives the captcha
        image as bytes and returns the solution. When omitted, the bundled
        model is loaded and used to solve captchas.
    """
    self.session = requests.Session()
    if on_captach_callback:
        # Store the callback in the declared ``captcha_callback`` slot.
        # Assigning to ``self.callback`` raises AttributeError because that
        # name is not listed in ``__slots__``, and it would leave
        # ``captcha_callback`` unset for the code that later calls it.
        self.captcha_callback = on_captach_callback
        # No model is needed when the caller solves captchas; initialise the
        # slot so reading ``self.model`` does not raise.
        self.model = None
    else:
        # Imported lazily so the model module is only loaded when the
        # built-in solver is actually used.
        import deutschland.bundesanzeiger.model

        self.model = deutschland.bundesanzeiger.model.load_model()
        self.captcha_callback = self.__solve_captcha

def __solve_captcha(self, image_data: bytes):
    """Solve a captcha image using the model stored on this instance.

    :param image_data: raw bytes of the captcha image
    :return: the value produced by ``prediction_to_str`` for the model's
        first prediction (presumably the captcha text -- confirm against
        ``deutschland.bundesanzeiger.model``)
    """
    import deutschland.bundesanzeiger.model

    # Wrap the bytes in a file-like object for the image loader.
    image = BytesIO(image_data)
    image_arr = deutschland.bundesanzeiger.model.load_image_arr(image)
    # Reshape to the 4-D layout passed to ``predict``: (1, 50, 250, 1).
    image_arr = image_arr.reshape((1, 50, 250, 1))

    # Only one image is submitted, so take the first prediction.
    prediction = self.model.predict(image_arr)[0]
    # Convert once; the earlier text performed this conversion twice.
    return deutschland.bundesanzeiger.model.prediction_to_str(prediction)

def __is_captcha_needed(self, entry_content: str) -> bool:
    """Report whether a fetched entry page still requires a captcha.

    The page is considered to need a captcha when it contains no ``div``
    with the class ``publication_container``.

    :param entry_content: HTML text of the entry page
    :return: True if no publication container was found, False otherwise
    """
    # Parse once; the earlier text parsed the same HTML twice in a row.
    soup = BeautifulSoup(entry_content, "html.parser")
    return not bool(soup.find("div", {"class": "publication_container"}))

def __find_all_entries_on_page(self, page_content: str):
soup = BeautifulSoup(page_content, 'html.parser')
soup = BeautifulSoup(page_content, "html.parser")
wrapper = soup.find("div", {"class": "result_container"})
rows = wrapper.find_all("div", {"class": "row"})
for row in rows:
Expand Down Expand Up @@ -90,20 +91,23 @@ def __generate_result(self, content: str):
get_element_response = self.session.get(element.content_url)

if self.__is_captcha_needed(get_element_response.text):
soup = BeautifulSoup(get_element_response.text, 'html.parser')
captcha_image_src = soup.find(
"div", {"class": "captcha_wrapper"}).find("img")["src"]
soup = BeautifulSoup(get_element_response.text, "html.parser")
captcha_image_src = soup.find("div", {"class": "captcha_wrapper"}).find(
"img"
)["src"]
img_response = self.session.get(captcha_image_src)
captcha_result = self.captcha_callback(img_response.content)
captcha_endpoint_url = soup.find_all("form")[1]["action"]
get_element_response = self.session.post(captcha_endpoint_url, data={
"solution": captcha_result, "confirm-button": "OK"})
get_element_response = self.session.post(
captcha_endpoint_url,
data={"solution": captcha_result, "confirm-button": "OK"},
)

content_soup = BeautifulSoup(
get_element_response.text, 'html.parser')
content_soup = BeautifulSoup(get_element_response.text, "html.parser")
content_element = content_soup.find(
"div", {"class": "publication_container"})

"div", {"class": "publication_container"}
)

if not content_element:
continue

Expand All @@ -119,32 +123,34 @@ def get_reports(self, company_name: str):
:return: Dict of all reports
"""
self.session.cookies["cc"] = "1628606977-805e172265bfdbde-10"
self.session.headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "www.bundesanzeiger.de",
"Pragma": "no-cache",
"Referer": "https://www.bundesanzeiger.de/",
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
})
self.session.headers.update(
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "www.bundesanzeiger.de",
"Pragma": "no-cache",
"Referer": "https://www.bundesanzeiger.de/",
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
}
)
# get the jsessionid cookie
response = self.session.get("https://www.bundesanzeiger.de")
# go to the start page
response = self.session.get(
"https://www.bundesanzeiger.de/pub/de/start?0")
response = self.session.get("https://www.bundesanzeiger.de/pub/de/start?0")
# perform the search
response = self.session.get(
f"https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form=&fulltext={company_name}&area_select=&search_button=Suchen")
f"https://www.bundesanzeiger.de/pub/de/start?0-2.-top%7Econtent%7Epanel-left%7Ecard-form=&fulltext={company_name}&area_select=&search_button=Suchen"
)
return self.__generate_result(response.text)


Expand Down
1 change: 1 addition & 0 deletions deutschland/bundesanzeiger/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from PIL import Image


def load_image_arr(fp):
image = Image.open(fp).convert("L")
image = np.array(image)
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from deutschland import Bundesanzeiger


def test_for_no_data_deutsche_bahn_ag():
    """Check that searching for "Deutsche Bahn AG" yields at least one report."""
    client = Bundesanzeiger()
    reports = client.get_reports("Deutsche Bahn AG")
    report_keys = reports.keys()
    assert len(report_keys) > 0, "Found no reports for Deutsche Bahn AG"

0 comments on commit 0e3c280

Please sign in to comment.