Integration of VERENA-scraper & remove misplaced comment in Lebensmittelwarnung #29

Merged · 3 commits · Oct 7, 2021
2 changes: 1 addition & 1 deletion .github/workflows/runtests.yml
@@ -23,4 +23,4 @@ jobs:
          pip install .
      - name: Test with pytest
        run: |
          pytest
          pytest
15 changes: 15 additions & 0 deletions README.md
@@ -74,6 +74,21 @@ print(data)
# [{'id': 19601, 'guid': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/detail/lebensmittel/19601', 'pubDate': 'Fri, 10 Feb 2017 12:28:45 +0000', 'imgSrc': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/opensaga/attachment/979f8cd3-969e-4a6c-9a8e-4bdd61586cd4/data.jpg', 'title': 'Sidroga Bio Säuglings- und Kindertee', 'manufacturer': 'Lebensmittel', 'warning': 'Pyrrolizidinalkaloide', 'affectedStates': ['Baden-Württemberg', '...']}]
```

## Federal Job Openings

### NRW

#### VERENA
Get open substitute teaching positions in NRW from https://www.schulministerium.nrw.de/BiPo/Verena/angebote
```python
from deutschland import Verena
v = Verena()
data = v.get()
print(data)
# a full example of the data can be found at deutschland/verena/example.md
# [{ "school_id": "99999", "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen", "replacement_job_title": "Lehrkraft", "subjects": [ "Fach 1", "Fach 2" ], "comments": "Bemerkung zur Stelle: Testbemerkung", "duration": "01.01.2021 - 01.01.2022", ...} ...]
```


## Autobahn

Get data from the Autobahn.
1 change: 1 addition & 0 deletions deutschland/__init__.py
@@ -6,4 +6,5 @@
from .bundesanzeiger.bundesanzeiger import Bundesanzeiger
from .handelsregister.handelsregister import Handelsregister
from .lebensmittelwarnung.lebensmittelwarnung import Lebensmittelwarnung
from .verena.verena import Verena
from .bundesnetzagentur import *
1 change: 0 additions & 1 deletion deutschland/lebensmittelwarnung/lebensmittelwarnung.py
@@ -177,6 +177,5 @@ def get(

if __name__ == "__main__":
    lw = Lebensmittelwarnung()
    # res = hr.search(keywords="Deutsche Bahn Aktiengesellschaft", keyword_match_option=3)
    res = lw.get()
    print(res)
Empty file added deutschland/verena/__init__.py
34 changes: 34 additions & 0 deletions deutschland/verena/example.md
@@ -0,0 +1,34 @@
### Scraper for https://www.schulministerium.nrw.de/BiPo/Verena/online

```json
{
  "school_id": "99999",
  "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen",
  "replacement_job_title": "Lehrkraft",
  "subjects": [
    "Fach 1",
    "Fach 2"
  ],
  "replacement_job_type_raw": "Vertretung für",
  "replacement_job_type": "Vertretung",
  "comments": "Bemerkung zur Stelle: Testbemerkung",
  "duration": "01.01.2021 - 01.01.2022",
  "hours_per_week": "13,5",
  "contact": {
    "phone": "0172 1111 1111",
    "fax": "0172 2222 2222",
    "homepage": "http://www.eine-schule.de",
    "mail": {
      "raw": "mailto:bewerbung@eineschule.de?subject=Stellenausschreibung in VERENA",
      "adress": "bewerbung@eineschule.de",
      "subject": "Stellenausschreibung in VERENA"
    }
  },
  "deadline": "17.09.2021",
  "geolocation": {
    "coord_system": "epsg:25832",
    "coordinates": [1111111, 1111111],
    "post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen"
  }
}
```
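
The geolocation block reports coordinates in EPSG:25832 (ETRS89 / UTM zone 32N). A minimal sketch for converting them to WGS84 latitude/longitude, assuming the third-party pyproj package (not a dependency of this project):
```python
from pyproj import Transformer  # assumption: pyproj is installed separately

# EPSG:25832 (ETRS89 / UTM zone 32N) -> EPSG:4326 (WGS84)
transformer = Transformer.from_crs("epsg:25832", "epsg:4326")

easting, northing = 1111111, 1111111  # placeholder values from the example above
lat, lon = transformer.transform(easting, northing)
print(lat, lon)
```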
28 changes: 28 additions & 0 deletions deutschland/verena/verena.py
@@ -0,0 +1,28 @@
from deutschland.verena.verenadownloader import VerenaDownloader
from deutschland.verena.verenaextractor import VerenaExtractor
import json


class Verena:
    """
    Downloads and extracts the current job listings from the VERENA portal.
    """

    def get(self):
        """
        Downloads and extracts the current job listings from the VERENA portal.

        An example of the JSON format can be found at ./example.md
        """
        result = []
        scraped_pages = VerenaDownloader().scrape()
        for page in scraped_pages:
            result.extend(VerenaExtractor(page).extract())
        return result


if __name__ == "__main__":
    v = Verena()
    res = v.get()
    print(json.dumps(res))
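
Since `get()` returns plain Python dicts, a scrape can be persisted with the standard library alone. A small sketch (the output filename is arbitrary):
```python
import json

from deutschland import Verena

data = Verena().get()
# ensure_ascii=False keeps umlauts and ß readable in the output file
with open("verena_listings.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
```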
142 changes: 142 additions & 0 deletions deutschland/verena/verenadownloader.py
@@ -0,0 +1,142 @@
import requests
import math
from bs4 import BeautifulSoup
from typing import Tuple, List


class VerenaDownloader:
    """
    Downloads all pages (each containing 100 job offerings) of the VERENA portal.
    """

    BASE_URL = "https://www.schulministerium.nrw.de"

    def __init__(self):
        self.session = requests.Session()

    def __scrape_landing_page(self) -> Tuple[int, str, str]:
        """Returns (job_openings_count: int, access_listing_url_part: str, access_listing_action_id: str)

        Example: (513, "/BiPo/Verena/angebote?action=595.1764087184088", "595.1764087184088")

        Scrapes the VERENA landing page to obtain a session cookie, a matching action id
        for accessing the listing view, and the number of job offerings in the listing.
        """
        landing_url = self.BASE_URL + "/BiPo/Verena"
        landing_request = self.session.get(landing_url)
        landing_soup = BeautifulSoup(landing_request.text, "html.parser")
        links = landing_soup.findAll("a", {"title": "Zu den Stellenausschreibungen"})
        for link in links:
            if "Derzeit im Netz veröffentlichte Ausschreibungen:" in link.text:
                job_openings_count = link.find_next("strong").text
                access_listing_url_part = link["href"]
                # split the action id off the listing url part
                access_listing_action_id = access_listing_url_part.replace(
                    "/BiPo/Verena/angebote?action=", ""
                )
                return (
                    int(job_openings_count),
                    access_listing_url_part,
                    access_listing_action_id,
                )

    def __scrape_listing_page_initial(
        self, access_listing_url_part: str
    ) -> Tuple[str, str, str]:
        """Returns (select_blocksize_url_part, search_id, select_blocksize_action_id),
        where select_blocksize_url_part is the listing url with a new action id,
        blocksize 100 and a valid suchid.

        Example: ("/BiPo/Verena/angebote?action=509.9848906326322&block=b100&suchid=188736", "188736", "509.9848906326322")

        Scrapes the VERENA listing page to get a listing url with blocksize = 100 and a valid suchid (search_id).
        The suchid is generated by the backend and stores your search preferences.
        """
        listing_url = self.BASE_URL + access_listing_url_part
        listing_request = self.session.get(listing_url)
        listing_soup = BeautifulSoup(listing_request.text, "html.parser")
        blocksize_selector = listing_soup.find("div", id="blockauswahl")
        # the last entry (-1) selects blocksize 100 and also carries a suchid (search_id)
        select_blocksize_url_part = blocksize_selector.findAll("a")[-1]["href"]
        search_id = select_blocksize_url_part.split("=")[-1]
        select_blocksize_action_id = select_blocksize_url_part.replace(
            "/BiPo/Verena/angebote?action=", ""
        ).split("&")[0]
        return select_blocksize_url_part, search_id, select_blocksize_action_id

    def __set_block_size(self, select_blocksize_url_part: str):
        """
        Runs a GET on the search id url so the backend uses the correct block size for future requests.
        """
        searchid_url = self.BASE_URL + select_blocksize_url_part
        self.session.get(searchid_url)

    def __generate_all_listing_urls(
        self, action_id: str, search_id: str, opening_count: int
    ) -> List[str]:
        """Based on action_id, search_id and opening_count, generates a list of all listing urls.

        Example: [
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a1&suchid=188265",
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a2&suchid=188265",
            ...
        ]
        """
        all_urls = []
        # block size is fixed at 100, pages are numbered a1, a2, ...
        site_count = math.ceil(opening_count / 100)
        listing_format_string = (
            self.BASE_URL + "/BiPo/Verena/angebote?action={0}&seite=a{1}&suchid={2}"
        )
        for curr_site in range(1, site_count + 1):
            all_urls.append(
                listing_format_string.format(action_id, curr_site, search_id)
            )
        return all_urls

    def __scrape_actual_listing(self, urls: List[str]) -> List[str]:
        """Downloads the job listing pages given by 'urls' and returns their contents as a list of page sources.

        Example: [
            "<html>...</html>",
            "<html>...</html>"
        ]
        """
        scraped_pages = []
        for url in urls:
            r = self.session.get(url)
            scraped_pages.append(r.text)
        return scraped_pages

    def scrape(self) -> List[str]:
        """Returns a list of the page sources of all listing pages of the VERENA job portal.

        Example: [
            "<html>...</html>",
            "<html>...</html>"
        ]
        """
        (
            job_opening_count,
            access_listing_url_part,
            access_listing_action_id,
        ) = self.__scrape_landing_page()
        # select_blocksize_action_id is the action id used to select the block size.
        # It's also reused to query the different pages of the job portal.
        (
            select_blocksize_url_part,
            search_id,
            select_blocksize_action_id,
        ) = self.__scrape_listing_page_initial(access_listing_url_part)
        self.__set_block_size(select_blocksize_url_part)
        all_listing_urls = self.__generate_all_listing_urls(
            select_blocksize_action_id, search_id, job_opening_count
        )
        return self.__scrape_actual_listing(all_listing_urls)


if __name__ == "__main__":
    vd = VerenaDownloader()
    res = vd.scrape()
    print(res)
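
The pagination in __generate_all_listing_urls follows directly from the fixed block size of 100. A standalone sketch of that arithmetic and URL construction (action id, suchid and opening count are placeholder values taken from the docstring examples):
```python
import math

BASE_URL = "https://www.schulministerium.nrw.de"
action_id, search_id, opening_count = "901.7040712715743", "188265", 513

# 513 openings at 100 per page -> ceil(513 / 100) = 6 pages (a1 .. a6)
page_count = math.ceil(opening_count / 100)
urls = [
    f"{BASE_URL}/BiPo/Verena/angebote?action={action_id}&seite=a{p}&suchid={search_id}"
    for p in range(1, page_count + 1)
]
print(urls[0])
```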