Skip to content

Commit

Permalink
memorious crawlers for opensanctions.org
Browse files Browse the repository at this point in the history
  • Loading branch information
sunu committed Apr 11, 2018
1 parent ea0bc13 commit 9dcbaa8
Show file tree
Hide file tree
Showing 45 changed files with 2,296 additions and 1 deletion.
66 changes: 65 additions & 1 deletion .gitignore
@@ -1,2 +1,66 @@
_site/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.DS_Store
*.sqlite
.env
.vscode

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

data/
env*/

_site/
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
include README.md
# Ship the crawler YAML definitions; opensanctions.init() loads them from
# the package's `config` directory at runtime.
recursive-include opensanctions/config *.yml
Empty file added Makefile
Empty file.
7 changes: 7 additions & 0 deletions opensanctions/__init__.py
@@ -0,0 +1,7 @@
import os
from memorious.core import manager


def init():
    """Load the crawler configs bundled with this package.

    Resolves the ``config`` directory that sits next to this module and
    passes it to ``manager.load_path``.
    """
    package_dir = os.path.dirname(__file__)
    manager.load_path(os.path.join(package_dir, 'config'))
24 changes: 24 additions & 0 deletions opensanctions/config/au_dfat_sanctions.yml
@@ -0,0 +1,24 @@
name: au_dfat_sanctions
description: "[Opensanctions] Australian Department of Foreign Affairs and Trade Sanctions List"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'http://dfat.gov.au/international-relations/security/sanctions/Documents/regulation8_consolidated.xls'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.au_dfat_sanctions:parse
handle:
pass: parse_entry
parse_entry:
method: opensanctions.crawlers.au_dfat_sanctions:parse_entry
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
20 changes: 20 additions & 0 deletions opensanctions/config/ch_seco_sanctions.yml
@@ -0,0 +1,20 @@
name: ch_seco_sanctions
description: "[Opensanctions] Swiss Secretariat for Economic Affairs (SECO) Sanctions/Embargoes"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'https://www.sesam.search.admin.ch/sesam-search-web/pages/downloadXmlGesamtliste.xhtml?lang=en&action=downloadXmlGesamtlisteAction'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.ch_seco_sanctions:seco_parse
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
12 changes: 12 additions & 0 deletions opensanctions/config/coe_assembly.yml
@@ -0,0 +1,12 @@
name: coe_assembly
description: "[Opensanctions] Council of Europe Parliamentary Assembly"
schedule: weekly
pipeline:
init:
method: opensanctions.crawlers.coe_assembly:parse
params:
url: 'http://www.assembly.coe.int/nw/xml/AssemblyList/MP-Alpha-EN.asp?initial=%s&offset=0'
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
20 changes: 20 additions & 0 deletions opensanctions/config/eu_eeas_sanctions.yml
@@ -0,0 +1,20 @@
name: eu_eeas_sanctions
description: "[Opensanctions] EEAS Entities Subject to EU Financial Sanctions"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'http://ec.europa.eu/external_relations/cfsp/sanctions/list/version4/global/global.xml'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.eu_eeas_sanctions:eeas_parse
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
20 changes: 20 additions & 0 deletions opensanctions/config/eu_meps.yml
@@ -0,0 +1,20 @@
name: eu_meps
description: "[Opensanctions] Members of the European Parliament"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'http://www.europarl.europa.eu/meps/en/xml.html?query=full&filter=all'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.eu_meps:parse
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
10 changes: 10 additions & 0 deletions opensanctions/config/eu_whoiswho.yml
@@ -0,0 +1,10 @@
name: eu_whoiswho
description: "[Opensanctions] EU Who is Who Staff Directory"
schedule: weekly
pipeline:
init:
method: opensanctions.crawlers.eu_whoiswho:scrape
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
28 changes: 28 additions & 0 deletions opensanctions/config/everypolitician.yml
@@ -0,0 +1,28 @@
name: everypolitician
description: "[Opensanctions] Data from EveryPolitician.org"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/countries.json'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: scrape
scrape:
method: opensanctions.crawlers.everypolitician:scrape
handle:
pass: scrape_csv
scrape_csv:
method: opensanctions.crawlers.everypolitician:scrape_csv
handle:
pass: scrape_entity
scrape_entity:
method: opensanctions.crawlers.everypolitician:scrape_entity
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
28 changes: 28 additions & 0 deletions opensanctions/config/gb_coh_disqualified.yml
@@ -0,0 +1,28 @@
name: gb_coh_disqualified
description: "[Opensanctions] Disqualified company directors from the UK (Companies' House)"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'https://beta.companieshouse.gov.uk/register-of-disqualifications/A'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: crawl_alphabetical
crawl_alphabetical:
method: opensanctions.crawlers.gb_coh_disqualified:crawl_alphabetical
handle:
pass: crawl_pages
crawl_pages:
method: opensanctions.crawlers.gb_coh_disqualified:crawl_pages
handle:
pass: crawl_officer
crawl_officer:
method: opensanctions.crawlers.gb_coh_disqualified:crawl_officer
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
24 changes: 24 additions & 0 deletions opensanctions/config/gb_hmt_sanctions.yml
@@ -0,0 +1,24 @@
name: gb_hmt_sanctions
description: "[Opensanctions] UK HM Treasury sanctions list"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'http://hmt-sanctions.s3.amazonaws.com/sanctionsconlist.csv'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.gb_hmt_sanctions:parse
handle:
pass: parse_entry
parse_entry:
method: opensanctions.crawlers.gb_hmt_sanctions:parse_entry
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
15 changes: 15 additions & 0 deletions opensanctions/config/interpol_red_notices.yml
@@ -0,0 +1,15 @@
name: interpol_red_notices
description: "[Opensanctions] International search warrants issued by Interpol"
schedule: weekly
pipeline:
  # Entry stage: a custom scraper rather than the generic seed/fetch pair.
  init:
    method: opensanctions.crawlers.interpol_red_notices:scrape
    params:
      # NOTE(review): `%s` is presumably filled with the result offset by
      # the scrape method - confirm against the crawler code.
      url: 'http://www.interpol.int/notice/search/wanted/(offset)/%s'
    handle:
      pass: scrape_case
  scrape_case:
    method: opensanctions.crawlers.interpol_red_notices:scrape_case
    # The `pass` rule belongs under `handle`, as in every other stage;
    # it was previously a direct child of the stage.
    handle:
      pass: store
  store:
    method: opensanctions.helpers:store_entity
21 changes: 21 additions & 0 deletions opensanctions/config/kg_fiu_national.yml
@@ -0,0 +1,21 @@
name: kg_fiu_national
description: "[Opensanctions] Kyrgyz Financial Intelligence Unit National List"
schedule: weekly
pipeline:
init:
method: seed
params:
# url: 'https://fiu.gov.kg/sked/1'
url: 'https://fiu.gov.kg/uploads/59c1fe8b4aae0.xml'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.kg_fiu_national:parse
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
22 changes: 22 additions & 0 deletions opensanctions/config/ua_sdfm_blacklist.yml
@@ -0,0 +1,22 @@
name: ua_sdfm_blacklist
description: "[Opensanctions] Ukraine State Finance Monitoring Service"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'http://www.sdfm.gov.ua/content/file/Site_docs/Black_list/zBlackListFull.xml'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.ua_sdfm_blacklist:parse
params:
url: 'http://www.sdfm.gov.ua/content/file/Site_docs/Black_list/zBlackListFull.xml'
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
20 changes: 20 additions & 0 deletions opensanctions/config/un_sc_sanctions.yml
@@ -0,0 +1,20 @@
name: un_sc_sanctions
description: "[Opensanctions] United Nations Security Council sanctions"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'https://scsanctions.un.org/resources/xml/en/consolidated.xml'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.un_sc_sanctions:parse
handle:
pass: store
store:
method: opensanctions.helpers:store_entity
24 changes: 24 additions & 0 deletions opensanctions/config/us_bis_denied.yml
@@ -0,0 +1,24 @@
name: us_bis_denied
description: "[Opensanctions] US Bureau of Industry and Security - Denied Persons List"
schedule: weekly
pipeline:
init:
method: seed
params:
url: 'https://www.bis.doc.gov/dpl/dpl.txt'
handle:
pass: fetch
fetch:
method: fetch
handle:
pass: parse
parse:
method: opensanctions.crawlers.us_bis_denied:parse
handle:
pass: parse_row
parse_row:
method: opensanctions.crawlers.us_bis_denied:parse_row
handle:
pass: store
store:
method: opensanctions.helpers:store_entity

0 comments on commit 9dcbaa8

Please sign in to comment.