In [24]:
import requests

# Root of the dane.gov.pl API (version 1.4) and the query used for the first page.
base_url = 'https://api.dane.gov.pl/1.4'
params = {
    'page': 1,
    'per_page': 100,
    'sort': 'id'
}
# Seconds to wait on a single request; by default `requests` never times out,
# so a stalled connection would otherwise hang the cell.
request_timeout = 30

# Page through /resources and keep one flat summary dict per resource.
# Any failure is reported and stops the crawl; rows collected so far remain
# available in `all_resources`.
# NOTE(review): the recorded outputs of the later cells total exactly 10000
# rows, which may mean the API caps how deep pagination can go -- confirm
# that the listing is actually complete.
all_resources = []
while True:
    try:
        response = requests.get(
            base_url + '/resources', params=params, timeout=request_timeout
        )
    except requests.exceptions.RequestException as e:
        # Connection problems and timeouts are reported like every other failure.
        print(f'Error: {e}')
        break

    if response.status_code != 200:
        print(f'Failed to fetch data {params}. Status code: {response.status_code}')
        break

    try:
        data = response.json()
    except ValueError as e:
        # The body was not valid JSON.
        print(f'Error: {e}')
        break
    if not isinstance(data, dict):
        print(f'Error: unexpected payload type {type(data).__name__}')
        break

    # `or` fallbacks cover keys that are present but null, which `.get(key, default)` does not.
    for resource in data.get('data') or []:
        resource_attributes = resource.get('attributes') or {}
        resource_relationships = resource.get('relationships') or {}
        all_resources.append(
            {
                'id': resource.get('id'),
                'title': resource_attributes.get('title'),
                'format': resource_attributes.get('format'),
                'openess': resource_attributes.get('openness_score'),
                'file_url': resource_attributes.get('file_url'),
                'tabular_data_available': 1 if resource_relationships.get('tabular_data') else 0,
                'file_size': resource_attributes.get('file_size'),
                'description': resource_attributes.get('description'),
            }
        )

    # Continue only while the response advertises a non-empty "next" link.
    if (data.get('links') or {}).get('next'):
        params['page'] += 1
    else:
        break

In [25]:
from collections import defaultdict

# Count how many collected resources report each value of the 'format' field
# (resources without a format are tallied under None).
format_counter = defaultdict(int)
for resource in all_resources:
    format_counter[resource['format']] += 1

# The loop variable is named `fmt` so the builtin `format()` is not shadowed
# for the rest of the kernel session.
for fmt, count in format_counter.items():
    print(f"{fmt}: {count}")

pdf: 408
doc: 104
xls: 1044
xlsx: 1843
csv: 4774
docx: 8
zip: 249
xml: 131
html: 1316
rtf: 1
7z: 2
rdf: 5
png: 3
None: 46
json: 35
php: 21
txt: 5
rar: 2
odt: 3


In [26]:
# Tally the collected resources by the value stored under their 'openess' key.
openess_counter = defaultdict(int)
for score in (entry['openess'] for entry in all_resources):
    openess_counter[score] = openess_counter[score] + 1

# Report the tallies in first-seen order, one "<value>: <count>" line each.
for openess in openess_counter:
    count = openess_counter[openess]
    print(f"{openess}: {count}")

1: 619
2: 2891
3: 6342
4: 101
0: 47


In [29]:
# Tally the collected resources by their 'tabular_data_available' flag.
tab_data_available_counter = defaultdict(int)
for entry in all_resources:
    flag = entry['tabular_data_available']
    tab_data_available_counter[flag] += 1

# Print each flag value with its count and its share of all tallied resources.
total = sum(tab_data_available_counter[key] for key in tab_data_available_counter)
for item in tab_data_available_counter:
    count = tab_data_available_counter[item]
    print(f"{item}: {count} ({count/total*100:.2f}%)")

0: 7815 (78.15%)
1: 2185 (21.85%)


In [28]:
# Print every resource among the first 10000 collected whose
# 'tabular_data_available' flag is truthy.
for resource in all_resources[:10000]:
    if not resource['tabular_data_available']:
        continue
    print(resource)

{'id': '29', 'title': 'Liczba skarg złożonych do Urzędu Morskiego w Szczecinie jako organu odwoławczego', 'format': 'xlsx', 'openess': 2, 'file_url': 'https://api.dane.gov.pl/media/resources/20171212/LiczbaskargzlozonychdoUMSjakoorganuodwolawczego.xlsx', 'tabular_data_available': 1, 'file_size': 10568, 'description': '<p>Liczba skarg złożonych do Urzędu Morskiego w Szczecinie jako organu odwoławczego</p>\n'}
{'id': '59', 'title': 'Ubezpieczeni (osoby fizyczne) w ubezpieczeniu chorobowym według województw IV kwartał 2015', 'format': 'xls', 'openess': 2, 'file_url': 'https://api.dane.gov.pl/media/resources/20160927/fa9a6cb4-e3e5-438a-9058-f9980a569015.xls', 'tabular_data_available': 1, 'file_size': 67072, 'description': '<p></p>\n'}
{'id': '159', 'title': 'Inwestycje sportowe FRKF 2014 r.', 'format': 'csv', 'openess': 3, 'file_url': 'https://api.dane.gov.pl/media/resources/20150703/FRKF-inwestycje-2014.csv', 'tabular_data_available': 1, 'file_size': 60251, 'description': '<p>Inwestycje s