In [None]:
import requests

base_url = 'https://api.dane.gov.pl/1.4'
params = {
    'page': 1,
    'per_page': 100,
    'sort': 'id'
}
# Upper bound (seconds) on each HTTP call. requests applies no timeout by
# default, so without this a stalled connection would hang the cell forever.
request_timeout = 30

# Walk the paginated /datasets listing and keep one flat summary record per
# dataset. `params['page']` is advanced in place, so once the loop ends it
# holds the number of the last page that was requested.
all_datasets = []
with requests.Session() as session:  # one session reused for every page
    while True:
        try:
            response = session.get(
                base_url + '/datasets', params=params, timeout=request_timeout
            )
        except requests.RequestException as e:
            # Connection failure or timeout: report it and keep whatever
            # has been collected so far.
            print(f'Error: {e}')
            break

        if response.status_code != 200:
            print(f'Failed to fetch data {params}. Status code: {response.status_code}')
            break

        try:
            data = response.json()
            datasets = data.get('data')
            for dataset in datasets:
                attributes = dataset.get('attributes', {})
                all_datasets.append(
                    {
                        'id': dataset.get('id'),
                        'title': attributes.get('title'),
                        'formats': attributes.get('formats'),
                        'license_name': attributes.get('license_name'),
                        'type': dataset.get('type'),
                        'categories': attributes.get('categories'),
                        'category': attributes.get('category'),
                    }
                )
            # Keep paging while the response advertises a 'next' link; a
            # missing or null 'links' object is treated as the last page.
            if 'next' in (data.get('links') or {}):
                params['page'] += 1
            else:
                break
        except Exception as e:
            # Best-effort: any malformed payload stops the download but leaves
            # the records gathered so far available to later cells.
            print(f'Error: {e}')
            break

In [2]:
# Quick sanity check of the download: number of records collected, the
# final paging parameters, and the first record as a sample.
dataset_total = len(all_datasets)
print(dataset_total)
print(params)
first_dataset = all_datasets[0]
print(first_dataset)

3639
{'page': 37, 'per_page': 100, 'sort': 'id'}
{'id': '1', 'title': 'Dane liczbowe dot. kontroli prowadzonych przez WIIH w 2014 r.', 'formats': ['xlsx'], 'license_name': 'CC0 1.0', 'type': 'dataset'}


In [3]:
from collections import defaultdict

# Tally datasets by the exact combination of formats they are published in.
# The format list is converted to a tuple so it can be used as a dict key.
format_list_counter = defaultdict(int)
for dataset in all_datasets:
    formats = tuple(dataset['formats'])
    format_list_counter[formats] = format_list_counter[formats] + 1

# One output line per distinct combination, in insertion order.
for format_list in format_list_counter:
    count = format_list_counter[format_list]
    print(f"{list(format_list)}: {count}")

['xlsx']: 329
['pdf']: 91
['xls']: 80
['docx', 'html', 'json', 'pdf']: 2
[]: 93
['xls', 'xlsx']: 59
['csv', 'zip']: 6
['csv']: 484
['html']: 241
['csv', 'jsonld', 'pdf', 'xlsx']: 10
['csv', 'xml']: 4
['doc', 'html', 'xml']: 1
['pdf', 'xlsx']: 26
['csv', 'jsonld', 'xlsx']: 532
['csv', 'jsonld']: 296
['pdf', 'xls']: 9
['html', 'rdf']: 5
['zip']: 23
['xml']: 284
['html', 'xlsx']: 6
['csv', 'doc', 'jsonld', 'pdf', 'xls']: 1
['html', 'xml']: 51
['pdf', 'zip']: 10
['html', 'zip']: 7
['doc']: 12
['csv', 'pdf', 'xlsx', 'xml']: 1
['csv', 'jsonld', 'xls']: 23
['csv', 'pdf', 'xlsx']: 8
['json', 'xml']: 25
['csv', 'html', 'jsonld', 'xml']: 6
['docx', 'pdf']: 5
['csv', 'jsonld', 'xls', 'xlsx']: 45
['csv', 'xlsx']: 140
['html', 'xls']: 3
['csv', 'txt']: 3
['csv', 'jsonld', 'rar', 'xlsx']: 1
['pdf', 'png']: 1
['csv', 'docx', 'jsonld', 'pdf', 'xlsx']: 1
['json']: 30
['html', 'pdf', 'xls', 'zip']: 1
['html', 'jsonld']: 1
['7z', 'html', 'pdf', 'xls', 'xlsx']: 1
['html', 'pdf']: 3
['html', 'xls', 'xlsx']

In [4]:
# Flatten every format combination seen so far into one set of individual
# format names.
unique_formats = {
    fmt
    for format_set in format_list_counter
    for fmt in format_set
}

# Show all distinct formats
print(unique_formats)

{'docx', 'tsv', 'doc', 'png', 'wfs', 'odt', 'html', 'jpeg', 'xlsx', 'xml', 'rdf', 'xlsx?=', 'csv', 'php', 'shp', 'ods', 'xls', 'rtf', 'json', 'zip', 'fgb', 'txt', 'geojson', 'geotiff', 'wms', 'jsonld', 'wcs', '7z', 'rar', 'pdf'}


In [5]:
# Format names grouped into access classes 1-5; class 1 is checked first.
access_class_1 = ['json', 'geojson', 'jsonld', 'xml', 'html']
access_class_2 = ['csv', 'tsv', 'txt', 'rdf']
access_class_3 = ['xlsx', 'xls', 'ods', 'odt', 'rtf']
access_class_4 = ['pdf', 'doc', 'docx', 'php']
# Class 5 is listed for completeness; classify_dataset() never consults it
# and instead uses 5 as the fallback for anything not matched by classes 1-4.
access_class_5 = ['jpeg', 'png', 'geotiff', 'wms', 'zip', '7z', 'rar', 'wcs', 'wfs', 'shp', 'fgb', 'xlsx?=']


def classify_dataset(formats):
    """Return the access class (1-6) for a dataset's list of formats.

    The lowest class number that contains any of the dataset's formats wins.
    A dataset with no formats at all (empty or None) is class 6; a dataset
    whose formats match none of classes 1-4 is class 5.
    """
    if not formats:
        return 6

    # Classes in priority order; the first one sharing a format decides.
    ranked_classes = (access_class_1, access_class_2, access_class_3, access_class_4)
    for rank, members in enumerate(ranked_classes, start=1):
        for fmt in formats:
            if fmt in members:
                return rank
    return 5

# Count datasets by access class. Every class 1-6 is pre-seeded with zero so
# that classes with no datasets still appear in the report.
class_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

for dataset in all_datasets:
    dataset_class = classify_dataset(dataset['formats'])
    class_counts[dataset_class] += 1

print("Dataset counts by access class:")
for class_num in sorted(class_counts.keys()):
    print(f"Class {class_num}: {class_counts[class_num]} datasets")

total = sum(class_counts.values())
print(f"\nTotal datasets: {total}")

print("\nPercentages:")
for class_num in sorted(class_counts.keys()):
    # Guard against an empty download (total == 0), which would otherwise
    # raise ZeroDivisionError; every class is then reported as 0.0%.
    percentage = (class_counts[class_num] / total) * 100 if total else 0.0
    print(f"Class {class_num}: {percentage:.1f}%")


Dataset counts by access class:
Class 1: 1778 datasets
Class 2: 1053 datasets
Class 3: 541 datasets
Class 4: 135 datasets
Class 5: 39 datasets
Class 6: 93 datasets

Total datasets: 3639

Percentages:
Class 1: 48.9%
Class 2: 28.9%
Class 3: 14.9%
Class 4: 3.7%
Class 5: 1.1%
Class 6: 2.6%


In [6]:
from collections import defaultdict

# Count how many datasets use each license.
license_counter = defaultdict(int)

for dataset in all_datasets:
    # Each record is built with a 'license_name' key, so a .get() default would
    # never be used; fall back explicitly when the stored value is None/empty.
    license_name = dataset.get('license_name') or 'Unknown'
    license_counter[license_name] += 1

# Print license counts, most common first
print("License name counts:")
for license_name, count in sorted(license_counter.items(), key=lambda x: x[1], reverse=True):
    print(f"'{license_name}': {count}")

print(f"\nTotal datasets: {sum(license_counter.values())}")
print(f"Unique licenses: {len(license_counter)}")

License name counts:
'CC0 1.0': 1981
'CC BY 4.0': 1593
'CC BY-NC-SA 4.0': 36
'CC BY-NC-ND 4.0': 16
'CC BY-SA 4.0': 12
'CC BY-NC 4.0': 1

Total datasets: 3639
Unique licenses: 6


In [7]:
# Count how many datasets there are of each resource type.
type_counter = defaultdict(int)

for dataset in all_datasets:
    # Each record is built with a 'type' key, so a .get() default would never
    # be used; fall back explicitly when the stored value is None/empty.
    dataset_type = dataset.get('type') or 'Unknown'
    type_counter[dataset_type] += 1

# Print type counts, most common first
print("Dataset type counts:")
for dataset_type, count in sorted(type_counter.items(), key=lambda x: x[1], reverse=True):
    print(f"'{dataset_type}': {count}")

print(f"\nTotal datasets: {sum(type_counter.values())}")
print(f"Unique types: {len(type_counter)}")

Dataset type counts:
'dataset': 3639

Total datasets: 3639
Unique types: 1
