## Header

In [2]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import sys
import os, json, re, requests
from ckanapi import RemoteCKAN
import ckanapi.errors
from ckanapi.errors import NotFound, ValidationError
import pandas as pd
import random
import numpy as np

from pathlib import Path
import json

# from ckan_migration import Migrate

In [3]:
def download_packages(ORIGINAL_CKAN_URL,env):
    """Download packages from a CKAN instance and cache each one on disk.

    Calls the ``package_search`` action (asking for up to 3000 rows) and
    writes every returned package as JSON to ``packages/<env>/<package name>``.

    Args:
        ORIGINAL_CKAN_URL: Base URL of the CKAN instance; the API path is
            appended to it as-is.
        env: Name of the sub-directory of ``packages/`` to write into.

    Returns:
        The list of package dicts found under ``result.results`` in the
        API response.
    """
    api_url = ORIGINAL_CKAN_URL + '/api/3/action/package_search?q=&rows=3000'
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is still needed for the instances being queried.
    packages = requests.get(api_url, verify=False).json()['result']['results']

    # Create the target directory once instead of re-checking it per package.
    target_dir = os.path.join('packages', env)
    os.makedirs(target_dir, exist_ok=True)

    for p in packages:
        # The context manager closes (and flushes) the file deterministically.
        with open(os.path.join(target_dir, p['name']), 'w') as package_file:
            json.dump(p, package_file)
    return packages

In [85]:

import inflection

def missing_dataset_id(resource):
    """Tell whether the resource's ``dataset_id`` is explicitly ``None``.

    Raises ``KeyError`` when the resource has no ``dataset_id`` key at all.
    """
    dataset_id = resource['dataset_id']
    return dataset_id is None

def list_resources_with_no_dataset_id(packages):
    """Collect the resources that have no ``dataset_id`` key.

    Returns a pair ``(bdm_tables, others)``; each item is a one-entry dict
    ``{package name: resource name}``. The first list holds resources whose
    ``resource_type`` is ``'bdm_table'``, the second holds every other type.
    """
    bdm_tables = []
    others = []

    for package in packages:
        for resource in package['resources']:
            if 'dataset_id' in resource:
                continue
            is_bdm_table = resource['resource_type'] == 'bdm_table'
            entry = {package['name']: resource['name']}
            target = bdm_tables if is_bdm_table else others
            target.append(entry)

    return bdm_tables, others

def list_tables_with_no_columns(packages):
    """List ``bdm_table`` resources that have no ``columns`` key.

    Each item of the returned list is ``{package name: resource name}``.
    """
    return [
        {package['name']: resource['name']}
        for package in packages
        for resource in package['resources']
        if resource['resource_type'] == 'bdm_table' and 'columns' not in resource
    ]

def list_external_links_with_missing_fields(packages):
    """List ``external_link`` resources that have no ``url`` key.

    Each item of the returned list is ``{package name: resource name}``.
    """
    missing = []

    for package in packages:
        for resource in package['resources']:
            if resource['resource_type'] != 'external_link':
                continue
            if 'url' in resource:
                continue
            missing.append({package['name']: resource['name']})

    return missing

def list_columns_with_missing_fields(packages):
    """Report columns of tables and dictionaries that lack expected keys.

    Only resources of type ``bdm_table`` or ``bdm_dictionary`` are inspected.
    A resource without a ``columns`` key is printed as ``package:resource``
    and skipped. For each column:

    * without a ``name`` key, the bare package name is appended;
    * otherwise, if any of ``bigquery_type``, ``description``,
      ``has_sensitive_data`` or ``covered_by_dictionary`` is absent,
      ``{package name: '<resource name>_<column name>'}`` is appended.

    Returns the list of collected entries.
    """
    expected_keys = (
        'bigquery_type',
        'description',
        'has_sensitive_data',
        'covered_by_dictionary',
    )
    flagged = []

    for package in packages:
        for resource in package['resources']:
            if resource['resource_type'] not in ('bdm_table', 'bdm_dictionary'):
                continue

            if 'columns' not in resource:
                print('{}:{}'.format(package['name'],resource['name']))
                continue

            for column in resource['columns']:
                if 'name' not in column:
                    flagged.append(package['name'])
                elif any(key not in column for key in expected_keys):
                    label = '{}_{}'.format(resource['name'],column['name'])
                    flagged.append({package['name']: label})

    return flagged

def replace_missing_dataset_ids(package):
    """Force every resource's ``dataset_id`` to the package-derived value.

    The value is ``inflection.underscore(package['name'])``. A resource whose
    ``dataset_id`` is absent or different ends up with that value; one that
    already matches is unaffected, so an unconditional assignment yields the
    same result. The package is modified in place and returned.
    """
    expected_id = inflection.underscore(package['name'])

    for resource in package['resources']:
        resource['dataset_id'] = expected_id

    return package

def create_missing_entity_fields(package):
    """Give every resource an ``entity`` key, defaulting to an empty list.

    Resources that already have the key (whatever its value) are left alone.
    The package is modified in place and returned.
    """
    for resource in package['resources']:
        resource.setdefault('entity', [])

    return package

def create_short_description_field(package):
    """Set the package's ``short_description`` to an empty string.

    Any existing value is overwritten. The package is modified in place and
    returned.
    """
    package.update({'short_description': ''})
    return package

def create_partner_organization_field(package):
    """Add an empty ``partner_organization`` to tables and information requests.

    Resources whose ``resource_type`` is ``'bdm_table'`` or
    ``'information_request'`` receive
    ``{'name': '', 'organization_id': ''}``, replacing any existing value.
    The package is modified in place and returned.
    """
    for resource in package['resources']:
        if resource['resource_type'] in ('bdm_table', 'information_request'):
            # Build a fresh dict per resource: sharing one instance would make
            # a later edit to one resource's field show up in all of them.
            resource['partner_organization'] = {'name': '', 'organization_id': ''}

    return package

def replace_missing_column_fields(package):
    """Fill in ``has_sensitive_data`` and ``covered_by_dictionary`` on columns.

    Only resources of type ``bdm_table`` or ``bdm_dictionary`` are touched;
    each must have a ``columns`` key. For every column:

    * ``has_sensitive_data`` absent -> ``'no'``;
    * ``has_sensitive_data`` is ``None`` -> ``'yes'`` when the column name
      contains ``cpf``, ``cnpj``, ``nome`` or ``endereco``, otherwise ``'no'``;
    * ``covered_by_dictionary`` absent or ``None`` -> ``'no'``.

    The package is modified in place and returned.

    Raises:
        AssertionError: if an inspected resource has no ``columns`` key.
    """
    # Name fragments that point to personal or company identifiers.
    sensitive_markers = ('cpf', 'cnpj', 'nome', 'endereco')

    for resource in package['resources']:
        if resource['resource_type'] not in ('bdm_table', 'bdm_dictionary'):
            continue

        assert 'columns' in resource, 'resource {!r} of package {!r} has no columns'.format(
            resource.get('name'), package.get('name'))

        for column in resource['columns']:
            if 'has_sensitive_data' not in column:
                column['has_sensitive_data'] = 'no'
            elif column['has_sensitive_data'] is None:
                # A matching name marks the column as sensitive; the previous
                # code had the two outcomes swapped.
                looks_sensitive = any(marker in column['name'] for marker in sensitive_markers)
                column['has_sensitive_data'] = 'yes' if looks_sensitive else 'no'

            if column.get('covered_by_dictionary') is None:
                column['covered_by_dictionary'] = 'no'

    return package

def migrate_time_unit(package):
    """Fold each resource's ``time_unit`` into its ``entity`` field.

    Resources without ``time_unit``, or with an empty-string one, are left
    untouched. Otherwise the unit is normalised: the multi-year values
    collapse to ``'year'``; ``unique``/``recurring``/``uncertain``/``other``
    become ``None``; any other value is kept. The result replaces a missing
    or ``None`` ``entity``, or is appended when ``entity`` is a list.

    The package is modified in place and returned.
    """
    multi_year_units = ('one_year', 'two_years', 'three_years', 'four_years', 'five_years', 'ten_years')
    discarded_units = ('unique', 'recurring', 'uncertain', 'other')

    for resource in package['resources']:
        if 'time_unit' not in resource or resource['time_unit'] == '':
            continue

        unit = resource['time_unit']
        if unit in multi_year_units:
            migrated_unit = 'year'
        elif unit in discarded_units:
            migrated_unit = None
        else:
            migrated_unit = unit

        # NOTE(review): a None unit is still stored/appended, and a missing
        # entity becomes a scalar rather than a list - confirm this is wanted.
        entity = resource.get('entity')
        if entity is None:
            resource['entity'] = migrated_unit
        elif isinstance(entity, list):
            entity.append(migrated_unit)

    return package

def migrate_data_cleaning_code_url(package):
    """Move ``data_cleaned_by.code_url`` up to ``data_cleaning_code_url``.

    Applies to ``bdm_table`` resources only. The new field gets the value of
    ``code_url`` (or ``''`` when there is none), and ``data_cleaned_by`` is
    replaced by a copy without the ``code_url`` key. The package is modified
    in place and returned.
    """
    for resource in package['resources']:
        if resource['resource_type'] != 'bdm_table':
            continue

        cleaned_by = resource['data_cleaned_by']
        resource['data_cleaning_code_url'] = cleaned_by['code_url'] if 'code_url' in cleaned_by else ''
        resource['data_cleaned_by'] = {
            key: value for key, value in cleaned_by.items() if key != 'code_url'
        }

    return package

def migrate_spatial_coverage(package):
    """Convert the legacy string ``spatial_coverage`` of each resource.

    * Resources named ``'dicionario'`` have the field removed, if present.
    * ``'all'`` becomes ``{'continent': ['all'], 'country': None, ...}``.
    * ``'bra'`` becomes ``{'continent': ['south_america'],
      'country': ['bra'], ...}``.
    * Any other value, or a missing field, is left as it is.

    The package is modified in place and returned.
    """
    for resource in package['resources']:
        if resource['name'] == 'dicionario':
            # pop() with a default: a dictionary resource that never had the
            # field must not abort the migration with a KeyError.
            resource.pop('spatial_coverage', None)
            continue

        coverage = resource.get('spatial_coverage')
        if coverage == 'all':
            resource['spatial_coverage'] = {
                'continent': ['all'],
                'country': None,
                'admin1': None,
                'admin2': None,
            }
        elif coverage == 'bra':
            resource['spatial_coverage'] = {
                'continent': ['south_america'],
                'country': ['bra'],
                'admin1': None,
                'admin2': None,
            }

    return package

def subLists(l):
    """Return an empty list followed by every contiguous slice of ``l``.

    Slices are ordered by end index, then by start index.
    """
    contiguous = [l[start:stop] for stop in range(len(l) + 1) for start in range(stop)]
    return [[]] + contiguous

def stepsList(l):
    """Return the distinct differences between consecutive items of ``l``.

    The result is a list built from a set, so its order is not sorted.
    """
    distinct_steps = set()
    for previous, current in zip(l, l[1:]):
        distinct_steps.add(current - previous)

    return list(distinct_steps)

def migrate_temporal_coverage_field(l):
    """Compress a list of numeric periods into ``first(step)last`` intervals.

    * If any item is a string the list is assumed to be migrated already and
      is returned unchanged.
    * A single-item list yields that item converted to ``str``.
    * Otherwise values are de-duplicated and sorted, the smallest gap between
      neighbours is taken as the step, and each maximal run of values spaced
      by exactly that step becomes ``'first(step)last'``. Values that belong
      to no such run are emitted on their own as strings.

    Args:
        l: List of periods (e.g. years) or of already-migrated strings.

    Returns:
        A list of strings sorted lexicographically (or ``l`` itself when it
        already contains strings).
    """
    if any(isinstance(element, str) for element in l):
        return l

    if len(l) == 1:
        return [str(l[0])]

    # Sorting the distinct values guarantees every step is positive.
    values = sorted(set(l))
    if not values:
        return []
    if len(values) == 1:
        # Duplicates collapsed to one value: there is no gap to take min() of.
        return [str(values[0])]

    step = min(current - previous for previous, current in zip(values, values[1:]))

    # Split into maximal runs whose neighbours are exactly `step` apart.
    runs = [[values[0]]]
    for previous, current in zip(values, values[1:]):
        if current - previous == step:
            runs[-1].append(current)
        else:
            runs.append([current])

    intervals = [
        str(run[0]) if len(run) == 1 else '{}({}){}'.format(run[0], step, run[-1])
        for run in runs
    ]

    return sorted(intervals)
	
def migrate_temporal_coverage(package):
    """Migrate ``temporal_coverage`` on every resource and table column.

    A missing, ``None`` or empty coverage becomes ``[]``; anything else is
    passed through ``migrate_temporal_coverage_field``. A resource-level
    conversion failure is reported on stdout and the original value kept, so
    one bad resource does not stop the run. Columns are processed only for
    ``bdm_table`` resources, and a failure there is not caught.

    The package is modified in place and returned.
    """
    for resource in package['resources']:
        coverage = resource.get('temporal_coverage')
        if coverage is not None and coverage != []:
            try:
                resource['temporal_coverage'] = migrate_temporal_coverage_field(coverage)
            except Exception as error:
                # Best effort, but never swallow KeyboardInterrupt/SystemExit
                # and always show what actually went wrong.
                print(resource['name'], coverage, repr(error))
        else:
            resource['temporal_coverage'] = []

        if resource['resource_type'] == 'bdm_table':
            for column in resource['columns']:
                column_coverage = column.get('temporal_coverage')
                if column_coverage is not None and column_coverage != []:
                    column['temporal_coverage'] = migrate_temporal_coverage_field(column_coverage)
                else:
                    column['temporal_coverage'] = []

    return package

def list_resources_with_no_entity(packages):
    """List resources whose ``entity`` is absent or an empty list.

    Each item of the returned list is ``{package name: resource name}``.
    """
    return [
        {package['name']: resource['name']}
        for package in packages
        for resource in package['resources']
        if 'entity' not in resource or resource['entity'] == []
    ]

def fill_out_observation_level_from_identifying_columns(resource, sols):
    """Attach each identifying column to the resource's ``observation_level``.

    Args:
        resource: Resource dict with ``identifying_columns`` (list of column
            names) and ``observation_level`` (list of dicts).
        sols: Standard observation levels as ``[country, entity, column]``
            triples. A triple matches when its column name is a substring of
            the identifying column; the last matching triple wins.

    For a matched column, every observation level that already has the
    matching entity gets the column added to its ``columns`` list (and its
    ``country`` refreshed if it has one). If no level has that entity a new
    one is appended, with ``country`` omitted when the standard's country is
    empty. An unmatched column is appended as ``{'columns': [column]}``.

    Returns:
        The same resource, modified in place.
    """
    observation_levels = resource['observation_level']

    for identifying_column in resource['identifying_columns']:
        # Scan all standards without stopping: the last match is used.
        # NOTE(review): substring matching means e.g. 'data' also matches
        # 'data_nascimento' - confirm this is intended.
        matched = None
        for sol in sols:
            if sol[2] in identifying_column:
                matched = sol

        if matched is None:
            observation_levels.append({'columns': [identifying_column]})
            continue

        country, entity = matched[0], matched[1]
        same_entity = [
            level for level in observation_levels
            if 'entity' in level and level['entity'] == entity
        ]

        if same_entity:
            for level in same_entity:
                # Use the same 'columns' key as the other branches (this one
                # used to write 'column'), and accumulate so that two columns
                # mapping to one entity (sigla_uf / id_uf) are both kept.
                columns = level.setdefault('columns', [])
                if identifying_column not in columns:
                    columns.append(identifying_column)
                if 'country' in level:
                    level['country'] = country
        else:
            new_level = {} if country == '' else {'country': country}
            new_level['entity'] = entity
            new_level['columns'] = [identifying_column]
            observation_levels.append(new_level)

    return resource

def migrate_observation_level(package):
    """Build ``observation_level`` for every resource of the package.

    Each resource's ``observation_level`` is reset to ``[]``. Resources named
    ``'dicionario'`` stay that way. For the others, every item of a non-empty
    ``entity`` list becomes ``{'entity': item}``, with ``country`` set to
    ``'bra'`` for municipality/district/census_tract and ``'usa'`` for
    county. A non-empty ``identifying_columns`` list is then merged in by
    ``fill_out_observation_level_from_identifying_columns`` using the
    standard levels listed below.

    The package is modified in place and returned.
    """
    brazilian_entities = ('municipality', 'district', 'census_tract')

    # [country, entity, column name] triples used to recognise standard columns.
    standard_observation_levels = [
        ['', 'year', 'ano'],
        ['', 'quarter', 'trimestre'],
        ['', 'month', 'mes'],
        ['', 'date', 'data'],
        ['bra', 'state', 'sigla_uf'],
        ['bra', 'state', 'id_uf'],
        ['bra', 'municipality', 'id_municipio'],
        ['bra', 'district', 'id_distrito'],
        ['bra', 'census_tract', 'id_setor_censitario'],
    ]

    for position, resource in enumerate(package['resources']):
        resource['observation_level'] = []

        # `dicionario` has different metadata from `bdm_table`
        if resource['name'] == 'dicionario':
            continue

        entities = resource.get('entity')
        if entities is not None and entities != []:
            for entity in entities:
                level = {'entity': entity}
                if entity in brazilian_entities:
                    level['country'] = 'bra'
                elif entity == 'county':
                    level['country'] = 'usa'
                resource['observation_level'].append(level)

        identifying_columns = resource.get('identifying_columns')
        if identifying_columns is not None and identifying_columns != []:
            resource = fill_out_observation_level_from_identifying_columns(resource, standard_observation_levels)
            package['resources'][position] = resource

    return package

def migrate_partitions_field(package):
    """Turn the ``partitions`` of each ``bdm_table`` into a list.

    A missing, ``None`` or empty-string value becomes ``[]``; a list is kept
    as it is; any other value has its commas replaced by spaces and is split
    on whitespace. Other resource types are not touched. The package is
    modified in place and returned.
    """
    for resource in package['resources']:
        if resource['resource_type'] != 'bdm_table':
            continue

        partitions = resource.get('partitions')
        if partitions in [None, '']:
            resource['partitions'] = []
        elif type(partitions) != list:
            resource['partitions'] = partitions.replace(",", " ").split()

    return package

def list_datasets(packages, condition):
    """Name the packages having a ``bdm_table`` that satisfies ``condition``.

    ``condition`` is called with a resource dict; resources are tested in
    order and testing stops at the first hit, so each package name appears
    at most once.
    """
    return [
        package['name']
        for package in packages
        if any(
            resource['resource_type'] == 'bdm_table' and condition(resource)
            for resource in package['resources']
        )
    ]

def delete_package_field(package, field):
    """Return a shallow copy of ``package`` without the key ``field``.

    The input dict itself is not modified.
    """
    return {key: value for key, value in package.items() if key != field}

def delete_package_dataset_id(package):
    """Replace the ``value`` of the package's first ``extras`` entry with ``{}``.

    NOTE(review): presumably the first extra is the one carrying the
    dataset id - confirm against the package schema.
    The package is modified in place and returned.
    """
    first_extra = package['extras'][0]
    first_extra['value'] = {}
    return package

def delete_resource_field(package, resource_type, field):
    """Drop the key ``field`` from every resource of the given type.

    Each matching resource is replaced, inside ``package['resources']``, by a
    new dict that lacks the key; other resources are left as they are. The
    package is returned.
    """
    resources = package['resources']

    for position, resource in enumerate(resources):
        if resource['resource_type'] != resource_type:
            continue
        resources[position] = {
            key: value for key, value in resource.items() if key != field
        }

    return package


## Download packages

In [5]:
# Base URLs of the CKAN instances this notebook can download from.
LOCAL_CKAN_URL = 'http://localhost'
DEV_CKAN_URL = 'https://staging.basedosdados.org'
PROD_CKAN_URL = 'https://basedosdados.org'

# Download from the local instance and cache the packages on disk.
# NOTE(review): the cache sub-directory passed here is 'prod' although the
# source is the local instance - confirm this is intended.
local_packages = download_packages(LOCAL_CKAN_URL,'prod')
# Alternative sources, disabled for now:
#dev_packages = download_packages(DEV_CKAN_URL,'dev')
#prod_packages = download_packages(PROD_CKAN_URL,'prod')

## Validation

In [None]:

# Run the validation listings against the locally downloaded packages.
packages = local_packages

# Other checks, enable as needed:
#list_resources_with_no_dataset_id(packages)
#list_tables_with_no_columns(packages)
#list_external_links_with_missing_fields(packages)

# Resources without an entity have to be fixed before observation_level can
# become a required field of bdm_table, external_link and information_request.
# The return value of this call is not stored.
list_resources_with_no_entity(packages)
list_columns_with_missing_fields(packages)


## Migration

In [6]:
class Migrator:
    """Run CKAN actions for a single package through a ``RemoteCKAN`` client.

    Every public method calls one action and, if the action raises
    ``NotFound``, prints the error instead of propagating it. Other
    exceptions propagate. The action's result is discarded.
    """

    def __init__(self, ckan_remote: RemoteCKAN, package_dict):
        self.ckan_remote = ckan_remote
        self.package_dict = package_dict

    def _call(self, action_name, params):
        """Invoke the named action with ``params`` as keyword arguments."""
        try:
            getattr(self.ckan_remote.action, action_name)(**params)
        except NotFound as e:
            print(e)

    def _by_name(self):
        """Build the ``id`` argument from the package's ``name``."""
        return {'id': self.package_dict['name']}

    def create(self):
        """Call ``package_create`` with the whole package dict."""
        self._call('package_create', self.package_dict)

    def update(self):
        """Call ``package_update`` with the whole package dict."""
        self._call('package_update', self.package_dict)

    def purge(self):
        """Call ``dataset_purge`` for the package, identified by name."""
        self._call('dataset_purge', self._by_name())

    def delete(self):
        """Call ``package_delete`` for the package, identified by name."""
        self._call('package_delete', self._by_name())

    def validate(self):
        """Call ``bd_dataset_validate`` with the whole package dict."""
        self._call('bd_dataset_validate', self.package_dict)

In [None]:

import copy

# The API token is read from the environment so it is never committed with
# the notebook. The token that used to be written here should be revoked.
ckan_api_key = os.environ.get('CKAN_API_KEY')
if not ckan_api_key:
    raise RuntimeError(
        'Set the CKAN_API_KEY environment variable to a CKAN API token '
        'before running the migration.'
    )

ckan_remote = RemoteCKAN("http://localhost", ckan_api_key)

# Work on a deep copy so the downloaded packages stay untouched.
# Only the package at index 47 is migrated; widen the slice to migrate more.
packages = copy.deepcopy(local_packages[47:48])
updated_packages = []

for i, package in enumerate(packages):

    print(i, '-', package['name'])

    updated_package = package

    # Create or reset fields.
    updated_package = delete_package_dataset_id(updated_package)
    updated_package = replace_missing_dataset_ids(updated_package)
    updated_package = create_missing_entity_fields(updated_package)
    updated_package = create_short_description_field(updated_package)
    updated_package = create_partner_organization_field(updated_package)

    # Drop package-level fields.
    for field in ('spatial_coverage', 'temporal_coverage', 'update_frequency',
                  'entity', 'time_unit', 'download_type'):
        updated_package = delete_package_field(updated_package, field)

    # Resource-level migrations. These run before the deletions below because
    # migrate_observation_level reads `entity` and `identifying_columns`.
    updated_package = migrate_spatial_coverage(updated_package)
    updated_package = migrate_temporal_coverage(updated_package)
    updated_package = migrate_observation_level(updated_package)

    updated_package = migrate_partitions_field(updated_package)

    for field in ('entity', 'identifying_columns', 'covered_by_dictionary',
                  'time_unit', 'formato'):
        updated_package = delete_resource_field(updated_package, 'bdm_table', field)

    updated_package = migrate_data_cleaning_code_url(updated_package)

    for field in ('entity', 'identifying_columns', 'time_unit', 'formato'):
        updated_package = delete_resource_field(updated_package, 'external_link', field)

    # Runs after the deletions above, so it only affects resource types that
    # still carry `time_unit`.
    updated_package = migrate_time_unit(updated_package)

    updated_package = replace_missing_column_fields(updated_package)

    updated_packages.append(updated_package)

    if updated_package['name'] != 'br-ibge-censo-demografico':
        migrator = Migrator(ckan_remote, updated_package)
    else:
        # Send only the first 29 resources of the census package.
        censo = copy.deepcopy(updated_package)
        censo['resources'] = censo['resources'][0:29] # waiting to solve issue #website/219
        migrator = Migrator(ckan_remote, censo)

    migrator.validate()
    migrator.update()

	

In [None]:

# Print the position of the census package inside the downloaded list.
for k, package in enumerate(local_packages):
	if package['name'] == 'br-ibge-censo-demografico':
		print(k)

# Inspect the first resource of the downloaded package at index 47.
p = local_packages[47]
print(p['name'])
print(p['resources'][0]['name'])
print(p['resources'][0]) #['observation_level'])

# Inspect the second resource of the first migrated package.
pkg = updated_packages[0]
print(pkg['name'])
print(pkg['resources'][1]['name'])
pkg['resources'][1]['spatial_coverage'] #['entity']

In [None]:

# Print the migrated temporal coverage of every resource of the
# 'br-ms-imunizacoes' package, if it is among the migrated packages.
for package in updated_packages:
	if package['name'] == 'br-ms-imunizacoes':
		for resource in package['resources']:
			print(resource['temporal_coverage'])