In [1]:
import requests
import pandas as pd
from time import sleep
class CkanConsumer:
    def __init__(self, main_url:str, secure:bool = True):
        if secure:
            self.url = f'https://{main_url}/api/action/datastore_search'
        else:
            self.url = f'http://{main_url}/api/action/datastore_search'

    def request(self, resource_id) -> pd.DataFrame:
        params = {
            'limit': 1000,
            'offset': 0,
            'resource_id': resource_id
        }
        data = []

        print(f'[INFO] - Getting data from {self.url} on resource {resource_id} at offset {len(data)}')
        response = requests.get(self.url, params=params)
        total = response.json()['result']['total']
        data.extend(response.json()['result']['records'])

        while len(data) < total:
            params['offset'] = len(data)
            print(f'[INFO] - Getting data from {self.url} on resource {resource_id} at offset {len(data)}')
            response = requests.get(self.url, params=params)
            while response.status_code != 200:
                print(f'[ERROR] - Response code {response.status_code} from {self.url} on resource {resource_id} at offset {len(data)}')
                print(f'[INFO] - Waiting 5 seconds before trying again')
                sleep(5)
                response = requests.get(self.url, params=params)
            data.extend(response.json()['result']['records'])
        
        print(f'[INFO] - Total of {len(data)} records retrieved from {self.url} on resource {resource_id}')
        #Reassembling the dataframe
        df = pd.DataFrame(data)
        return df

In [13]:
import pandas as pd
from hashlib import md5

def mapper_discentes(
        dataframe: pd.DataFrame,
        instituto: str,
        resource_id: str,
        nome_discente: str,
        matricula: str,
        sexo: str,
        ano_ingresso: str,
        periodo_ingresso: str,
        nome_curso: str
    ) -> pd.DataFrame:
    #create collum if not exists
    for col in [sexo, ano_ingresso, periodo_ingresso, nome_curso]:
        if col not in dataframe.columns:
            dataframe[col] = 0
    result = dataframe.apply(lambda df: pd.Series({
        'nome': str(df[nome_discente]).upper(),
        'id': md5((f"{df[matricula]}{instituto}{resource_id}").encode()).hexdigest(),
        'matricula': str(df[matricula]),
        'sexo': df[sexo],
        'data_ingresso': f"{str(df[ano_ingresso])}.{str(df[periodo_ingresso])}",
        'curso': df[nome_curso]
    }), axis = 1)
    return result

def mapper_docentes(
        dataframe: pd.DataFrame,
        instituto: str,
        resource_id: str,
        nome_docente: str,
        siape: str,
        sexo: str,
        ano_ingresso: str,
        periodo_ingresso: str,
        lotacao: str
    ) -> pd.DataFrame:
    for col in [sexo, ano_ingresso, periodo_ingresso, lotacao]:
        if col not in dataframe.columns:
            dataframe[col] = 0
    result = dataframe.apply(lambda df: pd.Series({
        'nome': str(df[nome_docente]).upper(),
        'id': md5((f"{df[siape]}{instituto}{resource_id}").encode()).hexdigest(),
        'siape': str(df[siape]),
        'sexo': df[sexo],
        'data_ingresso': f"{str(df[ano_ingresso])}.{str(df[periodo_ingresso])}",
        'lotacao': df[lotacao]
    }), axis = 1)
    return result

def mapper_cursos(
        dataframe: pd.DataFrame,
        instituto: str,
        resource_id: str,
        nome_curso: str,
        id_curso: str
    ) -> pd.DataFrame:
    result = dataframe.apply(lambda df: pd.Series({
        'nome': str(df[nome_curso]).upper(),
        'id': md5((f"{df[nome_curso]}{instituto}{resource_id}").encode()).hexdigest(),
        'codigo': str(df[id_curso]),
        'instituicao': instituto
    }), axis = 1)
    return result

def trunc_date(df: pd.DataFrame, coluna_datetime, type :str) -> pd.DataFrame:
    if type == 'ano_periodo':
        df[coluna_datetime] = pd.to_datetime(df[coluna_datetime], format='%Y.%m',errors='coerce')
        df['ano_ingresso'] = df[coluna_datetime].dt.year
        df['periodo_ingresso'] = df[coluna_datetime].dt.month
        df['periodo_ingresso'] = df['periodo_ingresso'].apply(lambda x: 1 if x == 1 else 2)
    return df

In [18]:
ufca_consumer = CkanConsumer('dados.ufca.edu.br', secure=True)
cursos_ufca = ufca_consumer.request('5f31e620-a366-42c9-a54c-96da666c93b7')
docentes_ufca = ufca_consumer.request('6b2dbca5-58f8-472e-bc6a-eb827e631873')


[INFO] - Getting data from https://dados.ufca.edu.br/api/action/datastore_search on resource 5f31e620-a366-42c9-a54c-96da666c93b7 at offset 0
[INFO] - Total of 25 records retrieved from https://dados.ufca.edu.br/api/action/datastore_search on resource 5f31e620-a366-42c9-a54c-96da666c93b7
[INFO] - Getting data from https://dados.ufca.edu.br/api/action/datastore_search on resource 6b2dbca5-58f8-472e-bc6a-eb827e631873 at offset 0
[INFO] - Getting data from https://dados.ufca.edu.br/api/action/datastore_search on resource 6b2dbca5-58f8-472e-bc6a-eb827e631873 at offset 1000
[INFO] - Total of 1013 records retrieved from https://dados.ufca.edu.br/api/action/datastore_search on resource 6b2dbca5-58f8-472e-bc6a-eb827e631873


In [23]:
docentes_ufca.head()

Unnamed: 0,nome,id,siape,sexo,data_ingresso,lotacao
0,ADRIANA CRISTINA GOMES DE ARAUJO,ab58379b198e3c6ae4047b03cf67d7e1,120****,0,0.0,Universidade Federal do Cariri - CE
1,ALINE RODRIGUES BEZERRA OLIVEIRA,aebdecd06ef171e18ba56db2cc951463,122****,0,0.0,Universidade Federal do Cariri - CE
2,ARAMIDIS CIBELLY MOURA DE MORAIS,f972fca9d7a1cbbe68867bf388b90c1b,192****,0,0.0,Universidade Federal do Cariri - CE
3,ARTUR COSTA DE SOUZA,3ca9e196bd0a363cc19621c07ee0564e,125****,0,0.0,Universidade Federal do Cariri - CE
4,BRENDA PORFIRIO SAMPAIO,aec62dcb2e829ef386a2e4914b5a4ffc,299****,0,0.0,Universidade Federal do Cariri - CE


In [17]:
discentes_ifsm.head()


Unnamed: 0,nome,id,matricula,sexo,data_ingresso,curso
0,ABIAS MACIEL DELVALHE,9bfc393d374a9d435ae338daddec02ed,“5228”,0,2013.0.2,SISTEMAS PARA INTERNET
1,ABMAEL DE ARRUDA CASTOR,11545aceb1bae9c9c6fd348171e9bf2d,22089,0,2017.0.2,OPERADOR DE COMPUTADOR
2,ABRAAO DANIEL PEREIRA DE BRITO,2c19564eb5838b2d198366f7782c0c2e,20689,0,2017.0.2,TÉCNICO EM EDIFICAÇÕES
3,ADAILSON SOARES MARTINS DA SILVA,5048de12f1291a8be8f080f684778018,17228,0,2017.0.2,TÉCNICO EM INFORMÁTICA
4,ADAILTON FERNANDEZ DO NASCIMENTO,24b095dc18d8cfae12260b1508ba6e9e,24244,0,2017.0.2,VENDEDOR
