# Consumo dos dados do serviço 311 da cidade de Boston de 2015 a 2020

* Criando subpasta de dados

In [2]:
from pathlib import Path

# Relative folder that receives the downloaded CSV files and the test file.
path_relativo = "../data"
# Pure-Python equivalent of `mkdir -p`: it does not depend on a POSIX shell and
# reuses the path variable instead of repeating the literal in a shell command.
Path(path_relativo).mkdir(parents=True, exist_ok=True)

* Dicionário de anos e URLs

In [3]:
# Every yearly dump lives under the same endpoint; only the resource id changes.
DUMP_BASE_URL = "https://data.boston.gov/datastore/dump"

# Year -> resource id of that year's dump.
RESOURCE_IDS = {
    2015: "c9509ab4-6f6d-4b97-979a-0cf2a10c922b",
    2016: "b7ea6b1b-3ca4-4c5b-9713-6dc1db52379a",
    2017: "30022137-709d-465e-baae-ca155b51927d",
    2018: "2be28d90-3a90-4af1-a3f6-f28c1e25880a",
    2019: "ea2e4696-4a2d-429c-9807-d02eb92e0222",
    2020: "e6013a93-1321-4f2a-bf91-8d8a02f1e62f",
}

# Year -> metadata dict; later cells add more keys (e.g. the local file name).
urls = {
    year: {"url": f"{DUMP_BASE_URL}/{resource_id}?bom=True"}
    for year, resource_id in RESOURCE_IDS.items()
}

* Função de download

In [4]:
import urllib.request as rq

def extract_data(url, filename):
    """Download ``url`` and store it at ``filename``.

    The payload is first written to ``<filename>.part`` and only moved to
    ``filename`` once the transfer has finished, so a failed or interrupted
    download never leaves a truncated file at the destination and never
    overwrites a file left there by an earlier successful run.

    Args:
        url: Address of the resource to fetch.
        filename: Destination path for the downloaded file.

    Returns:
        True when the file was downloaded and moved into place, False when
        any error occurred. Errors are printed rather than raised, so the
        download stays best-effort.
    """
    import os  # local import: only ``urllib.request`` is imported alongside this function

    partial = f"{filename}.part"
    try:
        rq.urlretrieve(url, partial)
        os.replace(partial, filename)
    except Exception as e:
        print(e)
        # Discard whatever was written before the failure.
        try:
            if os.path.exists(partial):
                os.remove(partial)
        except OSError:
            pass
        return False
    return True

* Fazendo o download dos dados

In [5]:

# Walk the year -> metadata mapping; each entry also records where its CSV is saved
# so that later cells can find the file.
for ano, info in urls.items():
    url = info['url']
    file_name = f'{path_relativo}/dados_{ano}.csv'
    info['file_name'] = file_name
    print(f'Downloading year {ano} from {url} to {file_name}')
    extract_data(url, file_name)

Downloading year 2015 from https://data.boston.gov/datastore/dump/c9509ab4-6f6d-4b97-979a-0cf2a10c922b?bom=True to ../data/dados_2015.csv
Downloading year 2016 from https://data.boston.gov/datastore/dump/b7ea6b1b-3ca4-4c5b-9713-6dc1db52379a?bom=True to ../data/dados_2016.csv
Downloading year 2017 from https://data.boston.gov/datastore/dump/30022137-709d-465e-baae-ca155b51927d?bom=True to ../data/dados_2017.csv
Downloading year 2018 from https://data.boston.gov/datastore/dump/2be28d90-3a90-4af1-a3f6-f28c1e25880a?bom=True to ../data/dados_2018.csv
Downloading year 2019 from https://data.boston.gov/datastore/dump/ea2e4696-4a2d-429c-9807-d02eb92e0222?bom=True to ../data/dados_2019.csv
Downloading year 2020 from https://data.boston.gov/datastore/dump/e6013a93-1321-4f2a-bf91-8d8a02f1e62f?bom=True to ../data/dados_2020.csv


# Ingestão dos dados no S3 através da biblioteca boto3

In [6]:
import boto3

* As credenciais não devem estar no código por questão de segurança

In [None]:
from getpass import getpass

# getpass() does not echo what is typed, so the credentials are not rendered in
# the cell output (and therefore not saved with the notebook) as they would be
# with input(). strip() drops whitespace that often comes along when pasting.
aws_access_key_id = getpass("aws_access_key_id").strip()
aws_secret_access_key = getpass("aws_secret_access_key").strip()
# Region used for the boto3 session.
region_name = 'us-east-1'

* Criando a sessão padrão do boto com as credenciais

In [8]:
# Gather the session settings in one place, then install them as boto3's default session.
session_settings = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': region_name,
}
boto3.setup_default_session(**session_settings)

* Criando cliente para o acesso ao serviço AWS S3

In [9]:
# S3 client built from boto3's default session.
s3 = boto3.client(service_name='s3')

* Conteúdo para testar a conexão

In [10]:
# Small local file used only to check that uploads to S3 work.
arquivo_teste = f'{path_relativo}/teste.txt'
content = '''
    Olá AWS S3 !!!
'''
# The text contains a non-ASCII character ("á"), so the encoding is pinned
# instead of relying on the platform default; the file is only written, never
# read back here, so plain 'w' mode is sufficient.
with open(arquivo_teste, 'w', encoding='utf-8') as f:
    f.write(content)

* Agora vamos enviar o arquivo de teste para o nosso baldinho (bucket) :) na camada bronze

In [None]:
# Send the local test file to the bucket under the bronze prefix.
s3.upload_file(
    Filename=arquivo_teste,
    Bucket='alura-datalakeaws-carlos',
    Key='bronze/teste',
)

* Teste bem-sucedido, vamos carregar os DataFrames (DFs)

In [None]:
import pandas as pd

# Year -> DataFrame read from the CSV path recorded for that year.
dfs = {}
for ano, info in urls.items():
    file_name = info['file_name']
    print(file_name)
    dfs[ano] = pd.read_csv(file_name)

* Com os DFs carregados, vamos converter os dados para o formato Parquet e enviá-los para a camada bronze do S3

In [None]:
from io import BytesIO

# Serialize each yearly DataFrame to Parquet in memory and store it under the
# bronze prefix of the bucket, one object per year.
for ano, df in dfs.items():
    parquet_buffer = BytesIO()
    df.to_parquet(parquet_buffer)

    object_key = f'bronze/dados_{ano}.parquet'
    payload = parquet_buffer.getvalue()
    s3.put_object(Bucket='alura-datalakeaws-carlos', Key=object_key, Body=payload)