# Camada Bronze

## Importações

In [0]:
%python
import requests
from io import BytesIO
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

## Spark Session

In [0]:
%python
# Reuse the Spark session that is already active, or start a new one if none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

## Token de autenticação

In [0]:
%python
# Declare a text widget called "token" (empty by default) and read its current value.
# NOTE(review): `token` is not referenced by any other cell visible in this notebook
# section — confirm whether the downloads are meant to send it.
dbutils.widgets.text(
    name="token",
    defaultValue="",
    label="token",
)
token = dbutils.widgets.get("token")

## Remoção da tabela existente antes da sobrescrita

In [0]:
%sql
-- Remove the bronze_vendas table when it is present; a no-op otherwise.
-- NOTE(review): no cell visible in this notebook writes to bronze_vendas — confirm
-- that this is the table that should be dropped.
DROP TABLE IF EXISTS workspace.projeto_vendas_bronze.bronze_vendas
     

## Mapeamento das tabelas

In [0]:
%python
# Maps each Bronze table name to the URL of the raw CSV file it is loaded from.
# Insertion order is the order in which the ingestion loop processes the tables.
tables = dict(
    customers="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_customers_dataset.csv",
    geolocation="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_geolocation_dataset.csv",
    order_items="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_order_items_dataset.csv",
    payment="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_order_payments_dataset.csv",
    review="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_order_reviews_dataset.csv",
    orders="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_orders_dataset.csv",
    products="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_products_dataset.csv",
    seller="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/olist_sellers_dataset.csv",
    category="https://raw.githubusercontent.com/albvieiraa/project_sales/main/raw-data/product_category_name_translation.csv",
)

## Ingestão

In [0]:
%python
# Bronze ingestion: download every CSV listed in `tables` and persist it, as raw text
# columns, to a Delta table named after its key in the Bronze schema.
for table_name, url in tables.items():

    print(f"Ingerindo tabela: {table_name}")

    # A timeout keeps the notebook from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Read every column as text. Letting pandas infer types first and casting to str
    # afterwards corrupts the raw values: leading zeros are dropped (a code such as
    # "01037" would become "1037") and integer columns with gaps turn into floats
    # ("40" would become "40.0").
    df_pd = pd.read_csv(BytesIO(response.content), dtype=str)

    # Keep missing values as real NULLs instead of the literal string "nan".
    df_pd = df_pd.astype(object).where(df_pd.notna(), None)

    # Convert to Spark with an explicit all-string schema so that no type inference
    # is needed (inference fails on a column that contains only NULLs).
    string_schema = ", ".join(f"`{column}` string" for column in df_pd.columns)
    df_spark = spark.createDataFrame(df_pd, schema=string_schema)

    # Write to the Bronze layer. The catalog is spelled out so the target matches the
    # fully-qualified names used by the SQL cells, whatever the session's current
    # catalog happens to be.
    (
        df_spark.write
        .mode("overwrite")
        .format("delta")
        .saveAsTable(f"workspace.projeto_vendas_bronze.{table_name}")
    )


## Visualizando as tabelas

In [0]:
%sql
-- Every row and column of the Bronze customers table.
SELECT *
FROM workspace.projeto_vendas_bronze.customers

In [0]:
%sql
-- Customers whose customer_state is 'PE'.
SELECT *
FROM workspace.projeto_vendas_bronze.customers
WHERE customer_state = 'PE'

In [0]:
%sql
-- Every row and column of the Bronze orders table.
SELECT *
FROM workspace.projeto_vendas_bronze.orders

In [0]:
%sql
-- Orders whose status is neither 'canceled' nor 'delivered'.
SELECT *
FROM workspace.projeto_vendas_bronze.orders
WHERE order_status NOT IN ('canceled', 'delivered')

In [0]:
%sql
-- Every row and column of the Bronze order_items table.
SELECT *
FROM workspace.projeto_vendas_bronze.order_items

In [0]:
%sql
-- Order items from the most expensive to the cheapest.
-- The Bronze ingestion stores every column as a string, so sorting on the raw column
-- would compare text ('99.9' would rank above '6735.0'). Sort on the numeric value
-- instead; TRY_CAST yields NULL for any value that is not a valid number.
SELECT *
FROM workspace.projeto_vendas_bronze.order_items
ORDER BY TRY_CAST(price AS DOUBLE) DESC;

In [0]:
%sql
-- Every row and column of the Bronze products table.
SELECT *
FROM workspace.projeto_vendas_bronze.products

In [0]:
%sql
-- Up to ten products in the 'beleza_saude' category.
SELECT *
FROM workspace.projeto_vendas_bronze.products
WHERE product_category_name = 'beleza_saude'
LIMIT 10;

In [0]:
%sql
-- The distinct product category names present in the Bronze products table.
SELECT DISTINCT
  product_category_name
FROM workspace.projeto_vendas_bronze.products;

In [0]:
%sql
-- The distinct order statuses present in the Bronze orders table.
SELECT DISTINCT
  order_status
FROM workspace.projeto_vendas_bronze.orders