Autor: Adolfo Eliazat

Assunto: Ingestão dos eventos da API SpotPass (todas as páginas) na tabela Delta spotpass_events

Atualizações:

In [43]:
import requests, json 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from datetime import datetime, date, timedelta
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

StatementMeta(, , , Waiting, )

In [44]:
# Create (or reuse) the Spark session used by every cell of this notebook.
# Fix: the Parquet rebase option was spelled "park.sql..." (missing the leading
# "s"), so the setting was never registered under its real name.
spark = (
    SparkSession.builder
    .master('local')
    .appName('NOTEBOOK_SPOTPASS_LOAD_API_EVENTS_FULL')
    .config('spark.executor.memory', '5gb')
    .config("spark.cores.max", "6")
    # Both LEGACY settings concern date/time handling; presumably needed for
    # old timestamps in the source data -- confirm before removing.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")
    .config("spark.microsoft.delta.optimizeWrite.enabled", "true")
    .config("spark.sql.parquet.vorder.enabled", "true")
    .getOrCreate()
)

StatementMeta(, , , Waiting, )

In [45]:
# Options forwarded to the DataFrame writer through `.options(**write_setup)`.
write_setup = dict(
    compression="gzip",        # use "gzip" instead of "snappy" for compression
    maxRecordsPerFile=500000,  # caps the number of records per output file
)
# NOTE(review): the two keys below look like Spark session settings rather than
# writer options -- confirm they have any effect when passed via .options().
write_setup["spark.sql.files.maxPartitionBytes"] = "128m"
write_setup["spark.sql.parquet.output.committer.class"] = (
    "org.apache.spark.sql.parquet.DirectParquetOutputCommitter"
)

StatementMeta(, , , Waiting, )

**Configurações de data e hora**

In [46]:
# Reference day: yesterday, relative to the local clock of the driver.
date_time = datetime.now() + timedelta(days=-1)
# ISO-style day string (YYYY-MM-DD).
# NOTE(review): this rebinds `date`, shadowing the `datetime.date` class imported
# at the top of the notebook.
date = f"{date_time:%Y-%m-%d}"

StatementMeta(, , , Waiting, )

In [49]:
# Destination handed to `.save()` when the events DataFrame is written as Delta.
delta_table_name = "Tables/spotpass_events"

StatementMeta(, , , Waiting, )

In [50]:
# Define the headers sent with every SpotPass API request.
# The API token is a credential, so it is read from the SPOTPASS_API_TOKEN
# environment variable instead of being stored in the notebook source.
import os

_spotpass_api_token = os.environ.get("SPOTPASS_API_TOKEN", "")
if not _spotpass_api_token:
    # Without a token the API is expected to reject the calls; say so up front.
    print("Warning: SPOTPASS_API_TOKEN is not set; API requests will not be authorized")

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "AUTHORIZATION": _spotpass_api_token
}

StatementMeta(, , , Waiting, )

In [51]:

# Define the URL
url = "https://public.api.spotpass.com.br/events"

# Make the GET request. The timeout keeps a stalled connection from blocking the
# notebook indefinitely (requests waits forever when no timeout is given).
response = requests.get(url, headers=headers, timeout=30)

# Defined up front so later cells can test `dados` even when the call fails.
dados = None

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse JSON response
    response_data = response.json()

    # Keep only the "content" attribute (the list of events)
    dados = response_data.get("content", None)
    # NOTE(review): this prints full client records (names, e-mails, documents)
    # into the notebook output -- clear the output before sharing the notebook.
    print("content:", dados)
else:
    print(f"Error: {response.status_code}")


StatementMeta(, , , Waiting, )

content: [{'id': '0555ade3-dcde-4c62-87bd-238c251c350e', 'name': 'BAR 1 ', 'start_date': '2022-07-06T06:00:00.000-03:00', 'end_date': '2022-07-10T23:00:00.000-03:00', 'created_at': '2022-07-05T12:08:22.690-03:00', 'updated_at': '2022-07-06T12:13:10.827-03:00', 'client': {'id': 28, 'legal_type': 'legal_person', 'document': '10389092000105', 'name': 'EL DOURADO FEIRAS E EVENTOS LTDA', 'telephone': '1150312017', 'company_name': 'MARCO OU JOÃO', 'state_registration': '3.817.604-1', 'person_id': None, 'zipcode': '04301002', 'street': 'Avenida Miguel Estefno', 'number': '3900', 'complement': 'MEZANINO SALA 01', 'neighborhood': 'Saúde', 'city': 'São Paulo', 'state': 'SP', 'country': 'Brasil', 'email': 'financeiro@grupoled.com.br', 'created_at': '2021-09-16T15:14:08.890-03:00'}}, {'id': '05c3150b-4fb6-45e2-9e75-fb5b306b4d0d', 'name': 'BAR 3 ', 'start_date': '2023-02-11T06:00:00.000-03:00', 'end_date': '2023-02-13T06:00:00.000-03:00', 'created_at': '2023-02-09T14:46:45.822-03:00', 'updated_at':

In [None]:
# Schema of one event record as returned by the API. Every attribute is read as
# a nullable string except the numeric client id.
_event_string_fields = [
    "id", "name", "start_date", "end_date", "created_at", "updated_at",
]
_client_string_fields = [
    "legal_type", "document", "name", "telephone", "company_name",
    "state_registration", "person_id", "zipcode", "street", "number",
    "complement", "neighborhood", "city", "state", "country", "email",
    "created_at",
]

# Nested struct describing the "client" object embedded in each event.
_client_schema = StructType(
    [StructField("id", IntegerType(), True)]
    + [StructField(field, StringType(), True) for field in _client_string_fields]
)

schema = StructType(
    [StructField(field, StringType(), True) for field in _event_string_fields]
    + [StructField("client", _client_schema)]
)

StatementMeta(, , , Waiting, )

In [None]:
def fetch_data(url, timeout=30):
    """Fetch one page from the API and return its decoded JSON body.

    Args:
        url: Full URL of the page to request.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200,
        otherwise ``None`` (the failure is printed, not raised).
    """
    # `headers` is the module-level dict carrying the API credentials.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    print(f"Error fetching data from {url} (HTTP {response.status_code})")
    return None

def extract_events(response_data):
    """Return the value under "content" in a page payload.

    A missing payload (``None``) or a payload without the "content" key both
    give an empty list.
    """
    return [] if response_data is None else response_data.get("content", [])

StatementMeta(, , , Waiting, )

In [None]:
# Obtain the Spark session for this cell under the "API Pagination" app name.
spark = SparkSession.builder.appName("API Pagination").getOrCreate()

# Define the base URL; the page number is appended as a query parameter.
base_url = "https://public.api.spotpass.com.br/events"

# Define the headers. The API token is a credential, so it is read from the
# SPOTPASS_API_TOKEN environment variable instead of being stored in the
# notebook source.
import os

_spotpass_api_token = os.environ.get("SPOTPASS_API_TOKEN", "")
if not _spotpass_api_token:
    # Without a token the API is expected to reject the calls; say so up front.
    print("Warning: SPOTPASS_API_TOKEN is not set; API requests will not be authorized")

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "AUTHORIZATION": _spotpass_api_token
}

# List to store the events of every page
all_events = []

# Pagination starts at page 1
page = 1

# Loop until the API returns a page without events
while True:
    url = f"{base_url}?page={page}"
    print(f"Fetching data from {url}")

    # Fetch data for the current page
    response_data = fetch_data(url)

    # fetch_data returns None when the request fails. That must not be mistaken
    # for "no more pages": the result is later written with mode("overwrite"),
    # so a truncated list would replace the full table with partial data.
    if response_data is None:
        raise RuntimeError(
            f"Failed to fetch page {page} from {base_url}; "
            "aborting so that a partial result is not written"
        )

    # Extract events from the response
    events = extract_events(response_data)

    # An empty page marks the end of the listing
    if not events:
        break

    all_events.extend(events)
    page += 1

# Build the events DataFrame from the collected records using the explicit schema.
df_events = spark.createDataFrame(all_events, schema)

# Top-level event attributes are kept under their own names.
_event_columns = ["id", "name", "start_date", "end_date", "created_at", "updated_at"]

# Attributes of the nested "client" struct, in output order; each one becomes a
# flat column named "client_<attribute>".
_client_attributes = [
    "id", "legal_type", "document", "name", "telephone", "company_name",
    "state_registration", "person_id", "zipcode", "street", "number",
    "complement", "neighborhood", "city", "state", "country", "email",
    "created_at",
]
_client_columns = [
    col(f"client.{attribute}").alias(f"client_{attribute}")
    for attribute in _client_attributes
]

df_events_exploded = df_events.select(*_event_columns, *_client_columns)
# Show the DataFrame
#df_events_exploded.show(truncate=False)


StatementMeta(, , , Waiting, )

Fetching data from https://public.api.spotpass.com.br/events?page=1


Fetching data from https://public.api.spotpass.com.br/events?page=2
Fetching data from https://public.api.spotpass.com.br/events?page=3


Fetching data from https://public.api.spotpass.com.br/events?page=4
Fetching data from https://public.api.spotpass.com.br/events?page=5


Fetching data from https://public.api.spotpass.com.br/events?page=6
Fetching data from https://public.api.spotpass.com.br/events?page=7
Fetching data from https://public.api.spotpass.com.br/events?page=8


Fetching data from https://public.api.spotpass.com.br/events?page=9
Fetching data from https://public.api.spotpass.com.br/events?page=10
Fetching data from https://public.api.spotpass.com.br/events?page=11


+------------------------------------+------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+---------+-----------------+---------------+--------------------------------+----------------+-------------------+-------------------------+----------------+--------------+----------------------+-------------+-----------------+-------------------+-----------+------------+--------------+--------------------------+-----------------------------+
|id                                  |name                    |start_date                   |end_date                     |created_at                   |updated_at                   |client_id|client_legal_type|client_document|client_name                     |client_telephone|client_company_name|client_state_registration|client_person_id|client_zipcode|client_street         |client_number|client_complement|client_neighborhood|client_city|client_state|client_country|cl

In [52]:
# Render the flattened events DataFrame in the notebook output.
display(df_events_exploded)

StatementMeta(, , , Waiting, )

SynapseWidget(Synapse.DataFrame, f42da860-c031-4814-854a-e685c0cf54e8)

In [53]:
# Count the rows of the flattened events DataFrame (this runs a Spark job).
row_count = df_events_exploded.count()

# Print the row count. The label used to say "Accounts", but this frame holds events.
print(f"Events count: {row_count}")

StatementMeta(, , , Waiting, )

Accounts count: 150


In [54]:
# Write only when there are rows, so an empty result never overwrites the target.
if not df_events_exploded.isEmpty():
    delta_writer = (
        df_events_exploded.write
        .format("delta")
        .options(**write_setup)
        .option("mergeSchema", "true")
        .option("parquet.vorder.enabled", "force_true")
        .mode("overwrite")  # replaces whatever the target currently holds
    )
    delta_writer.save(delta_table_name)

StatementMeta(, , , Waiting, )

In [55]:
%%sql
-- Row count of the events table.
-- Presumably the table written above to Tables/spotpass_events; confirm the lakehouse.
SELECT count(*) 
FROM LAKEHOUSE_ELDOURADO.spotpass_events

StatementMeta(, , , Waiting, )

InterpreterError: Fail to start interpreter.
detail: requirement failed: SparkContext has been stopped
error message: 


In [56]:
%%sql
-- Preview the events table.
-- The previous FROM clause used '{delta_table_name}', which is a quoted string
-- rather than an identifier, and that variable holds the path
-- "Tables/spotpass_events", not a table name. The table is referenced directly.
SELECT *
FROM LAKEHOUSE_ELDOURADO.spotpass_events

StatementMeta(, , , Waiting, )

InterpreterError: Fail to start interpreter.
detail: requirement failed: SparkContext has been stopped
error message: 
