In [1]:
# Report the PySpark release that this kernel imports.
import pyspark
print(pyspark.__version__)


3.5.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Spark master URL handed to SparkSession.builder.master() when the session is built.
spark_server = "spark://spark-master-otmzsp:7077"

In [4]:
# Build the Spark session with the Delta Lake and hadoop-aws (S3A) packages and
# point the S3A filesystem at the MinIO endpoint.
import os

# The S3A credentials are read from the S3A_ACCESS_KEY / S3A_SECRET_KEY environment
# variables so real secrets never have to be committed with the notebook. The
# fallbacks are the values that used to be hard-coded here; they keep the notebook
# runnable as before and are only appropriate for a local development MinIO.
spark = (
    SparkSession.builder
    .appName("MinIO Example - server - 2")
    .master(spark_server)
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.1.0,org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("S3A_ACCESS_KEY", "admin"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("S3A_SECRET_KEY", "minioadmin"))
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio-otmzsp:9000")
    # The endpoint above is plain HTTP, so SSL is switched off for S3A connections.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [5]:
# Render the session object as the cell output.
spark

In [6]:
# S3A locations used by this notebook.
raw = "s3a://raw/posicoes"          # input path handed to spark.read ... .json()
trusted = "s3a://trusted/posicoes"  # trusted-bucket path; not referenced in the cells visible here

In [7]:
# Shell out to print the Python version.
# NOTE(review): this reports whichever `python` is first on PATH, which is not
# necessarily the interpreter running this kernel — confirm they are the same.
!python --version

Python 3.11.6


In [9]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType, TimestampType, FloatType

# Explicit schema for the raw position JSON. Every field is nullable.
# Trailing comments give the column name a field receives when the frame is
# flattened further down; "sv" and "is" are not selected by that step.
# NOTE(review): "py"/"px" (latitude/longitude) are read as FloatType, i.e. 32-bit
# floats — confirm that precision is sufficient for the source coordinates.
schema = StructType(
    [
        StructField("hr", StringType(), nullable=True),  # veiculo_horario_referencia
        StructField(
            "l",
            ArrayType(
                StructType(
                    [
                        StructField("c", StringType(), nullable=True),    # veiculo_letreiro_completo
                        StructField("cl", IntegerType(), nullable=True),  # veiculo_linha_codigo
                        StructField("sl", IntegerType(), nullable=True),  # veiculo_sentido
                        StructField("lt0", StringType(), nullable=True),  # veiculo_letreiro_destino
                        StructField("lt1", StringType(), nullable=True),  # veiculo_letreiro_origem
                        StructField("qv", IntegerType(), nullable=True),  # veiculo_quantidade
                        StructField(
                            "vs",
                            ArrayType(
                                StructType(
                                    [
                                        StructField("p", IntegerType(), nullable=True),     # veiculo_prefixo
                                        StructField("a", BooleanType(), nullable=True),     # veiculo_acessibilidade
                                        StructField("ta", TimestampType(), nullable=True),  # veiculo_horario_utc_captura
                                        StructField("py", FloatType(), nullable=True),      # veiculo_latitude
                                        StructField("px", FloatType(), nullable=True),      # veiculo_longitude
                                        StructField("sv", StringType(), nullable=True),
                                        StructField("is", StringType(), nullable=True),
                                    ]
                                )
                            ),
                            nullable=True,
                        ),
                    ]
                )
            ),
            nullable=True,
        ),
    ]
)





In [10]:
# Read the JSON files under `raw`, applying the explicit schema instead of inferring one.
# NOTE(review): the resulting frame also carries a `datepartition` date column that is
# not declared in `schema`; presumably it comes from partition directories under
# `raw` — confirm.
df = spark.read.schema(schema).json(raw)

In [11]:
# Display the DataFrame's column names and types.
df

DataFrame[hr: string, l: array<struct<c:string,cl:int,sl:int,lt0:string,lt1:string,qv:int,vs:array<struct<p:int,a:boolean,ta:timestamp,py:float,px:float,sv:string,is:string>>>>, datepartition: date]

In [12]:
# Print a sample of rows; long nested values are truncated in the printed table.
df.show()

+-----+--------------------+-------------+
|   hr|                   l|datepartition|
+-----+--------------------+-------------+
|08:11|[{3902-10, 34238,...|   2024-09-18|
|08:21|[{3902-10, 34238,...|   2024-09-18|
|08:15|[{3902-10, 34238,...|   2024-09-18|
|08:17|[{3902-10, 34238,...|   2024-09-18|
|08:07|[{3902-10, 34238,...|   2024-09-18|
|08:13|[{3902-10, 34238,...|   2024-09-18|
|08:09|[{3902-10, 34238,...|   2024-09-18|
|08:23|[{3902-10, 34238,...|   2024-09-18|
|08:27|[{3902-10, 34238,...|   2024-09-18|
|08:31|[{3902-10, 34238,...|   2024-09-18|
|08:19|[{3902-10, 34238,...|   2024-09-18|
|08:29|[{3902-10, 34238,...|   2024-09-18|
|08:39|[{3902-10, 1470, ...|   2024-09-18|
|08:35|[{3902-10, 1470, ...|   2024-09-18|
|08:37|[{3902-10, 1470, ...|   2024-09-18|
|08:41|[{3902-10, 1470, ...|   2024-09-18|
|08:43|[{3902-10, 1470, ...|   2024-09-18|
|08:49|[{3902-10, 1470, ...|   2024-09-18|
|08:51|[{3902-10, 1470, ...|   2024-09-18|
|08:53|[{3902-10, 1470, ...|   2024-09-18|
+-----+----

In [14]:
from pyspark.sql.functions import explode, col

# Unnest the "l" array: one output row per array element, exposed as the struct
# column "linha". explode() emits no row when the array is null or empty.
df_exploded = df.select(col("hr"), explode(col("l")).alias("linha"))

# Flatten to one row per vehicle in two passes:
#   1. rename the "linha" fields and unnest the "linha.vs" array into "veiculo";
#   2. keep everything and lift the "veiculo" fields to top-level columns.
# The helper struct column is dropped at the end. "linha" no longer exists after
# the first pass, so naming it in drop() has no effect.
df_raw_posicao = (
    df_exploded
    .select(
        col("hr").alias("veiculo_horario_referencia"),
        col("linha.c").alias("veiculo_letreiro_completo"),
        col("linha.cl").alias("veiculo_linha_codigo"),
        col("linha.sl").alias("veiculo_sentido"),
        col("linha.lt0").alias("veiculo_letreiro_destino"),
        col("linha.lt1").alias("veiculo_letreiro_origem"),
        col("linha.qv").alias("veiculo_quantidade"),
        explode(col("linha.vs")).alias("veiculo"),
    )
    .select(
        "*",
        col("veiculo.p").alias("veiculo_prefixo"),
        col("veiculo.a").alias("veiculo_acessibilidade"),
        col("veiculo.ta").alias("veiculo_horario_utc_captura"),
        col("veiculo.py").alias("veiculo_latitude"),
        col("veiculo.px").alias("veiculo_longitude"),
    )
    .drop("linha", "veiculo")
)

# Print the schema of the flattened DataFrame.
print("Esquema do DataFrame final:")
df_raw_posicao.printSchema()

# Print up to 100 rows of the flattened DataFrame without truncating cell values.
print("\nAmostras do DataFrame final:")
df_raw_posicao.show(n=100, truncate=False)

Esquema do DataFrame final:
root
 |-- veiculo_horario_referencia: string (nullable = true)
 |-- veiculo_letreiro_completo: string (nullable = true)
 |-- veiculo_linha_codigo: integer (nullable = true)
 |-- veiculo_sentido: integer (nullable = true)
 |-- veiculo_letreiro_destino: string (nullable = true)
 |-- veiculo_letreiro_origem: string (nullable = true)
 |-- veiculo_quantidade: integer (nullable = true)
 |-- veiculo_prefixo: integer (nullable = true)
 |-- veiculo_acessibilidade: boolean (nullable = true)
 |-- veiculo_horario_utc_captura: timestamp (nullable = true)
 |-- veiculo_latitude: float (nullable = true)
 |-- veiculo_longitude: float (nullable = true)


Amostras do DataFrame final:
+--------------------------+-------------------------+--------------------+---------------+------------------------+-----------------------+------------------+---------------+----------------------+---------------------------+----------------+-----------------+
|veiculo_horario_referencia|veiculo_