# Imports

In [None]:
!pip install pyspark

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr

# Configs

In [57]:
# Create the Spark session for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName('Case Oper')
    .getOrCreate()
)

# Location of the input JSON file and the directory that receives this question's outputs
path_json = '/content/drive/MyDrive/Colab Notebooks/teste-tecnico-main/base_desafio.json'
path_output = '/content/drive/MyDrive/Colab Notebooks/teste-tecnico-main/outputs/second_question'

In [52]:
# Load the JSON file into a DataFrame; the 'multiline' option lets a single
# JSON record span several lines of the file.
df = (
    spark.read
    .option("multiline", "true")
    .json(path_json)
)

# Second question

In [54]:
# Produce one row per element of the 'resource.items' array, exposed as column 'item'
exploded_item = explode("resource.items").alias("item")
items_df = df.select(exploded_item)

In [55]:
# Flatten the exploded `item` struct into top-level columns with snake_case names.
# Each pair is (source field path, output column name); field names that contain
# spaces or dots are wrapped in backticks so they are read as a single field.
item_column_aliases = [
    ("item.name", "name"),
    ("item.lastMessageDate", "last_message_date"),
    ("item.identity", "identity"),
    ("item.phoneNumber", "phone_number"),
    ("item.source", "source"),
    ("item.extras.protocoloWci", "extras_protocolo_wci"),
    ("item.extras.UtmCampaign", "extras_utm_campaign"),
    ("item.extras.applicationIdentifier", "extras_application_identifier"),
    ("item.extras.primeiraMensagem", "extras_primeira_mensagem"),
    ("item.extras.produto", "extras_produto"),
    ("item.extras.`1. Boas vindas`", "extras_etapa_boas_vindas"),
    ("item.extras.prioridade", "extras_prioridade"),
    ("item.extras.`2. Cep`", "extras_etapa_cep"),
    ("item.extras.canal", "extras_canal"),
    ("item.extras.`99. Abandono`", "extras_etapa_abandono"),
]

items_df = items_df.select(
    *[
        col(source_field).alias(output_name)
        for source_field, output_name in item_column_aliases
    ]
)

In [56]:
# Cast every 'extras_etapa_*' column to boolean in a single projection.
# Column.cast replaces the SQL string that was built by interpolating the column
# name, and one select() replaces the repeated withColumn() calls (each of which
# adds its own projection to the query plan). Column order and names are unchanged.
items_df = items_df.select(
    *[
        col(column).cast("boolean").alias(column)
        if column.startswith("extras_etapa_")
        else col(column)
        for column in items_df.columns
    ]
)

In [49]:
# Display only: preview up to 20 rows without truncating long cell values
items_df.show(n=20, truncate=False)

+---------+------------------------+---------------------------+---------------------+----------+---------------------+---------------------+-----------------------------+----------------------------------------------------------------------------+--------------+------------------------+-----------------+----------------+-------------+---------------------+
|name     |last_message_date       |identity                   |phone_number         |source    |extras_protocolo_wci |extras_utm_campaign  |extras_application_identifier|extras_primeira_mensagem                                                    |extras_produto|extras_etapa_boas_vindas|extras_prioridade|extras_etapa_cep|extras_canal |extras_etapa_abandono|
+---------+------------------------+---------------------------+---------------------+----------+---------------------+---------------------+-----------------------------+----------------------------------------------------------------------------+--------------+-----------------

In [50]:
# Display only: print the column names and data types of items_df
items_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- last_message_date: string (nullable = true)
 |-- identity: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- source: string (nullable = true)
 |-- extras_protocolo_wci: string (nullable = true)
 |-- extras_utm_campaign: string (nullable = true)
 |-- extras_application_identifier: string (nullable = true)
 |-- extras_primeira_mensagem: string (nullable = true)
 |-- extras_produto: string (nullable = true)
 |-- extras_etapa_boas_vindas: boolean (nullable = true)
 |-- extras_prioridade: long (nullable = true)
 |-- extras_etapa_cep: boolean (nullable = true)
 |-- extras_canal: string (nullable = true)
 |-- extras_etapa_abandono: boolean (nullable = true)



In [63]:
# Write the result as Parquet under the output directory, replacing any output
# left by a previous run. The data is collapsed to one partition before writing.
items_output_path = f"{path_output}/items_df.parquet"
(
    items_df
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet(items_output_path)
)