### Importando módulos necessários

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

### Criando sessão Spark

In [2]:
# Build (or reuse) a local Spark session named "ingestion".
# The UI is pinned to port 4040 and console progress bars are enabled.
builder = SparkSession.builder.appName("ingestion").master("local[*]")
builder = builder.config("spark.ui.port", "4040")
builder = builder.config("spark.ui.showConsoleProgress", "True")
spark = builder.getOrCreate()

### Para abrir a Spark UI, acesse http://localhost:4040 no navegador

In [3]:
# Bare expression: the notebook renders the session object as the cell output.
spark

### Ler arquivo da camada raw

In [4]:
# Raw-layer CSV; its first line is treated as the header row.
# No schema is supplied here, so every column is read as a string.
raw_path = "/home/jovyan/data/raw/salaries.csv"
df = spark.read.csv(raw_path, header=True)

### Exibindo os dados

In [5]:
# Preview the raw rows without truncating long cell values.
df.show(truncate=False)

+---------+----------------+---------------+-------------------+------+---------------+-------------+------------------+------------+----------------+------------+
|work_year|experience_level|employment_type|job_title          |salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|
+---------+----------------+---------------+-------------------+------+---------------+-------------+------------------+------------+----------------+------------+
|2024     |MI              |FT             |Data Manager       |117400|USD            |117400       |US                |0           |US              |M           |
|2024     |MI              |FT             |Data Manager       |62620 |USD            |62620        |US                |0           |US              |M           |
|2024     |SE              |FT             |Data Manager       |131200|USD            |131200       |US                |100         |US              |M           |
|2024     |SE   

In [6]:
# Inspect the column types produced by the schema-less CSV read.
df.printSchema()

root
 |-- work_year: string (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: string (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: string (nullable = true)
 |-- company_location: string (nullable = true)
 |-- company_size: string (nullable = true)



In [7]:
# Number of columns in the DataFrame, useful when writing the explicit schema.
len(df.columns)

11

### Definindo schema da tabela para salvar dados corretamente na camada bronze

In [8]:
# Explicit schema for the salaries file: year, salary amounts and remote ratio
# are integers, every other column stays a string. All fields are nullable.
# NOTE(review): IntegerType is 32-bit; confirm that salaries expressed in
# high-denomination currencies always fit in that range.
df_schema = T.StructType(
    [
        T.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("work_year", T.IntegerType()),
            ("experience_level", T.StringType()),
            ("employment_type", T.StringType()),
            ("job_title", T.StringType()),
            ("salary", T.IntegerType()),
            ("salary_currency", T.StringType()),
            ("salary_in_usd", T.IntegerType()),
            ("employee_residence", T.StringType()),
            ("remote_ratio", T.IntegerType()),
            ("company_location", T.StringType()),
            ("company_size", T.StringType()),
        )
    ]
)

In [9]:
# Re-read the raw CSV with the explicit schema instead of all-string columns.
# With a user-supplied schema Spark's default (enforceSchema=true) ignores the
# CSV header entirely, so a reordered or renamed source column would silently
# land under the wrong field. enforceSchema=False makes Spark validate the
# header names against df_schema (by position) and fail on a mismatch.
df = (
    spark.read.schema(df_schema)
    .options(header=True, enforceSchema=False)
    .csv(raw_path)
)

In [10]:
# Preview the rows again after applying the explicit schema.
df.show(truncate=False)

+---------+----------------+---------------+-------------------+------+---------------+-------------+------------------+------------+----------------+------------+
|work_year|experience_level|employment_type|job_title          |salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|
+---------+----------------+---------------+-------------------+------+---------------+-------------+------------------+------------+----------------+------------+
|2024     |MI              |FT             |Data Manager       |117400|USD            |117400       |US                |0           |US              |M           |
|2024     |MI              |FT             |Data Manager       |62620 |USD            |62620        |US                |0           |US              |M           |
|2024     |SE              |FT             |Data Manager       |131200|USD            |131200       |US                |100         |US              |M           |
|2024     |SE   

In [11]:
# Confirm the numeric columns now carry integer types.
df.printSchema()

root
 |-- work_year: integer (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: integer (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: integer (nullable = true)
 |-- company_location: string (nullable = true)
 |-- company_size: string (nullable = true)



In [12]:
# Count the rows read from the raw file before persisting them.
df.count()

64934

### Persistindo na camada bronze

In [13]:
# Persist the typed data to the bronze layer as Parquet, partitioned by
# work_year. "overwrite" replaces whatever already exists at the target path.
bronze_path = "/home/jovyan/data/bronze/salaries.parquet"
(
    df.write
    .mode("overwrite")
    .partitionBy("work_year")
    .parquet(bronze_path)
)

### Encerrando sessão Spark

In [14]:
# Stop the Spark session; `spark` cannot be used by cells run after this one.
spark.stop()