### Importa módulos necessários

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

### Inicia sessão Spark

In [318]:
spark = (
    SparkSession.builder.master("local[*]")
    .appName("ml-model")
    .config("spark.ui.port", "4040")
    .config("spark.ui.showConsoleProgress", "True")
    .getOrCreate()
)

### Carrega tabela da camada bronze

In [319]:
silver_path = "/home/jovyan/data/silver/features.parquet"
df = spark.read.parquet(silver_path)

In [320]:
df.show(truncate=False)

+---+----------------+---------------+------------+---------+----------+----------+-------------+-----------+--------------+---------+
|id |experience_level|employment_type|company_size|job_focus|data_roles|job_type  |usa_residence|usa_company|updated_salary|work_year|
+---+----------------+---------------+------------+---------+----------+----------+-------------+-----------+--------------+---------+
|0  |MI              |FT             |M           |Manager  |1         |Presencial|1            |1          |117400.0      |2024     |
|1  |MI              |FT             |M           |Manager  |1         |Presencial|1            |1          |62620.0       |2024     |
|2  |SE              |FT             |M           |Manager  |1         |Remoto    |1            |1          |131200.0      |2024     |
|3  |SE              |FT             |M           |Manager  |1         |Remoto    |1            |1          |95300.0       |2024     |
|4  |SE              |FT             |M           |Othe

### Pipeline para treino do modelo

In [321]:
(df_X, df_Y) = df.randomSplit([0.8, 0.2], seed=42)

In [322]:
df_X.count()

51313

In [323]:
df_Y.count()

13010

In [324]:
cat_features = [
    "experience_level",
    "employment_type",
    "company_size",
    "job_focus",
    "data_roles",
    "job_type",
    "usa_residence",
    "usa_company",
]

In [325]:
string_indexer = StringIndexer(
    inputCols=cat_features,
    outputCols=[f"{feature}_cat" for feature in cat_features],
)

In [326]:
encoder = OneHotEncoder(
    inputCols=[f"{feature}_cat" for feature in cat_features],
    outputCols=[f"{feature}_vec" for feature in cat_features],
)

In [327]:
assembler = VectorAssembler(
    inputCols=[f"{feature}_vec" for feature in cat_features],
    outputCol="features",
)

In [350]:
rfr = RandomForestRegressor(featuresCol="features", labelCol="updated_salary")

In [351]:
gbt = GBTRegressor(featuresCol="features", labelCol="updated_salary")

In [360]:
pipe = Pipeline(stages=[string_indexer, encoder, assembler, gbt])

In [361]:
result = pipe.fit(df_X).transform(df_Y)

In [362]:
result.show(truncate=False)

+---+----------------+---------------+------------+---------+----------+----------+-------------+-----------+--------------+---------+--------------------+-------------------+----------------+-------------+--------------+------------+-----------------+---------------+--------------------+-------------------+----------------+-------------+--------------+-------------+-----------------+---------------+-------------------------------------------------------------+------------------+
|id |experience_level|employment_type|company_size|job_focus|data_roles|job_type  |usa_residence|usa_company|updated_salary|work_year|experience_level_cat|employment_type_cat|company_size_cat|job_focus_cat|data_roles_cat|job_type_cat|usa_residence_cat|usa_company_cat|experience_level_vec|employment_type_vec|company_size_vec|job_focus_vec|data_roles_vec|job_type_vec |usa_residence_vec|usa_company_vec|features                                                     |prediction        |
+---+----------------+--------

In [363]:
result.count()

13010

In [364]:
rmse = RegressionEvaluator(
    labelCol="updated_salary", predictionCol="prediction", metricName="rmse"
)

In [365]:
rmse.evaluate(result)

57117.27488223516

In [366]:
r2 = RegressionEvaluator(
    labelCol="updated_salary", predictionCol="prediction", metricName="r2"
)

In [367]:
r2.evaluate(result)

0.2551152783558289

In [None]:
faixas_dados = (
    result.filter(F.col("data_roles") == 1)
    .groupBy("job_focus")
    .agg(
        F.min("prediction").alias("lb_salario"),
        F.mean("prediction").alias("mean_salario"),
        F.max("prediction").alias("ub_salario"),
    )
    .sort("job_focus")
)

In [371]:
faixas_dados.show(truncate=False)

+---------------------+-----------------+------------------+------------------+
|job_focus            |lb_salario       |mean_salario      |ub_salario        |
+---------------------+-----------------+------------------+------------------+
|Analyst              |41879.17359257814|109053.83029900285|174984.32166776783|
|Architect            |89422.32702814054|163890.83869441878|205307.7067943078 |
|Business Intelligence|57309.06859128614|124574.8929906619 |183004.07568547555|
|Developer            |50914.96101979874|143369.26675195948|167900.8470643952 |
|Engineer             |36368.32504023424|165418.89422065698|209352.91054837196|
|Manager              |46089.83007462198|123112.70553721544|165959.06285841126|
|Other                |51362.61568624312|130161.00041596184|222204.524778267  |
|Scientist            |42818.96959512986|158789.6871868487 |211421.54680751154|
+---------------------+-----------------+------------------+------------------+



### Armazena tabela na camada gold

In [372]:
gold_path = "/home/jovyan/data/gold/faixas_cargos.parquet"
faixas_dados.write.mode("overwrite").parquet(gold_path)