In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import matplotlib.pyplot as plt#
import pandas as pd
# importações 

In [4]:
# Cria SparkSession 
spark = (
    SparkSession.builder
    .appName("TraseBrazilBeef-ML")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/29 21:02:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/29 21:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/29 21:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
BUCKET = "dados-supplychain"
BASE_PATH = f"s3a://{BUCKET}/trase/brazil_beef"
SILVER_STATE_PATH = f"{BASE_PATH}/silver/flows_by_state"

df_state = spark.read.parquet(SILVER_STATE_PATH)

df_state.printSchema()
df_state.select(
    "year",
    "state_of_production",
    "total_volume_t",
    "total_cattle_defor_ha",
    "total_co2_gross_tco2"
).show(10, truncate=False)


25/11/29 21:02:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

root
 |-- year: integer (nullable = true)
 |-- state_of_production: string (nullable = true)
 |-- state_of_production_trase_id: string (nullable = true)
 |-- total_volume_t: double (nullable = true)
 |-- total_fob_usd: double (nullable = true)
 |-- total_cattle_defor_ha: double (nullable = true)
 |-- total_co2_gross_tco2: double (nullable = true)
 |-- total_co2_net_tco2: double (nullable = true)
 |-- total_pasture_area_ha: double (nullable = true)
 |-- avg_forest500_beef: double (nullable = true)
 |-- sum_zdc_flag: double (nullable = true)
 |-- zdc_volume_t: double (nullable = true)
 |-- zdc_share_volume: double (nullable = true)



                                                                                

+----+-------------------+------------------+---------------------+--------------------+
|year|state_of_production|total_volume_t    |total_cattle_defor_ha|total_co2_gross_tco2|
+----+-------------------+------------------+---------------------+--------------------+
|2020|TOCANTINS          |94935.75596838303 |113251.47184709366   |1.8215550543871008E7|
|2020|PARA               |171823.01349184045|116706.75844987041   |6.203819009127608E7 |
|2020|SAO PAULO          |466634.7965043159 |511.8384858391535    |179493.36307918283  |
|2017|ESPIRITO SANTO     |6576.719122505074 |390.62208684050574   |162484.3996349078   |
|2020|PARANA             |17456.427153614262|120.95889784138247   |50856.65012591888   |
|2023|TOCANTINS          |111007.5952635722 |87997.82798924933    |1.360351769937709E7 |
|2020|MARANHAO           |5468.455430119726 |4069.7946418945994   |1139182.83533762    |
|2020|RONDONIA           |188693.9138590178 |80293.18621830289    |3.6471042166826844E7|
|2017|RIO GRANDE DO S

In [6]:
# Garante tipo numérico
df_state = df_state.withColumn(
    "total_cattle_defor_ha",
    F.col("total_cattle_defor_ha").cast("double")
)

# Calcula mediana aproximada de desmatamento
median_defor = df_state.approxQuantile("total_cattle_defor_ha", [0.5], 0.01)[0]
print("Mediana de desmatamento (ha):", median_defor)

# Cria coluna label
df_ml = df_state.withColumn(
    "label",
    F.when(F.col("total_cattle_defor_ha") >= median_defor, 1).otherwise(0)
)

df_ml.select(
    "year",
    "state_of_production",
    "total_cattle_defor_ha",
    "label"
).orderBy("total_cattle_defor_ha", ascending=False).show(10, truncate=False)


Mediana de desmatamento (ha): 766.9759772173004
+----+-------------------+---------------------+-----+
|year|state_of_production|total_cattle_defor_ha|label|
+----+-------------------+---------------------+-----+
|2023|UNKNOWN STATE      |6867003.686805465    |1    |
|2022|UNKNOWN STATE      |5660588.954519147    |1    |
|2019|UNKNOWN STATE      |5106863.1082406035   |1    |
|2021|UNKNOWN STATE      |5039183.704432643    |1    |
|2020|UNKNOWN STATE      |4934323.666656295    |1    |
|2017|UNKNOWN STATE      |4395232.150147333    |1    |
|2016|UNKNOWN STATE      |4333790.607647422    |1    |
|2015|UNKNOWN STATE      |4302173.113117231    |1    |
|2022|MATO GROSSO        |196154.1681612758    |1    |
|2023|MATO GROSSO        |179559.48038375698   |1    |
+----+-------------------+---------------------+-----+
only showing top 10 rows



In [7]:
# Garante tipos numéricos das features
num_features = [
    "total_volume_t",
    "total_fob_usd",
    "total_pasture_area_ha",
    "total_co2_gross_tco2",
    "avg_forest500_beef",
    "zdc_share_volume"
]

for col_name in num_features:
    if col_name in df_ml.columns:
        df_ml = df_ml.withColumn(col_name, F.col(col_name).cast("double"))

# Index da variável categórica 'state_of_production'
state_indexer = StringIndexer(
    inputCol="state_of_production",
    outputCol="state_index"
)

# Monta vetor de features
assembler = VectorAssembler(
    inputCols=num_features + ["state_index"],
    outputCol="features"
)

# Modelo de classificação
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=200,
    maxDepth=8,
    seed=42
)

# Pipeline completo
pipeline = Pipeline(stages=[state_indexer, assembler, rf])

# Treino / teste
train_df, test_df = df_ml.randomSplit([0.8, 0.2], seed=42)

print("Tamanho treino:", train_df.count())
print("Tamanho teste:", test_df.count())


Tamanho treino: 190
Tamanho teste: 35


In [9]:
feature_cols = [
    "total_volume_t",
    "total_fob_usd",
    "total_pasture_area_ha",
    "total_co2_gross_tco2",
    "avg_forest500_beef",
    "zdc_share_volume",
]

print("Contagem de null por coluna numérica:")
for c in feature_cols:
    df_ml.select(
        F.count(F.when(F.col(c).isNull(), 1)).alias(c)
    ).show()

# também checar estado e label
df_ml.select(
    F.count(F.when(F.col("state_of_production").isNull(), 1)).alias("state_null"),
    F.count(F.when(F.col("label").isNull(), 1)).alias("label_null")
).show()


Contagem de null por coluna numérica:


                                                                                

+--------------+
|total_volume_t|
+--------------+
|             0|
+--------------+

+-------------+
|total_fob_usd|
+-------------+
|            0|
+-------------+

+---------------------+
|total_pasture_area_ha|
+---------------------+
|                   17|
+---------------------+

+--------------------+
|total_co2_gross_tco2|
+--------------------+
|                  67|
+--------------------+

+------------------+
|avg_forest500_beef|
+------------------+
|                76|
+------------------+

+----------------+
|zdc_share_volume|
+----------------+
|               0|
+----------------+

+----------+----------+
|state_null|label_null|
+----------+----------+
|         0|         0|
+----------+----------+



In [10]:
# Colunas numéricas que vão para o modelo
num_features = [
    "total_volume_t",
    "total_fob_usd",
    "total_pasture_area_ha",
    "total_co2_gross_tco2",
    "avg_forest500_beef",
    "zdc_share_volume"
]

# garantir tipos numéricos
for col_name in num_features:
    if col_name in df_ml.columns:
        df_ml = df_ml.withColumn(col_name, F.col(col_name).cast("double"))

# remover linhas com null em qualquer feature ou na categórica
cols_sem_null = num_features + ["state_of_production", "label"]

df_ml_clean = df_ml.na.drop(subset=cols_sem_null)

print("Antes:", df_ml.count(), "linhas")
print("Depois de limpar nulls:", df_ml_clean.count(), "linhas")


                                                                                

Antes: 225 linhas
Depois de limpar nulls: 134 linhas


In [12]:
model = pipeline.fit(train_df)

preds = model.transform(test_df)

preds.select(
    "state_of_production",
    "year",
    "label",
    "prediction",
    "probability"
).show(20, truncate=False)


25/11/29 21:05:04 ERROR Executor: Exception in task 0.0 in stage 57.0 (TID 40)
org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda$3933/0x00000008418a3840`: (struct<total_volume_t:double,total_fob_usd:double,total_pasture_area_ha:double,total_co2_gross_tco2:double,avg_forest500_beef:double,zdc_share_volume:double,state_index:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFa

Py4JJavaError: An error occurred while calling o108.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 57.0 failed 1 times, most recent failure: Lost task 0.0 in stage 57.0 (TID 40) (1ca62cf5066f executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda$3933/0x00000008418a3840`: (struct<total_volume_t:double,total_fob_usd:double,total_pasture_area_ha:double,total_co2_gross_tco2:double,avg_forest500_beef:double,zdc_share_volume:double,state_index:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1492)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:621)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:624)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 37 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2898)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2834)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2833)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2833)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1253)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3102)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3036)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3025)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:995)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1492)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1465)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:119)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:274)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:168)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:47)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda$3933/0x00000008418a3840`: (struct<total_volume_t:double,total_fob_usd:double,total_pasture_area_ha:double,total_co2_gross_tco2:double,avg_forest500_beef:double,zdc_share_volume:double,state_index:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1492)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:621)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:624)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 37 more


In [None]:
# AUC (área sob curva ROC)
bin_eval = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction"
)

auc = bin_eval.evaluate(preds)
print("AUC:", auc)

# Acurácia
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = acc_eval.evaluate(preds)
print("Acurácia:", accuracy)


In [None]:
# Último estágio do pipeline é o RandomForest
rf_model = model.stages[-1]

feature_list = num_features + ["state_index"]
importances = rf_model.featureImportances.toArray()

imp_df = pd.DataFrame({
    "feature": feature_list,
    "importance": importances
}).sort_values("importance", ascending=False)

imp_df


In [None]:
plt.figure(figsize=(10, 5))
plt.bar(imp_df["feature"], imp_df["importance"])
plt.xticks(rotation=45, ha="right")
plt.title("Importância das variáveis – Random Forest (exposição ao desmatamento)")
plt.ylabel("Importância relativa")
plt.tight_layout()
plt.show()


In [None]:
ML_BASE = f"{BASE_PATH}/gold/ml_exports_by_state"

(preds
 .select(
     "year",
     "state_of_production",
     "state_of_production_trase_id",
     "total_volume_t",
     "total_cattle_defor_ha",
     "label",
     "prediction",
     "probability"
 )
 .write
 .mode("overwrite")
 .parquet(ML_BASE)
)

print(f"Resultados de ML salvos em: {ML_BASE}")
