In [2]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.types import StringType, StructType, StructField


# Cria uma sessão Spark habilitando Hive support para armazenar dados no Spoark Warehouse
spark = SparkSession \
    .builder \
    .appName("projeto_parte_ii") \
    .config('spark.master', 'local') \
    .enableHiveSupport() \
    .getOrCreate()

In [3]:
# Spark read Hive table
spark.sql("USE projeto")

24/06/08 22:26:14 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/06/08 22:26:14 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/06/08 22:26:22 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/06/08 22:26:22 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hadoop@127.0.1.1
24/06/08 22:26:23 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


DataFrame[]

In [4]:
df = spark.sql("SELECT * FROM notas_fiscais LIMIT 100000")
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- pop: integer (nullable = true)
 |-- pib: decimal(15,2) (nullable = t

In [5]:
columns_to_keep = '''
prod_ncm, 
prod_unid_nn as prod_unid, 
scaled_log_prod_quant as prod_quant, 
scaled_log_prod_valor_unit as prod_valor_unit,
nome_mesoregiao as reg_mesoregiao, 
tipologia_rural_urbana as reg_tipologia, 
hierarquia_urbana as reg_hierarquia, 
scaled_pop_meso as reg_pop,
scaled_pib_meso as reg_pib
'''

df = spark.sql(f"SELECT {columns_to_keep} FROM notas_fiscais")
df.printSchema()

root
 |-- prod_ncm: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



In [6]:
#df.show()

In [7]:
# prod_ncm
indexer = StringIndexer(inputCol="prod_ncm", outputCol="prod_ncm_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="prod_ncm_indexed", outputCol="prod_ncm_onehot")
df = encoder.fit(df).transform(df)

#prod_unid
indexer = StringIndexer(inputCol="prod_unid", outputCol="prod_unid_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="prod_unid_indexed", outputCol="prod_unid_onehot")
df = encoder.fit(df).transform(df)

#reg_mesoregiao
indexer = StringIndexer(inputCol="reg_mesoregiao", outputCol="reg_mesoregiao_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_mesoregiao_indexed", outputCol="reg_mesoregiao_onehot")
df = encoder.fit(df).transform(df)

#reg_tipologia
indexer = StringIndexer(inputCol="reg_tipologia", outputCol="reg_tipologia_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_tipologia_indexed", outputCol="reg_tipologia_onehot")
df = encoder.fit(df).transform(df)

#reg_hierarquia
indexer = StringIndexer(inputCol="reg_hierarquia", outputCol="reg_hierarquia_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_hierarquia_indexed", outputCol="reg_hierarquia_onehot")
df = encoder.fit(df).transform(df)

#df.show(truncate=False)

                                                                                

In [8]:
df.printSchema()

root
 |-- prod_ncm: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)
 |-- prod_ncm_indexed: double (nullable = false)
 |-- prod_ncm_onehot: vector (nullable = true)
 |-- prod_unid_indexed: double (nullable = false)
 |-- prod_unid_onehot: vector (nullable = true)
 |-- reg_mesoregiao_indexed: double (nullable = false)
 |-- reg_mesoregiao_onehot: vector (nullable = true)
 |-- reg_tipologia_indexed: double (nullable = false)
 |-- reg_tipologia_onehot: vector (nullable = true)
 |-- reg_hierarquia_indexed: double (nullable = false)
 |-- reg_hierarquia_onehot: vector (nullable = true)



In [9]:
df.createOrReplaceTempView("df")

columns_to_keep = '''
prod_ncm_onehot as prod_ncm,
prod_unid_onehot as prod_unid,
prod_quant,
prod_valor_unit,
reg_mesoregiao_onehot as reg_mesoregiao,
reg_tipologia_onehot as reg_tipologia,
reg_hierarquia_onehot as reg_hierarquia,
reg_pop,
reg_pib
'''

df = spark.sql(f"SELECT {columns_to_keep} FROM df")
df.printSchema()

root
 |-- prod_ncm: vector (nullable = true)
 |-- prod_unid: vector (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: vector (nullable = true)
 |-- reg_tipologia: vector (nullable = true)
 |-- reg_hierarquia: vector (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



24/06/08 22:27:00 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
#df.show()

In [11]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
assembler = VectorAssembler(
    inputCols=["prod_ncm", "prod_unid", "prod_quant", "reg_mesoregiao", "reg_tipologia", "reg_hierarquia", "reg_pop", "reg_pib"],
    outputCol="features")

df_lr = assembler.transform(df)
final_df = df_lr.select("features", "prod_valor_unit")

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=26)

In [13]:
linear_regressor = LinearRegression(featuresCol="features", labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr")
linear_regressor_model = linear_regressor.fit(train_df)

24/06/08 22:27:03 WARN DAGScheduler: Broadcasting large task binary with size 1619.6 KiB
24/06/08 22:27:35 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:00 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:01 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:01 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:02 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:02 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:03 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:03 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:03 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:03 WARN DAGScheduler: Broadcasting large task binary with size 1620.2 KiB
24/06/08 22:28:04 WAR

In [14]:
predictions_lr = linear_regressor_model.transform(test_df)

evaluator = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr", metricName="rmse")
rmse = evaluator.evaluate(predictions_lr)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr", metricName="r2")
r2 = evaluator_r2.evaluate(predictions_lr)
print("R-squared (R2) on test data: {:.3f}".format(r2))

24/06/08 22:28:43 WARN DAGScheduler: Broadcasting large task binary with size 1676.2 KiB
                                                                                

Root Mean Squared Error (RMSE) on test data: 0.560


24/06/08 22:29:05 WARN DAGScheduler: Broadcasting large task binary with size 1676.2 KiB
[Stage 78:>                                                         (0 + 1) / 1]

R-squared (R2) on test data: 0.687


                                                                                

In [33]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [34]:
assembler = VectorAssembler(
    inputCols=["prod_ncm", "prod_unid", "prod_quant", "reg_mesoregiao", "reg_tipologia", "reg_hierarquia", "reg_pop", "reg_pib"],
    outputCol="features")

df_dt = assembler.transform(df)
final_df = df_dt.select("features", "prod_valor_unit")

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=26)

In [35]:
tree_regressor = DecisionTreeRegressor(featuresCol="features", labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", maxDepth=6)
tree_regressor_model = tree_regressor.fit(train_df)

24/06/08 23:41:36 WARN DAGScheduler: Broadcasting large task binary with size 1618.3 KiB
ERROR:root:KeyboardInterrupt while sending command.][Stage 104:>  (0 + 0) / 1]
Traceback (most recent call last):
  File "/home/hadoop/anaconda3/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hadoop/anaconda3/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hadoop/anaconda3/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
predictions_tr = tree_regressor_model.transform(test_df)

evaluator = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", metricName="rmse")
rmse = evaluator.evaluate(predictions_tr)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", metricName="r2")
r2 = evaluator_r2.evaluate(predictions_tr)
print("R-squared (R2) on test data: {:.3f}".format(r2))

In [None]:
feature_importance = tree_regressor_model.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]}")

In [26]:
coefficients = linear_regressor_model.coefficients
intercept = linear_regressor_model.intercept

feature_importance_lr = sorted(list(zip(df_lr.columns[:-1], map(abs, coefficients))), key=lambda x: x[1], reverse=True)

print("Feature Importance:")
for feature, importance in feature_importance_lr:
    print("  {}: {:.3f}".format(feature, importance))

Feature Importance:
  prod_valor_unit: 0.837
  reg_tipologia: 0.728
  reg_hierarquia: 0.406
  reg_pop: 0.395
  reg_pib: 0.344
  prod_unid: 0.260
  prod_ncm: 0.235
  prod_quant: 0.112
  reg_mesoregiao: 0.029
