In [41]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.types import StringType, StructType, StructField


# Cria uma sessão Spark habilitando Hive support para armazenar dados no Spoark Warehouse
spark = SparkSession \
    .builder \
    .appName("projeto_parte_ii") \
    .config('spark.master', 'local') \
    .enableHiveSupport() \
    .getOrCreate()

In [42]:
# Spark read Hive table
spark.sql("USE projeto")

24/06/11 20:39:19 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/06/11 20:39:19 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


DataFrame[]

In [43]:
df = spark.sql("SELECT * FROM notas_fiscais WHERE prod_unid_nn IN ('und', 'kg', 'cx', 'pacote', 'l')")
df.createOrReplaceTempView("df")
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- pop: integer (

In [44]:
columns_to_keep = '''
prod_ncm, 
prod_unid_nn as prod_unid, 
scaled_log_prod_quant as prod_quant, 
prod_valor_unit as prod_valor_unit,
nome_mesoregiao as reg_mesoregiao, 
tipologia_rural_urbana as reg_tipologia, 
hierarquia_urbana as reg_hierarquia, 
scaled_pop_meso as reg_pop,
scaled_pib_meso as reg_pib
'''

df = spark.sql(f"SELECT {columns_to_keep} FROM notas_fiscais")
df.createOrReplaceTempView("df")
df.printSchema()

root
 |-- prod_ncm: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



In [45]:
df = spark.sql(f"SELECT * FROM df WHERE prod_unid IN ('und', 'kg', 'cx', 'pacote', 'l')")
df.createOrReplaceTempView("df")
select = spark.sql("SELECT DISTINCT prod_unid FROM df")
select.show()

+---------+
|prod_unid|
+---------+
|        l|
|      und|
|       cx|
|       kg|
|   pacote|
+---------+



In [46]:
#df.show()

In [47]:
# prod_ncm
'''indexer = StringIndexer(inputCol="prod_ncm", outputCol="prod_ncm_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="prod_ncm_indexed", outputCol="prod_ncm_onehot")
df = encoder.fit(df).transform(df)'''

df = df.withColumn("prod_ncm", df["prod_ncm"].cast(IntegerType()))
df.createOrReplaceTempView("df")

df = spark.sql("SELECT * FROM df WHERE prod_ncm IS NOT NULL")
df.createOrReplaceTempView("df")

#prod_unid
indexer = StringIndexer(inputCol="prod_unid", outputCol="prod_unid_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="prod_unid_indexed", outputCol="prod_unid_onehot")
df = encoder.fit(df).transform(df)

#reg_mesoregiao
indexer = StringIndexer(inputCol="reg_mesoregiao", outputCol="reg_mesoregiao_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_mesoregiao_indexed", outputCol="reg_mesoregiao_onehot")
df = encoder.fit(df).transform(df)

#reg_tipologia
indexer = StringIndexer(inputCol="reg_tipologia", outputCol="reg_tipologia_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_tipologia_indexed", outputCol="reg_tipologia_onehot")
df = encoder.fit(df).transform(df)

#reg_hierarquia
indexer = StringIndexer(inputCol="reg_hierarquia", outputCol="reg_hierarquia_indexed")
df = indexer.fit(df).transform(df)

encoder = OneHotEncoder(inputCol="reg_hierarquia_indexed", outputCol="reg_hierarquia_onehot")
df = encoder.fit(df).transform(df)

df.createOrReplaceTempView("df")

#df.show(truncate=False)

                                                                                

In [48]:
df.printSchema()

root
 |-- prod_ncm: integer (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)
 |-- prod_unid_indexed: double (nullable = false)
 |-- prod_unid_onehot: vector (nullable = true)
 |-- reg_mesoregiao_indexed: double (nullable = false)
 |-- reg_mesoregiao_onehot: vector (nullable = true)
 |-- reg_tipologia_indexed: double (nullable = false)
 |-- reg_tipologia_onehot: vector (nullable = true)
 |-- reg_hierarquia_indexed: double (nullable = false)
 |-- reg_hierarquia_onehot: vector (nullable = true)



In [49]:
df.createOrReplaceTempView("df")

columns_to_keep = '''
prod_ncm,
prod_unid_onehot as prod_unid,
prod_quant,
prod_valor_unit,
reg_mesoregiao_onehot as reg_mesoregiao,
reg_tipologia_onehot as reg_tipologia,
reg_hierarquia_onehot as reg_hierarquia,
reg_pop,
reg_pib
'''

df = spark.sql(f"SELECT {columns_to_keep} FROM df")
df.printSchema()

root
 |-- prod_ncm: integer (nullable = true)
 |-- prod_unid: vector (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- reg_mesoregiao: vector (nullable = true)
 |-- reg_tipologia: vector (nullable = true)
 |-- reg_hierarquia: vector (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



In [50]:
df.show()

+--------+-------------+--------------------+---------------+--------------+-------------+--------------+-------------------+-------------------+
|prod_ncm|    prod_unid|          prod_quant|prod_valor_unit|reg_mesoregiao|reg_tipologia|reg_hierarquia|            reg_pop|            reg_pib|
+--------+-------------+--------------------+---------------+--------------+-------------+--------------+-------------------+-------------------+
|30049059|(4,[0],[1.0])| -0.8904793211295619|          47.84|(36,[5],[1.0])|    (2,[],[])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|30049059|(4,[0],[1.0])| -0.8904793211295619|          47.84|(36,[5],[1.0])|    (2,[],[])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|30049059|(4,[0],[1.0])|-0.25442384791357503|          13.83|(36,[5],[1.0])|    (2,[],[])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|21069010|(4,[0],[1.0])|   0.850702099178969|           1.18|(36,[5],[1.0])|(2,[0],[1.0])|(13,[0],[1.0])|-1.2079660195805064

In [51]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [52]:
assembler = VectorAssembler(
    inputCols=["prod_ncm", "prod_unid", "prod_quant", "reg_mesoregiao", "reg_tipologia", "reg_hierarquia", "reg_pop", "reg_pib"],
    outputCol="features")

df_lr = assembler.transform(df)
final_df = df_lr.select("features", "prod_valor_unit")

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=26)

In [53]:
linear_regressor = LinearRegression(featuresCol="features", labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr")
linear_regressor_model = linear_regressor.fit(train_df)

24/06/11 20:39:30 WARN Instrumentation: [879ecefb] regParam is zero, which might cause numerical instability and overfitting.
24/06/11 20:39:47 WARN Instrumentation: [879ecefb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [54]:
predictions_lr = linear_regressor_model.transform(test_df)

evaluator = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr", metricName="rmse")
rmse = evaluator.evaluate(predictions_lr)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_lr", metricName="r2")
r2 = evaluator_r2.evaluate(predictions_lr)
print("R-squared (R2) on test data: {:.3f}".format(r2))

                                                                                

Root Mean Squared Error (RMSE) on test data: 1841.873


[Stage 37:>                                                         (0 + 1) / 1]

R-squared (R2) on test data: -0.067


                                                                                

In [55]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [56]:
assembler = VectorAssembler(
    inputCols=["prod_ncm", "prod_unid", "prod_quant", "reg_mesoregiao", "reg_tipologia", "reg_hierarquia", "reg_pop", "reg_pib"],
    outputCol="features")

df_dt = assembler.transform(df)
final_df = df_dt.select("features", "prod_valor_unit")

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=26)

In [57]:
tree_regressor = DecisionTreeRegressor(featuresCol="features", labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", maxDepth=6)
tree_regressor_model = tree_regressor.fit(train_df)

24/06/11 20:41:26 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 228.0 MiB so far)
24/06/11 20:41:26 WARN BlockManager: Persisting block rdd_165_0 to disk instead.
24/06/11 20:41:32 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
24/06/11 20:41:37 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
24/06/11 20:41:40 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
24/06/11 20:41:43 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
24/06/11 20:41:47 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
24/06/11 20:41:51 WARN MemoryStore: Not enough space to cache rdd_165_0 in memory! (computed 333.4 MiB so far)
                                                                                

In [58]:
predictions_tr = tree_regressor_model.transform(test_df)

evaluator = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", metricName="rmse")
rmse = evaluator.evaluate(predictions_tr)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_tr", metricName="r2")
r2 = evaluator_r2.evaluate(predictions_tr)
print("R-squared (R2) on test data: {:.3f}".format(r2))

                                                                                

Root Mean Squared Error (RMSE) on test data: 1745.492


[Stage 55:>                                                         (0 + 1) / 1]

R-squared (R2) on test data: 0.041


                                                                                

In [59]:
feature_importance = tree_regressor_model.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]}")

Feature 'prod_ncm': 0.30816962196917363
Feature 'prod_unid': 0.38948853011610995
Feature 'prod_quant': 0.0
Feature 'reg_mesoregiao': 3.90186018602796e-09
Feature 'reg_tipologia': 6.3373854783343774e-09
Feature 'reg_hierarquia': 0.00588278450807791
Feature 'reg_pop': 0.0
Feature 'reg_pib': 0.0


In [60]:
coefficients = linear_regressor_model.coefficients
intercept = linear_regressor_model.intercept

feature_importance_lr = sorted(list(zip(df_lr.columns[:-1], map(abs, coefficients))), key=lambda x: x[1], reverse=True)

print("Feature Importance:")
for feature, importance in feature_importance_lr:
    print("  {}: {:.3f}".format(feature, importance))

Feature Importance:
  prod_valor_unit: 404.825
  reg_mesoregiao: 299.325
  prod_quant: 196.284
  reg_pop: 182.820
  reg_tipologia: 170.132
  reg_pib: 140.004
  prod_unid: 77.906
  reg_hierarquia: 37.121
  prod_ncm: 0.000


In [67]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [68]:
assembler = VectorAssembler(
    inputCols=["prod_ncm", "prod_unid", "prod_quant", "reg_mesoregiao", "reg_tipologia", "reg_hierarquia", "reg_pop", "reg_pib"],
    outputCol="features")

df_rf = assembler.transform(df)
final_df = df_rf.select("features", "prod_valor_unit")

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=26)

In [69]:
forest_regressor = RandomForestRegressor(featuresCol="features", labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_fr", maxDepth=6, numTrees=30)
forest_regressor_model = forest_regressor.fit(train_df)

24/06/11 20:51:51 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 215.0 MiB so far)
24/06/11 20:51:51 WARN BlockManager: Persisting block rdd_266_0 to disk instead.
24/06/11 20:52:02 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
24/06/11 20:52:16 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
24/06/11 20:52:30 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
24/06/11 20:52:48 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
24/06/11 20:53:05 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
24/06/11 20:53:24 WARN MemoryStore: Not enough space to cache rdd_266_0 in memory! (computed 323.2 MiB so far)
                                                                                

In [70]:
predictions_fr = forest_regressor_model.transform(test_df)

evaluator = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_fr", metricName="rmse")
rmse = evaluator.evaluate(predictions_fr)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="prod_valor_unit", predictionCol="predicted_prod_valor_unit_fr", metricName="r2")
r2 = evaluator_r2.evaluate(predictions_fr)
print("R-squared (R2) on test data: {:.3f}".format(r2))

                                                                                

Root Mean Squared Error (RMSE) on test data: 2144.082


[Stage 89:>                                                         (0 + 1) / 1]

R-squared (R2) on test data: -0.446


                                                                                

In [65]:
feature_importance = forest_regressor_model.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]}")

Feature 'prod_ncm': 0.24918984233367159
Feature 'prod_unid': 0.12955840114910877
Feature 'prod_quant': 0.001192668422601933
Feature 'reg_mesoregiao': 0.006460549498743054
Feature 'reg_tipologia': 0.01868965460921217
Feature 'reg_hierarquia': 0.26186274276227167
Feature 'reg_pop': 1.411897577173017e-06
Feature 'reg_pib': 3.773560017182131e-05


In [66]:
#spark.stop()