In [29]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.types import StringType, StructType, StructField


# Create (or reuse) a Spark session running on a local master, with Hive
# support enabled so data can be stored in and read from the Spark warehouse.
spark = (
    SparkSession.builder
    .appName("projeto_parte_ii")
    .config("spark.master", "local")
    .enableHiveSupport()
    .getOrCreate()
)

In [30]:
# Make `projeto` the current database so the queries below can refer to
# its tables by unqualified name.
spark.sql("USE projeto")

DataFrame[]

In [31]:
# Read every column of the notas_fiscais table into a DataFrame and print
# the schema to see which fields are available.
all_columns_query = "SELECT * FROM notas_fiscais"
df = spark.sql(all_columns_query)
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- pop: integer (nullable = true)
 |-- pib: decimal(15,2) (nullable = t

In [32]:
# Projection used to build the modelling DataFrame: each source column is
# selected from notas_fiscais and most are renamed to a shorter alias with a
# `prod_` (product) or `reg_` (region) prefix.
# NOTE(review): the `scaled_*` / `scaled_log_*` source names suggest these
# columns were already scaled / log-transformed upstream — that step is not
# visible in this notebook, confirm before relying on it.
columns_to_keep = '''
prod_ncm, 
prod_unid_nn as prod_unid, 
scaled_log_prod_quant as prod_quant, 
scaled_log_prod_valor_unit as prod_valor_unit,
nome_mesoregiao as reg_mesoregiao, 
tipologia_rural_urbana as reg_tipologia, 
hierarquia_urbana as reg_hierarquia, 
scaled_pop_meso as reg_pop,
scaled_pib_meso as reg_pib
'''

# Rebind `df` to the reduced projection (this replaces the SELECT * frame
# loaded earlier) and print the resulting schema.
df = spark.sql(f"SELECT {columns_to_keep} FROM notas_fiscais")
df.printSchema()

root
 |-- prod_ncm: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



In [33]:
# Preview the selected columns; with its default arguments show() prints the
# first 20 rows and truncates long string values.
df.show()

+--------+---------+--------------------+-------------------+-----------------+--------------------+--------------+-------------------+-------------------+
|prod_ncm|prod_unid|          prod_quant|    prod_valor_unit|   reg_mesoregiao|       reg_tipologia|reg_hierarquia|            reg_pop|            reg_pib|
+--------+---------+--------------------+-------------------+-----------------+--------------------+--------------+-------------------+-------------------+
|30049059|      und| -0.8904793211295619| 0.8915553517996679|centro sul baiano|intermediario adj...|  centro local|-1.2079660195805064|-1.6211221394492155|
|30049059|      amp|  1.0627296361867002|-1.0428350252898675|centro sul baiano|              urbano|  centro local|-1.2079660195805064|-1.6211221394492155|
|30049059|      und| -0.8904793211295619| 0.8915553517996679|centro sul baiano|intermediario adj...|  centro local|-1.2079660195805064|-1.6211221394492155|
|30049059|     comp|  0.9672821640983654|-1.3533268043906668|cen

In [34]:
# Index and one-hot encode every categorical (string) column.
#
# For each column `c` two columns are appended to `df`, in this order:
#   - `c_indexed`: numeric label produced by StringIndexer
#   - `c_onehot` : sparse vector produced by OneHotEncoder from `c_indexed`
#
# The five columns used to be handled by five copy-pasted stanzas that
# differed only in the column name; a single loop keeps them in sync and makes
# adding or removing a column a one-line change. The processing order below
# matches the original stanza order, so the resulting column order is the same.
#
# Notes:
#   - OneHotEncoder is used with its defaults; per the PySpark documentation
#     `dropLast` defaults to True, so the last category index is encoded as an
#     all-zero vector rather than getting its own slot.
#   - `df` is reassigned in place, so this cell is not idempotent: re-run the
#     cell that builds `df` before running this one again.
categorical_cols = [
    "prod_ncm",
    "prod_unid",
    "reg_mesoregiao",
    "reg_tipologia",
    "reg_hierarquia",
]

for col_name in categorical_cols:
    indexed_col = f"{col_name}_indexed"
    onehot_col = f"{col_name}_onehot"

    # Map each distinct string value to a numeric index.
    indexer = StringIndexer(inputCol=col_name, outputCol=indexed_col)
    df = indexer.fit(df).transform(df)

    # Expand the numeric index into a one-hot sparse vector.
    encoder = OneHotEncoder(inputCol=indexed_col, outputCol=onehot_col)
    df = encoder.fit(df).transform(df)

# truncate=False prints the full cell contents instead of shortening them.
df.show(truncate=False)

                                                                                

+--------+---------+--------------------+-------------------+-----------------+-----------------------+--------------+-------------------+-------------------+----------------+-------------------+-----------------+-----------------+----------------------+---------------------+---------------------+--------------------+----------------------+---------------------+
|prod_ncm|prod_unid|prod_quant          |prod_valor_unit    |reg_mesoregiao   |reg_tipologia          |reg_hierarquia|reg_pop            |reg_pib            |prod_ncm_indexed|prod_ncm_onehot    |prod_unid_indexed|prod_unid_onehot |reg_mesoregiao_indexed|reg_mesoregiao_onehot|reg_tipologia_indexed|reg_tipologia_onehot|reg_hierarquia_indexed|reg_hierarquia_onehot|
+--------+---------+--------------------+-------------------+-----------------+-----------------------+--------------+-------------------+-------------------+----------------+-------------------+-----------------+-----------------+----------------------+----------------

In [35]:
# Check that the `*_indexed` and `*_onehot` columns were appended after the
# original columns by the encoding step.
df.printSchema()

root
 |-- prod_ncm: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: string (nullable = true)
 |-- reg_tipologia: string (nullable = true)
 |-- reg_hierarquia: string (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)
 |-- prod_ncm_indexed: double (nullable = false)
 |-- prod_ncm_onehot: vector (nullable = true)
 |-- prod_unid_indexed: double (nullable = false)
 |-- prod_unid_onehot: vector (nullable = true)
 |-- reg_mesoregiao_indexed: double (nullable = false)
 |-- reg_mesoregiao_onehot: vector (nullable = true)
 |-- reg_tipologia_indexed: double (nullable = false)
 |-- reg_tipologia_onehot: vector (nullable = true)
 |-- reg_hierarquia_indexed: double (nullable = false)
 |-- reg_hierarquia_onehot: vector (nullable = true)



In [36]:
# Register the encoded DataFrame as a temporary view named "df" so it can be
# queried with Spark SQL below (replaces any existing temp view of that name).
df.createOrReplaceTempView("df")

# Final projection: each one-hot vector column is aliased back to the name of
# the categorical column it was derived from, and the numeric columns are
# selected unchanged. The `*_indexed` columns and the original string columns
# are not selected.
columns_to_keep = '''
prod_ncm_onehot as prod_ncm,
prod_unid_onehot as prod_unid,
prod_quant,
prod_valor_unit,
reg_mesoregiao_onehot as reg_mesoregiao,
reg_tipologia_onehot as reg_tipologia,
reg_hierarquia_onehot as reg_hierarquia,
reg_pop,
reg_pib
'''

# Rebind `df` to the projected result and print its schema.
df = spark.sql(f"SELECT {columns_to_keep} FROM df")
df.printSchema()

root
 |-- prod_ncm: vector (nullable = true)
 |-- prod_unid: vector (nullable = true)
 |-- prod_quant: double (nullable = true)
 |-- prod_valor_unit: double (nullable = true)
 |-- reg_mesoregiao: vector (nullable = true)
 |-- reg_tipologia: vector (nullable = true)
 |-- reg_hierarquia: vector (nullable = true)
 |-- reg_pop: double (nullable = true)
 |-- reg_pib: double (nullable = true)



24/06/07 22:13:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [37]:
# Preview the encoded rows; the one-hot columns are printed as sparse vectors
# in the form (size,[indices],[values]).
df.show()

[Stage 46:>                                                         (0 + 1) / 1]

+-------------------+-----------------+--------------------+-------------------+--------------+-------------+--------------+-------------------+-------------------+
|           prod_ncm|        prod_unid|          prod_quant|    prod_valor_unit|reg_mesoregiao|reg_tipologia|reg_hierarquia|            reg_pop|            reg_pib|
+-------------------+-----------------+--------------------+-------------------+--------------+-------------+--------------+-------------------+-------------------+
|  (6004,[68],[1.0])| (2698,[0],[1.0])| -0.8904793211295619| 0.8915553517996679|(36,[6],[1.0])|    (2,[],[])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|  (6004,[68],[1.0])| (2698,[6],[1.0])|  1.0627296361867002|-1.0428350252898675|(36,[6],[1.0])|(2,[0],[1.0])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|  (6004,[68],[1.0])| (2698,[0],[1.0])| -0.8904793211295619| 0.8915553517996679|(36,[6],[1.0])|    (2,[],[])|(13,[0],[1.0])|-1.2079660195805064|-1.6211221394492155|
|  (6004,[

                                                                                