In [30]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

spark = SparkSession \
    .builder \
    .appName("projeto") \
    .config('spark.master', 'local') \
    .config("spark.jars", "/home/hadoop/Desktop/projeto1/code/BigData/part_1/postgresql-42.7.3.jar") \
    .enableHiveSupport() \
    .getOrCreate()

dfpib = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/projeto") \
    .option("dbtable", "pib_municipios") \
    .option("user", "hadoop") \
    .option("password", "bigdata") \
    .option("driver", "org.postgresql.Driver") \
    .load()

dfpib.printSchema()

root
 |-- ano: integer (nullable = true)
 |-- codigo_regiao: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- codigo_uf: integer (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_uf: string (nullable = true)
 |-- codigo_municipio: integer (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- valor_adicionado_agro: integer (nullable = true)
 |-- valor_adicionado_industria: integer (nullable = true)
 |-- valor_adicionado_servico: integer (nullable = true)
 |-- valor_adicionado_adm: integer (nullable = true)
 |-- valor_adicionado_total: integer (nullable = true)
 |-- pib_concorrentes: integer (nullable = true)
 |-- pop: integer (nullable = true)
 |-- pib: string (nullable = true)



In [31]:
dfnf = spark.read.csv("../../../datasets/notas_fiscais.csv", header=True, sep=',')
dfnf.printSchema()

                                                                                

root
 |-- Numero: string (nullable = true)
 |-- Data_de_emissao: string (nullable = true)
 |-- Situacao: string (nullable = true)
 |-- Valor_total_da_nota: string (nullable = true)
 |-- Nota_referenciada: string (nullable = true)
 |-- Nome_razao_social_emit: string (nullable = true)
 |-- CPF_CNPJ_emit: string (nullable = true)
 |-- Endereco_emit: string (nullable = true)
 |-- Bairro_distrito_emit: string (nullable = true)
 |-- CEP_emit: string (nullable = true)
 |-- Municipio_emit: string (nullable = true)
 |-- Nome_razao_social_dest: string (nullable = true)
 |-- CPF_CNPJ_dest: string (nullable = true)
 |-- Endereco_dest: string (nullable = true)
 |-- Bairro_distrito_dest: string (nullable = true)
 |-- CEP_dest: string (nullable = true)
 |-- Municipio_dest: string (nullable = true)
 |-- Base_de_Calculo_do_ICMS: string (nullable = true)
 |-- Valor_do_ICMS: string (nullable = true)
 |-- Base_de_calculo_do_ICMS_substituicao: string (nullable = true)
 |-- Valor_do_ICMS_substituicao: strin

In [32]:
def get_columns_to_drop(all_columns, columns_to_keep):
    return [column for column in all_columns if column not in columns_to_keep]

In [33]:
dfnf_columns_to_keep = [
'Numero',
'Data_de_emissao',
'Valor_total_da_nota',
'CPF_CNPJ_emit',
'CEP_emit',
'Municipio_emit',
'CPF_CNPJ_dest',
'CEP_dest',
'Municipio_dest',
'Nr_item',
'Cod_prod',
'Descricao_do_Produto_ou_servicos',
'NCM_prod',
'Quant_prod',
'Valor_unit_prod',
'Valor_total_prod',
'Unid_prod'
]

dfnf_all_columns = dfnf.columns

dfnf_columns_to_drop = get_columns_to_drop(dfnf_all_columns, dfnf_columns_to_keep)

dfnf_columns_to_drop

['Situacao',
 'Nota_referenciada',
 'Nome_razao_social_emit',
 'Endereco_emit',
 'Bairro_distrito_emit',
 'Nome_razao_social_dest',
 'Endereco_dest',
 'Bairro_distrito_dest',
 'Base_de_Calculo_do_ICMS',
 'Valor_do_ICMS',
 'Base_de_calculo_do_ICMS_substituicao',
 'Valor_do_ICMS_substituicao',
 'Valor_total_dos_produtos',
 'Valor_do_frete',
 'Valor_do_seguro',
 'Valor_desconto',
 'Valor_outras_despesas_acessorias',
 'Valor_do_IPI',
 'Valor_total_ICMS_UF_dest',
 'Valor_total_ICMS_UF_remet',
 'Valor_BC_ICMS_UF_dest',
 'Aliquota_interna_UF_dest',
 'Aliquota_interestadual_UF_env',
 'Perc_prov_partilha_UF',
 'Perc_ICMS_FCP_UF_dest',
 'Valor_ICMS_FCP_UF_dest',
 'Valor_ICMS_partilha_UF_dest',
 'Valor_ICMS_partilha_UF_remet',
 'CST_prod',
 'CFOP_prod',
 'Valor_desconto_item',
 'BC_ICMS_prod',
 'Valor_ICMS_prod',
 'Aliq_ICMS_prod',
 'BC_ICMS_ST_prod',
 'Valor_ICMS_ST_prod',
 'Aliq_ICMS_ST_prod',
 'Valor_IPI_prod',
 'Aliq_IPI_prod',
 'Valor_PMC_prod',
 'Cod_EAN',
 'Info_Adicional_Item',
 'Informac

In [34]:
dfpib_columns_to_keep = [
	'ano',
	'nome_regiao',
	'sigla_uf',
	'nome_municipio',
	'nome_mesoregiao',
	'nome_microregiao',
	'tipologia_rural_urbana',
	'hierarquia_urbana',
	'pop',
	'pib',
]

dfpib_all_columns = dfpib.columns

dfpib_columns_to_drop = get_columns_to_drop(dfpib_all_columns, dfpib_columns_to_keep)

dfpib_columns_to_drop

['codigo_regiao',
 'codigo_uf',
 'nome_uf',
 'codigo_municipio',
 'valor_adicionado_agro',
 'valor_adicionado_industria',
 'valor_adicionado_servico',
 'valor_adicionado_adm',
 'valor_adicionado_total',
 'pib_concorrentes']

In [35]:
dfnf = dfnf.drop(*dfnf_columns_to_drop)
dfnf.columns

['Numero',
 'Data_de_emissao',
 'Valor_total_da_nota',
 'CPF_CNPJ_emit',
 'CEP_emit',
 'Municipio_emit',
 'CPF_CNPJ_dest',
 'CEP_dest',
 'Municipio_dest',
 'Nr_item',
 'Cod_prod',
 'Descricao_do_Produto_ou_servicos',
 'NCM_prod',
 'Quant_prod',
 'Valor_unit_prod',
 'Valor_total_prod',
 'Unid_prod']

In [36]:
dfpib = dfpib.drop(*dfpib_columns_to_drop)
dfpib.columns

['ano',
 'nome_regiao',
 'sigla_uf',
 'nome_municipio',
 'nome_mesoregiao',
 'nome_microregiao',
 'tipologia_rural_urbana',
 'hierarquia_urbana',
 'pop',
 'pib']

In [37]:
dfnf_columns_to_rename = [
'nf_numero',
'nf_data_emissao',
'nf_valor_total',
'emit_cnpj',
'emit_cep',
'emit_municipio',
'dest_cnpj',
'dest_cep',
'dest_municipio',
'prod_nr_item',
'prod_cod',
'prod_desc',
'prod_ncm',
'prod_quant',
'prod_valor_unit',
'prod_valor_total',
'prod_unid'
]

dfnf = dfnf.toDF(*dfnf_columns_to_rename)

dfnf.columns

['nf_numero',
 'nf_data_emissao',
 'nf_valor_total',
 'emit_cnpj',
 'emit_cep',
 'emit_municipio',
 'dest_cnpj',
 'dest_cep',
 'dest_municipio',
 'prod_nr_item',
 'prod_cod',
 'prod_desc',
 'prod_ncm',
 'prod_quant',
 'prod_valor_unit',
 'prod_valor_total',
 'prod_unid']

In [38]:
dfnf.createOrReplaceTempView("dfnf")
dfpib.createOrReplaceTempView("dfpib")

In [39]:
df_select = spark.sql("SELECT emit_municipio, dest_municipio FROM dfnf")
#df_select.show()

In [40]:
dfnf = dfnf.withColumn("emit_municipio", F.lower(dfnf["emit_municipio"]))
dfnf = dfnf.withColumn("dest_municipio", F.lower(dfnf["dest_municipio"]))

dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"ã", "a"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"á", "a"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"à", "a"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"ê", "e"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"é", "e"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"í", "i"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"ó", "o"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"õ", "o"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"ú", "u"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"ç", "c"))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"-", " "))
dfnf = dfnf.withColumn("emit_municipio", F.regexp_replace(dfnf["emit_municipio"], r"'", " "))

dfnf = dfnf.withColumn("emit_municipio", F.trim(dfnf["emit_municipio"]))

dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"ã", "a"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"á", "a"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"à", "a"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"ê", "e"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"é", "e"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"í", "i"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"ó", "o"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"õ", "o"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"ú", "u"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"ç", "c"))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"-", " "))
dfnf = dfnf.withColumn("dest_municipio", F.regexp_replace(dfnf["dest_municipio"], r"'", " "))

dfnf = dfnf.withColumn("emit_municipio", F.trim(dfnf["emit_municipio"]))

dfnf.createOrReplaceTempView("dfnf")
df_select = spark.sql("SELECT emit_municipio, dest_municipio FROM dfnf")
#df_select.show()

In [41]:
dfpib = dfpib.withColumn("nome_municipio", F.lower(dfpib["nome_municipio"]))

dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"ã", "a"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"á", "a"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"à", "a"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"ê", "e"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"é", "e"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"í", "i"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"ó", "o"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"õ", "o"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"ú", "u"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"ç", "c"))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"-", " "))
dfpib = dfpib.withColumn("nome_municipio", F.regexp_replace(dfpib["nome_municipio"], r"'", " "))

dfpib = dfpib.withColumn("nome_municipio", F.trim(dfpib["nome_municipio"]))

dfpib.createOrReplaceTempView("dfpib")
df_select = spark.sql("SELECT nome_municipio FROM dfpib")
#df_select.show()

In [42]:
dfpib = spark.sql("SELECT * FROM dfpib WHERE ano = 2016")

In [43]:
df = dfnf.join(dfpib, dfnf["emit_municipio"] == dfpib["nome_municipio"], how="left")
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: string (nullable = true)
 |-- nf_valor_total: string (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: string (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- prod_quant: string (nullable = true)
 |-- prod_valor_unit: string (nullable = true)
 |-- prod_valor_total: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |--

In [44]:
df.createOrReplaceTempView("df")
df_select = spark.sql("SELECT emit_municipio, COUNT(*) FROM df WHERE nome_municipio IS NULL AND emit_municipio IS NOT NULL GROUP BY emit_municipio")
#df_select.show()

[Stage 7:>                                                          (0 + 1) / 3]

In [45]:
#df_select.write.format("csv").option("header", True).save("missing_cities_2.csv")

In [46]:
df.createOrReplaceTempView("df")
df = spark.sql("SELECT * FROM df WHERE nome_municipio IS NOT NULL AND emit_municipio IS NOT NULL")
df.createOrReplaceTempView("df")

In [47]:
#df.write.format("csv").option("header", True).save("df_join.csv")

In [48]:
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: string (nullable = true)
 |-- nf_valor_total: string (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: string (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- prod_quant: string (nullable = true)
 |-- prod_valor_unit: string (nullable = true)
 |-- prod_valor_total: string (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |--

In [49]:
#df.show()

In [50]:
from pyspark.sql.types import *
df = df.withColumn("nf_data_emissao", F.to_date(df["nf_data_emissao"]))
df = df.withColumn("nf_valor_total", F.col("nf_valor_total").cast(DecimalType(15,2)))
df = df.withColumn("prod_nr_item", F.col("prod_nr_item").cast(IntegerType()))
df = df.withColumn("prod_quant", F.col("prod_quant").cast(DecimalType(15,2)))
df = df.withColumn("prod_valor_unit", F.col("prod_valor_unit").cast(DecimalType(15,2)))
df = df.withColumn("prod_valor_total", F.col("prod_valor_total").cast(DecimalType(15,2)))
df = df.withColumn("pib", F.col("pib").cast(DecimalType(15,2)))
df.createOrReplaceTempView("df")

In [51]:
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- prod_quant: decimal(15,2) (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- prod_valor_total: decimal(15,2) (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: st

In [52]:
#df.show()

In [53]:
df = spark.sql("SELECT DISTINCT * FROM df")
df.createOrReplaceTempView("df")
#df.show()

In [54]:
df_select = spark.sql("SELECT nome_regiao, COUNT(*) FROM df GROUP BY nome_regiao")
#df_select.show()

In [55]:
df = spark.sql("SELECT * FROM df WHERE nome_regiao='Nordeste'")
df.createOrReplaceTempView("df")

In [56]:
#df.write.format("csv").option("header", True).save("df_join.csv")

In [57]:
df = spark.sql("SELECT *, SUM(pop) OVER (PARTITION BY nome_mesoregiao) AS pop_meso, SUM(pib) OVER (PARTITION BY nome_mesoregiao) AS pib_meso FROM df")
df.createOrReplaceTempView("df")

In [58]:
df = df.withColumn("prod_desc", F.lower(df["prod_desc"]))
df = df.withColumn("prod_unid", F.lower(df["prod_unid"]))
df = df.withColumn("nome_regiao", F.lower(df["nome_regiao"]))
df = df.withColumn("nome_mesoregiao", F.lower(df["nome_mesoregiao"]))
df = df.withColumn("nome_microregiao", F.lower(df["nome_microregiao"]))
df = df.withColumn("tipologia_rural_urbana", F.lower(df["tipologia_rural_urbana"]))
df = df.withColumn("hierarquia_urbana", F.lower(df["hierarquia_urbana"]))

df = df.withColumn("prod_desc", F.trim(df["prod_desc"]))
df = df.withColumn("prod_unid", F.trim(df["prod_unid"]))
df = df.withColumn("nome_regiao", F.trim(df["nome_regiao"]))
df = df.withColumn("nome_mesoregiao", F.trim(df["nome_mesoregiao"]))
df = df.withColumn("nome_microregiao", F.trim(df["nome_microregiao"]))
df = df.withColumn("tipologia_rural_urbana", F.trim(df["tipologia_rural_urbana"]))
df = df.withColumn("hierarquia_urbana", F.trim(df["hierarquia_urbana"]))
df = df.withColumn("nf_numero", F.trim(df["nf_numero"]))
df = df.withColumn("emit_cnpj", F.trim(df["emit_cnpj"]))
df = df.withColumn("emit_cep", F.trim(df["emit_cep"]))
df = df.withColumn("dest_cnpj", F.trim(df["dest_cnpj"]))
df = df.withColumn("dest_cep", F.trim(df["dest_cep"]))
df = df.withColumn("prod_cod", F.trim(df["prod_cod"]))
df = df.withColumn("prod_ncm", F.trim(df["prod_ncm"]))
df = df.withColumn("dest_cnpj", F.trim(df["dest_cnpj"]))
df = df.withColumn("dest_cep", F.trim(df["dest_cep"]))
df = df.withColumn("sigla_uf", F.trim(df["sigla_uf"]))
df = df.withColumn("dest_cep", F.trim(df["dest_cep"]))

df = df.withColumn("log_prod_quant", F.log1p(F.col("prod_quant")))
df = df.withColumn("log_prod_valor_unit", F.log1p(F.col("prod_valor_unit")))

df.createOrReplaceTempView("df")
mean, sttdev = df.select(F.mean("log_prod_quant"), F.stddev("log_prod_quant")).first()
df = df.withColumn("scaled_log_prod_quant", (F.col("log_prod_quant") - mean) / sttdev)

df.createOrReplaceTempView("df")
mean, sttdev = df.select(F.mean("log_prod_valor_unit"), F.stddev("log_prod_valor_unit")).first()
df = df.withColumn("scaled_log_prod_valor_unit", (F.col("log_prod_valor_unit") - mean) / sttdev)

df.createOrReplaceTempView("df")
mean, sttdev = df.select(F.mean("pop"), F.stddev("pop")).first()
df = df.withColumn("scaled_pop", (F.col("pop") - mean) / sttdev)

df.createOrReplaceTempView("df")
mean, sttdev = df.select(F.mean("pib"), F.stddev("pib")).first()
df = df.withColumn("scaled_pib", (F.col("pib") - mean) / sttdev)

df.createOrReplaceTempView("df")

24/05/26 19:21:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:21:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:21:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:21:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:22:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:22:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:22:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:22:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:22:17 WARN RowBasedKeyValueBatch: Calling spill() on

In [59]:
string_columns = [
    'nf_numero',
    'nf_data_emissao',
    'emit_cnpj',
    'emit_cep',
    'emit_municipio',
    'dest_cnpj',
    'dest_cep',
    'dest_municipio',
    'prod_cod',
    'prod_desc',
    'prod_ncm',
    'prod_unid',
    'nome_regiao',
    'sigla_uf',
    'nome_municipio',
    'nome_mesoregiao',
    'nome_microregiao',
    'tipologia_rural_urbana',
    'hierarquia_urbana',
]

number_columns = [
    'nf_valor_total',
    'prod_nr_item',
    'prod_quant',
    'prod_valor_unit',
    'prod_valor_total',
    'ano',
    'pop',
    'pib',
    'scaled_log_prod_quant',
    'scaled_log_prod_valor_unit',
    'scaled_pop',
    'scaled_pib',
]

df_strings_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull(), c 
                           )).alias(c)
                    for c in string_columns])

df_numbers_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull() | \
                            F.isnan(c), c 
                           )).alias(c)
                    for c in number_columns])

In [60]:
#df_strings_null.show()

In [61]:
#df_numbers_null.show()

In [62]:
df_select = spark.sql("SELECT * FROM df LIMIT 5")
#df.show()

In [63]:
df_select = spark.sql("SELECT prod_desc FROM df WHERE prod_quant IS NULL LIMIT 20")
#df_select.show()

In [64]:
df_select = spark.sql("SELECT prod_unid, COUNT(*) FROM df GROUP BY prod_unid ORDER BY COUNT(*) DESC")
#df_select.show()

In [65]:
df = spark.sql("SELECT *, CASE WHEN prod_unid IS NOT NULL THEN prod_unid ELSE 'UND' END prod_unid_nn FROM df")
df.createOrReplaceTempView("df")

In [66]:
df = spark.sql("SELECT *, CASE WHEN prod_quant IS NOT NULL THEN prod_quant ELSE (SELECT AVG(prod_quant) FROM df df2 WHERE df2.prod_ncm = df1.prod_ncm) END prod_quant_nn FROM df df1")
df.createOrReplaceTempView("df")

In [67]:
df = spark.sql("SELECT *, CASE WHEN prod_valor_unit IS NOT NULL THEN prod_valor_unit ELSE (SELECT AVG(prod_valor_unit ) FROM df df2 WHERE df2.prod_ncm = df1.prod_ncm) END prod_valor_unit_nn FROM df df1")
df.createOrReplaceTempView("df")

In [68]:
df = spark.sql("SELECT *, prod_quant_nn * prod_valor_unit_nn as prod_valor_total_nn FROM df")
df.createOrReplaceTempView("df")
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- prod_quant: decimal(15,2) (nullable = true)
 |-- prod_valor_unit: decimal(15,2) (nullable = true)
 |-- prod_valor_total: decimal(15,2) (nullable = true)
 |-- prod_unid: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: st

In [69]:
df_columns_to_drop = [
    'prod_unid',
    'prod_quant',
    'prod_valor_unit',
    'prod_valor_total'
]

df = df.drop(*df_columns_to_drop)
df.createOrReplaceTempView("df")

In [70]:
string_columns = [
    'nf_numero',
    'nf_data_emissao',
    'emit_cnpj',
    'emit_cep',
    'emit_municipio',
    'dest_cnpj',
    'dest_cep',
    'dest_municipio',
    'prod_cod',
    'prod_desc',
    'prod_ncm',
    'nome_regiao',
    'sigla_uf',
    'nome_municipio',
    'nome_mesoregiao',
    'nome_microregiao',
    'tipologia_rural_urbana',
    'hierarquia_urbana',
    'prod_unid_nn'
]

number_columns = [
    'nf_valor_total',
    'prod_nr_item',
    'ano',
    'pop',
    'pib',
    'pop_meso',
    'pib_meso',
    'log_prod_quant',
    'log_prod_valor_unit',
    'prod_quant_nn',
    'prod_valor_unit_nn',
    'prod_valor_total_nn',
    'scaled_log_prod_quant',
    'scaled_log_prod_valor_unit',
    'scaled_pop',
    'scaled_pib',
]

df_strings_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull(), c 
                           )).alias(c)
                    for c in string_columns])

df_numbers_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull() | \
                            F.isnan(c), c 
                           )).alias(c)
                    for c in number_columns])

In [71]:
#df_strings_null.show()

In [72]:
#df_numbers_null.show()

In [73]:
df.printSchema()

root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- pop: integer (nullable = true)
 |-- pib: decimal(15,2) (nullable = t

In [74]:
'''df.createOrReplaceTempView("df")
df_select = spark.sql("SELECT prod_ncm FROM df WHERE prod_valor_unit_nn IS NULL LIMIT 5")
df_select.show()'''

'df.createOrReplaceTempView("df")\ndf_select = spark.sql("SELECT prod_ncm FROM df WHERE prod_valor_unit_nn IS NULL LIMIT 5")\ndf_select.show()'

In [81]:
df.createOrReplaceTempView("df")
df = spark.sql("SELECT * FROM df WHERE prod_ncm!='NA'")

In [83]:
number_columns = [
    'nf_valor_total',
    'prod_nr_item',
    'ano',
    'pop',
    'pib',
    'pop_meso',
    'pib_meso',
    'log_prod_quant',
    'log_prod_valor_unit',
    'prod_quant_nn',
    'prod_valor_unit_nn',
    'prod_valor_total_nn',
    'scaled_log_prod_quant',
    'scaled_log_prod_valor_unit',
    'scaled_pop',
    'scaled_pib',
]

df.createOrReplaceTempView("df")
df_numbers_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull() | \
                            F.isnan(c), c 
                           )).alias(c)
                    for c in number_columns])

df_numbers_null.show()

24/05/26 19:54:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 19:54:36 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+------------------+-------------------+---------------------+--------------------------+----------+----------+
|nf_valor_total|prod_nr_item|ano|pop|pib|pop_meso|pib_meso|log_prod_quant|log_prod_valor_unit|prod_quant_nn|prod_valor_unit_nn|prod_valor_total_nn|scaled_log_prod_quant|scaled_log_prod_valor_unit|scaled_pop|scaled_pib|
+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+------------------+-------------------+---------------------+--------------------------+----------+----------+
|             0|           0|  0|  0|  0|       0|       0|            16|                 14|           16|                14|                 16|                   16|                        14|         0|         0|
+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+----------------

                                                                                

In [77]:
'''df.createOrReplaceTempView("df")
df_select = spark.sql("SELECT prod_ncm FROM df WHERE prod_valor_unit_nn IS NULL LIMIT 5")
df_select.show()'''

'df.createOrReplaceTempView("df")\ndf_select = spark.sql("SELECT prod_ncm FROM df WHERE prod_valor_unit_nn IS NULL LIMIT 5")\ndf_select.show()'

In [78]:
'''spark.sql("CREATE DATABASE IF NOT EXISTS projeto")
spark.sql("USE projeto")
df.write.mode("overwrite").saveAsTable("notas_fiscais")'''

'spark.sql("CREATE DATABASE IF NOT EXISTS projeto")\nspark.sql("USE projeto")\ndf.write.mode("overwrite").saveAsTable("notas_fiscais")'

In [84]:
'''
root
 |-- nf_numero: string (nullable = true)
 |-- nf_data_emissao: date (nullable = true)
 |-- nf_valor_total: decimal(15,2) (nullable = true)
 |-- emit_cnpj: string (nullable = true)
 |-- emit_cep: string (nullable = true)
 |-- emit_municipio: string (nullable = true)
 |-- dest_cnpj: string (nullable = true)
 |-- dest_cep: string (nullable = true)
 |-- dest_municipio: string (nullable = true)
 |-- prod_nr_item: integer (nullable = true)
 |-- prod_cod: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_ncm: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- nome_regiao: string (nullable = true)
 |-- sigla_uf: string (nullable = true)
 |-- nome_municipio: string (nullable = true)
 |-- nome_mesoregiao: string (nullable = true)
 |-- nome_microregiao: string (nullable = true)
 |-- tipologia_rural_urbana: string (nullable = true)
 |-- hierarquia_urbana: string (nullable = true)
 |-- pop: integer (nullable = true)
 |-- pib: decimal(15,2) (nullable = true)
 |-- pop_meso: long (nullable = true)
 |-- pib_meso: decimal(25,2) (nullable = true)
 |-- log_prod_quant: double (nullable = true)
 |-- log_prod_valor_unit: double (nullable = true)
 |-- scaled_log_prod_quant: double (nullable = true)
 |-- scaled_log_prod_valor_unit: double (nullable = true)
 |-- scaled_pop: double (nullable = true)
 |-- scaled_pib: double (nullable = true)
 |-- prod_unid_nn: string (nullable = true)
 |-- prod_quant_nn: decimal(19,6) (nullable = true)
 |-- prod_valor_unit_nn: decimal(19,6) (nullable = true)
 |-- prod_valor_total_nn: decimal(38,11) (nullable = true)

'''

df.createOrReplaceTempView("df")
df_select = spark.sql("SELECT prod_desc, prod_ncm FROM df WHERE log_prod_quant IS NULL")
df_select.show()

                                                                                

+--------------------+--------------------+
|           prod_desc|            prod_ncm|
+--------------------+--------------------+
|"motor el�trico t...|POT�NCIA NOMINAL ...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",48025610,NA,510...|
|placa material ep...|",48025610,NA,510...|
|"tubo galvanizado...|              00MMX6|
|"camisa gola ""o"...|       BASICA DE COR|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",18061000,NA,510...|
|placa material ep...|",48025610,NA,510...|
|placa material ep...|",48025610,NA,510...|
|placa material ep...|",48025610,NA,510...|
+--------------------+--------------------+



In [85]:
df = df.createOrReplaceTempView("df")
df = spark.sql("SELECT * FROM df WHERE log_prod_quant IS NOT NULL AND log_prod_valor_unit IS NOT NULL")

In [86]:
number_columns = [
    'nf_valor_total',
    'prod_nr_item',
    'ano',
    'pop',
    'pib',
    'pop_meso',
    'pib_meso',
    'log_prod_quant',
    'log_prod_valor_unit',
    'prod_quant_nn',
    'prod_valor_unit_nn',
    'prod_valor_total_nn',
    'scaled_log_prod_quant',
    'scaled_log_prod_valor_unit',
    'scaled_pop',
    'scaled_pib',
]

df.createOrReplaceTempView("df")
df_numbers_null = df.select([F.count(F.when(F.col(c).contains('None') | \
                            F.col(c).contains('NULL') | \
                            (F.col(c) == '' ) | \
                            F.col(c).isNull() | \
                            F.isnan(c), c 
                           )).alias(c)
                    for c in number_columns])

df_numbers_null.show()

24/05/26 20:11:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/05/26 20:11:33 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+------------------+-------------------+---------------------+--------------------------+----------+----------+
|nf_valor_total|prod_nr_item|ano|pop|pib|pop_meso|pib_meso|log_prod_quant|log_prod_valor_unit|prod_quant_nn|prod_valor_unit_nn|prod_valor_total_nn|scaled_log_prod_quant|scaled_log_prod_valor_unit|scaled_pop|scaled_pib|
+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+------------------+-------------------+---------------------+--------------------------+----------+----------+
|             0|           0|  0|  0|  0|       0|       0|             0|                  0|            0|                 0|                  0|                    0|                         0|         0|         0|
+--------------+------------+---+---+---+--------+--------+--------------+-------------------+-------------+----------------

                                                                                

In [79]:
#spark.stop()