In [0]:
# spark.conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.email", "bucket-bigquery@leafy-environs-409823.iam.gserviceaccount.com")
# spark.conf.set("spark.hadoop.fs.gs.project.id", "leafy-environs-409823")
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.private.key", dbutils.secrets.get(scope='gcp-bucket', key='databricks-bucket-key'))
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.private.key.id", dbutils.secrets.get(scope='gcp-bucket', key='databricks-bucket-key_id'))

In [0]:
from datetime import datetime
import re
from pathlib import Path
from pyspark.sql.functions import col, when, regexp_replace, lit
from pyspark.sql.types import FloatType
# NOTE: `TypeError` is a Python builtin exception, not a module, so it needs no import.
# The former `import TypeError` raised ModuleNotFoundError and aborted this imports cell.


In [0]:
# One-time setup of the GCP secret scope (Databricks CLI commands):
#   databricks configure --token
#   databricks secrets create-scope gcp-bucket --initial-manage-principal users
#   databricks secrets put-secret gcp-bucket databricks-bucket-key
#   databricks secrets put-secret gcp-bucket databricks-bucket-key_id
# The cluster Spark config must also be filled in as described at
# https://docs.gcp.databricks.com/en/connect/storage/gcs.html

# Print the secret scopes visible to this notebook and the entries of the
# 'gcp-bucket' scope.
available_scopes = dbutils.secrets.listScopes()
print(available_scopes)
gcp_bucket_secrets = dbutils.secrets.list('gcp-bucket')
print(gcp_bucket_secrets)

# List the bucket root; being the last expression of the cell, the listing is shown
# as the cell output (and the call fails if the bucket cannot be read).
dbutils.fs.ls("gs://bossa-bucket-coutj/")


In [0]:

# Read the raw parquet file selected through the "file_name" notebook widget.

# The widget value is a path relative to the bucket's raw/ folder; a blank widget
# falls back to the 2016/folha_2016_1.parquet file.
dbutils.widgets.text("file_name", "", "Enter file_name")
widget_value = dbutils.widgets.get('file_name')
file_name = widget_value if widget_value else '2016/folha_2016_1.parquet'

raw_path = f"gs://bossa-bucket-coutj/raw/{file_name}"
df = spark.read.parquet(raw_path)

In [0]:
# Replace the "-" placeholder with NULL in every column.
#
# All columns are rewritten in one select() instead of one withColumn() call per
# column: every withColumn() adds another projection to the query plan, and the Spark
# documentation warns that calling it in a loop can generate very large plans.
df = df.select([
    when(col(column) == '-', None).otherwise(col(column)).alias(column)
    for column in df.columns
])

In [0]:
# Fix currency format and cast string values to float.
#
# Values such as "1.234,56" are converted by removing every "." and then turning
# the "," into a ".", before casting the result to float.
# Columns listed in text_columns are passed through untouched.
text_columns = ('nome', 'cargo', 'funcao')
# The comprehension variable is deliberately not called `col`, so it cannot be confused
# with the pyspark `col` function used below.
numeric_columns = [name for name in df.columns if name not in text_columns]

# One select() rewrites all numeric columns at once (instead of two withColumn() calls
# per column), which keeps the query plan small.
# r"\." is a raw string: the pattern is a literal dot, and "\." in a normal string is an
# invalid escape sequence that newer Python versions warn about.
# NOTE(review): "float" is Spark's 32-bit FloatType; large monetary values may lose
# cents of precision - consider "double" or a decimal type if the target table allows it.
df = df.select([
    (
        regexp_replace(regexp_replace(col(name), r"\.", ""), ",", ".")
        .cast("float")
        .alias(name)
    )
    if name in numeric_columns
    else col(name)
    for name in df.columns
])


In [0]:
# Define data date
#
# The file name is expected to contain "folha_<year>_<month>.parquet"; the text between
# "folha_" and ".parquet" is split on "_" into [year, month, ...].
period_match = re.search(r"(?<=folha_)(.*)(?=\.parquet)", file_name)
if period_match is None:
    # Without this guard the failure would be an opaque AttributeError on None.
    raise ValueError(
        f"Cannot extract the reference period from file_name {file_name!r}: "
        "expected a name like 'folha_<year>_<month>.parquet'"
    )

date_str = period_match.group(0).split('_')
if len(date_str) < 2:
    raise ValueError(
        f"Cannot extract year and month from file_name {file_name!r}: "
        "expected a name like 'folha_<year>_<month>.parquet'"
    )

# First day of the reference month; strptime raises ValueError on a non-numeric or
# out-of-range year/month.
date = datetime.strptime(f"{date_str[0]}/{date_str[1]}/1", "%Y/%m/%d")
table_name = f"{date_str[0]}_{date_str[1]}"

# Add date column
df = df.withColumn('date', lit(date))

In [0]:
# Output schema: each key is an output column name and its list holds every source
# column name that must be renamed to it.
out_table_schema = {
    "nome": [
        "nome"
    ],
    "cargo": [
        "cargo"
    ],
    "funcao": [
        "funcao"
    ],
    "rendimento_do_funcionario": [
        "rendfunc",
        "rendfuncionario",
        "rendimentos_do_funcionario",
        "rendimento_func",
        "rendimento_funcionario",
        "vencimento",
        "rendimento_do_funcionario"
    ],
    "comissao": [
        "comissao"
    ],
    "representacao_grat_seguranca": [
        "representacao__grat_seguranca",
        "representacao_grat_segur",
        "representacao_grat_segur_qualificacao",
        "represent_grat_segur",
        "representacao_grat_seguranca",
        "representacao__grat_segur_grat_qualificacao_sfam",
        "representacao_grat",
        "representacao_qualificacao_grat_seguranca__s_fam",
        "representacao_grat_seguranca_grat_qualificacao_s_fam",
        "represent_gratseg",
        "representacao_grat_segur_grat_qualificacao_sfam"
    ],
    "incorporado": [
        "incorporado",
        "eignucrorporado"
    ],
    "trienio": [
        "trienio"
    ],
    "bolsa_reforco_escolar": [
        "abono_de_permanencia",
        "bolsa_reforco_escolar_abono_permanencia",
        "bolsa_reforco_escolar",
        "bolsa_reforco_escolar__abono_permanencia"
    ],
    "ferias": [
        "ferias"
    ],
    "redutor": [
        "redutor"
    ],
    "ipalerj_mensalidade": [
        "ipalerj__mensalidade",
        "ipalerj_mensalida_de",
        "ipalerj_mensalidade",
        "ipalerj_mens"
    ],
    "pensao_alimenticia": [
        "pensao_alimenticia"
    ],
    "previdencia_inss": [
        "previdencia_inss",
        "previdencia"
    ],
    "imposto_de_renda": [
        "ir",
        "imposto_de_renda",
        "imp_de_renda"
    ],
    "indenizatoria": [
        "indenizatoria"
    ],
    "rendimento_liquido": [
        "total_liquido",
        "rendimento_liquido"
    ],
    "mes_referencia": [
        "date"
    ]
}

# Reverse lookup built once: source column name -> output column name.
# setdefault keeps the first output column if a source name were ever listed twice.
output_name_by_source = {}
for output_name, source_names in out_table_schema.items():
    for source_name in source_names:
        output_name_by_source.setdefault(source_name, output_name)

# Fix column names
unmapped_columns = [name for name in df.columns if name not in output_name_by_source]
if unmapped_columns:
    raise ValueError(
        f"Found columns not mapped on the schema: {', '.join(unmapped_columns)}"
    )

renamed_columns = [output_name_by_source[name] for name in df.columns]

# Two source columns mapped to the same output column would leave the DataFrame with
# duplicate column names; fail here with a clear message instead.
duplicated_columns = sorted(
    {name for name in renamed_columns if renamed_columns.count(name) > 1}
)
if duplicated_columns:
    raise ValueError(
        "More than one source column maps to the same output column: "
        f"{', '.join(duplicated_columns)}"
    )

# toDF renames every column in one step, positionally.
df = df.toDF(*renamed_columns)

# Adding empty non existing columns.
# Missing columns are added as typed NULLs. The text columns must be strings rather
# than floats, and mes_referencia is a timestamp (it comes from lit(datetime)); every
# other column is numeric and is cast to float like the parsed currency columns.
non_float_types = {
    "nome": "string",
    "cargo": "string",
    "funcao": "string",
    "mes_referencia": "timestamp",
}
missing_columns = [name for name in out_table_schema if name not in df.columns]
df = df.select(
    "*",
    *[
        lit(None).cast(non_float_types.get(name, "float")).alias(name)
        for name in missing_columns
    ],
)

In [0]:
# Discard every row whose "nome" value is null.
df = df.dropna(subset=["nome"])


In [0]:
# Render the cleaned DataFrame in the notebook for a visual check.
display(df)

In [0]:
# Check whether this file's data already exists in the BigQuery table.
#
# Only the first row of df is used as a probe: existing_data holds the BigQuery rows
# whose nome, mes_referencia and rendimento_liquido all equal that row's values.
from pyspark.sql.types import StructType

bq_table = "leafy-environs-409823.alerj_ds.alerj_payslip"

# Fetch the probe row once: each df.first() call runs a Spark job, and separate calls
# are not guaranteed to return the same row.
probe_row = df.first()
if probe_row is None:
    raise ValueError("df has no rows: there is nothing to look up in BigQuery")

try:
    bq_lines = (spark.read.format('bigquery')
        .option('table', bq_table)
        .option("parentProject", 'leafy-environs-409823')
        .load())
    bq_lines.createOrReplaceTempView('bq_table_view')

    # Typed column comparisons replace the previous string-built SQL, which broke on
    # names containing quotes and turned a NULL probe value into the text "None".
    # eqNullSafe makes a NULL probe value match NULL in the table.
    existing_data = bq_lines.filter(
        col("nome").eqNullSafe(lit(probe_row["nome"]))
        & col("mes_referencia").eqNullSafe(lit(probe_row["mes_referencia"]))
        & col("rendimento_liquido").eqNullSafe(lit(probe_row["rendimento_liquido"]))
    )
except Exception as e:
    # Only Py4J errors carry `java_exception`; fall back to the exception itself.
    error_text = str(getattr(e, "java_exception", e))
    if "alerj_ds.alerj_payslip" in error_text:
        # An error mentioning the table is treated as "table does not exist yet":
        # use an empty DataFrame so the write step proceeds.
        # NOTE(review): other errors that mention the table name (e.g. permissions)
        # take this branch too - confirm this is acceptable.
        empty_schema = StructType([])
        existing_data = spark.createDataFrame([], schema=empty_schema)
    else:
        raise Exception(f"Error Reading From BigQuery: {e}") from e


In [0]:
# Append df to the BigQuery table, unless the earlier lookup returned at least one
# matching row.
already_loaded = len(existing_data.take(1)) > 0

if already_loaded:
    print("Data Already Exists")
else:
    bq_writer = (
        df.write.format("bigquery")
        .mode("append")
        .option("project", 'leafy-environs-409823')
        .option("parentProject", 'leafy-environs-409823')
        .option("temporaryGcsBucket", "bossa-bucket-coutj")
        .option("table", "leafy-environs-409823.alerj_ds.alerj_payslip")
    )
    bq_writer.save()

In [0]:
# Save the cleaned DataFrame as parquet under the bucket's trusted/ folder, using the
# same relative path as the raw file and overwriting any existing output.
trusted_path = f'gs://bossa-bucket-coutj/trusted/{file_name}'
df.write.mode('overwrite').parquet(trusted_path)