In [0]:
# Session-level GCS service-account settings, currently disabled (every line is commented out).
# NOTE(review): presumably superseded by the cluster-level Spark config mentioned in the setup notes further down - confirm.
# spark.conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.email", "bucket-bigquery@leafy-environs-409823.iam.gserviceaccount.com")
# spark.conf.set("spark.hadoop.fs.gs.project.id", "leafy-environs-409823")
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.private.key", dbutils.secrets.get(scope='gcp-bucket', key='databricks-bucket-key'))
# spark.conf.set("spark.hadoop.fs.gs.auth.service.account.private.key.id", dbutils.secrets.get(scope='gcp-bucket', key='databricks-bucket-key_id'))

In [0]:
from datetime import datetime
import re
from pathlib import Path
from pyspark.sql.functions import col, when, regexp_replace, lit

In [0]:
# One-time setup of the GCP secrets with the Databricks CLI:
#   databricks configure --token
#   databricks secrets create-scope gcp-bucket --initial-manage-principal users
#   databricks secrets put-secret gcp-bucket databricks-bucket-key
#   databricks secrets put-secret gcp-bucket databricks-bucket-key_id
# Then add the GCS settings to the cluster's Spark config:
#   https://docs.gcp.databricks.com/en/connect/storage/gcs.html

# Show the visible secret scopes and the entries stored in the 'gcp-bucket' scope.
available_scopes = dbutils.secrets.listScopes()
print(available_scopes)
bucket_scope_entries = dbutils.secrets.list('gcp-bucket')
print(bucket_scope_entries)

# List the bucket root; as the last expression, its result is the cell output.
dbutils.fs.ls("gs://bossa-bucket-coutj/")


In [0]:

# Read the raw ("datasus") parquet file selected through the file_name widget.

# Register the widget, then read it back; an empty value falls back to a default file.
dbutils.widgets.text("file_name", "", "Enter file_name")
widget_value = dbutils.widgets.get('file_name')
file_name = widget_value if widget_value else '2016/folha_2016_1.parquet'

# file_name is a path relative to the bucket's raw/ folder.
raw_path = f"gs://bossa-bucket-coutj/raw/{file_name}"
df = spark.read.parquet(raw_path)

In [0]:
# Replace the "-" placeholder with null in every column.
#
# All columns are rewritten in a single select() rather than with one
# withColumn() call per column: every withColumn() adds another projection
# to the query plan, which the PySpark docs warn against doing in a loop.
# Column order and names are unchanged.
df = df.select([
    when(col(name) == '-', None).otherwise(col(name)).alias(name)
    for name in df.columns
])

In [0]:
# Fix currency format and cast string values to float.
# Values such as "1.234,56" become 1234.56: every "." is removed, "," is
# turned into ".", and the result is cast to float.
# Every column except the text ones listed here is treated as numeric.
text_columns = ['nome', 'cargo', 'funcao']
numeric_columns = [name for name in df.columns if name not in text_columns]


def _currency_to_float(name):
    """Return column `name` with dots stripped, "," replaced by "." and cast to float."""
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    without_dots = regexp_replace(col(name), r"\.", "")
    # NOTE(review): "float" is single precision; kept so the output schema does
    # not change, but "double" or a decimal type may suit money better - confirm.
    return regexp_replace(without_dots, ",", ".").cast("float").alias(name)


# One select() instead of two withColumn() calls per column keeps the query
# plan small; text columns pass through untouched and column order is preserved.
df = df.select([
    col(name) if name in text_columns else _currency_to_float(name)
    for name in df.columns
])


In [0]:
# Fix column names: replace each "__" with "_".
# str.replace makes a single left-to-right pass, so a run of three underscores
# still ends up as "__".
renamed_columns = [name.replace("__", "_") for name in df.columns]

# toDF renames all columns positionally in one step.
df = df.toDF(*renamed_columns)

In [0]:
# Define data date.
# The year and month are taken from the file name, which must contain
# "folha_<year>_<month>.parquet" (e.g. "2016/folha_2016_1.parquet").
period_match = re.search(r"(?<=folha_)(.*)(?=\.parquet)", file_name)
if period_match is None:
    # Fail with a clear message instead of an AttributeError on None.group().
    raise ValueError(
        f"file_name {file_name!r} does not contain 'folha_<year>_<month>.parquet'"
    )

date_str = period_match.group(0).split('_')
if len(date_str) < 2:
    # Without a "_" separator there is no month part to read.
    raise ValueError(
        f"file_name {file_name!r} has no '<year>_<month>' part after 'folha_'"
    )

# First day of the month; strptime itself raises ValueError when the year or
# month is not a valid number.
date = datetime.strptime(f"{date_str[0]}/{date_str[1]}/1", "%Y/%m/%d")

# Suffix used later to build the BigQuery table name.
table_name = f"{date_str[0]}_{date_str[1]}"

# Add date column (the same value on every row).
df = df.withColumn('date', lit(date))

In [0]:
# Drop every row whose 'nome' value is null.
df = df.dropna(subset=['nome'])


In [0]:
# Write the dataframe to BigQuery, overwriting the table for this file's period.
bigquery_options = {
    "project": 'leafy-environs-409823',
    "parentProject": 'leafy-environs-409823',
    "temporaryGcsBucket": "bossa-bucket-coutj",
    # Target table: one table per <year>_<month>, named from table_name.
    "table": f"leafy-environs-409823.alerj_ds.alerj_{table_name}",
}

bigquery_writer = df.write.format("bigquery").mode("overwrite").options(**bigquery_options)
bigquery_writer.save()

In [0]:
# Save the cleaned dataframe under trusted/, using the same relative path as
# the raw input and overwriting any existing output.
trusted_path = f'gs://bossa-bucket-coutj/trusted/{file_name}'
df.write.parquet(trusted_path, mode='overwrite')