In [1]:
import os

from dotenv import load_dotenv
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip

from engine import ClimaTempoAPI
from etl import cities_etl, forecast_72h_etl

In [8]:
load_dotenv()

builder = SparkSession.builder\
       .appName('raw_etl')\
       .config('spark.sql.warehouse.dir', 'pyspark_tables')\
       .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
       .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
       .config('spark.databricks.delta.retentionDurationCheck.enabled', False) \
       .config('spark.databricks.delta.schema.autoMerge.enabled', True) \
       .config('spark.databricks.delta.checkLatestSchemaOnRead', True) \
       .config('delta.enableChangeDataFeed', True) \
       .config('spark.sql.shuffle.partitions', 10) \
       .config('spark.databricks.preemption.enabled', True) \
       .config('spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite', True)

spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

23/04/25 03:40:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [9]:
TOKEN = os.environ['TOKEN']
DATABASE = os.environ['DATABASE']
CITIES_RAW_TB = os.environ['CITIES_RAW_TB']
FORECAST_72H_RAW_TB = os.environ['FORECAST_72H_RAW_TB']
FORECAST_72H_BRONZE_TB = os.environ['FORECAST_72H_BRONZE_TB']

spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE}").show()

++
||
++
++



In [6]:
api = ClimaTempoAPI(TOKEN)

cities_etl(api, CITIES_RAW_TB, spark, timeout=60, params={'country': 'BR'})
cities_etl(api, CITIES_RAW_TB, spark, timeout=60, params={'country': 'CO'})

cities_df = spark.read.format('delta').table(CITIES_RAW_TB)
cities_df.toPandas()

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


23/04/25 03:26:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


Unnamed: 0,id,name,state,country
0,143538,Paraíso das Águas,MS,BR
1,1868,Cachoeira Dourada,MG,BR
2,1899,Canápolis,MG,BR
3,1914,Capinópolis,MG,BR
4,1917,Caldas,MG,BR
...,...,...,...,...
6163,995,Soledad,ND,CO
6164,996,Tumaco,ND,CO
6165,997,Tunja,ND,CO
6166,998,Valledupar,ND,CO


In [34]:
# df = forecast_72h_etl(api, FORECAST_72H_RAW_TB, spark, [3477], params={'timeout': 60})

forecast72_df = spark.read.format('delta').table(FORECAST_72H_RAW_TB)
forecast72_df.toPandas()

Unnamed: 0,date,date_br,humidity,pressure,rain,wind,temperature,id,name,state,country,dt_processed,dt_processed_m
0,2023-04-27 18:00:00,27/04/2023 18:00:00,{'humidity': 60.9},{'pressure': 1010.1},{'precipitation': 0.7},"{'velocity': 11, 'direction': 'WSW', 'directio...",{'temperature': 23.6},3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
1,2023-04-27 19:00:00,27/04/2023 19:00:00,{'humidity': 71.5},{'pressure': 1010.7},{'precipitation': 0.7},"{'velocity': 9.6, 'direction': 'SSW', 'directi...",{'temperature': 24.5},3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
2,2023-04-27 20:00:00,27/04/2023 20:00:00,{'humidity': 79.4},{'pressure': 1011.1},{'precipitation': 0.7},"{'velocity': 11, 'direction': 'S', 'directiond...",{'temperature': 25.3},3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
3,2023-04-27 21:00:00,27/04/2023 21:00:00,{'humidity': 86.6},{'pressure': 1011.6},{'precipitation': 0.7},"{'velocity': 14.1, 'direction': 'S', 'directio...",{'temperature': 26},3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
4,2023-04-27 22:00:00,27/04/2023 22:00:00,{'humidity': 90.8},{'pressure': 1012.2},{'precipitation': 0.7},"{'velocity': 13.2, 'direction': 'S', 'directio...",{'temperature': 26},3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2023-04-27 07:00:00,27/04/2023 07:00:00,{'humidity': 68.8},{'pressure': 1010.8},{'precipitation': 0},"{'velocity': 10.9, 'direction': 'NW', 'directi...",{'temperature': 19.9},3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
284,2023-04-27 08:00:00,27/04/2023 08:00:00,{'humidity': 70.3},{'pressure': 1010.9},{'precipitation': 0},"{'velocity': 10.9, 'direction': 'NW', 'directi...",{'temperature': 19.4},3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
285,2023-04-27 09:00:00,27/04/2023 09:00:00,{'humidity': 71.6},{'pressure': 1011.3},{'precipitation': 0},"{'velocity': 11.6, 'direction': 'NW', 'directi...",{'temperature': 19.2},3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
286,2023-04-27 10:00:00,27/04/2023 10:00:00,{'humidity': 71.9},{'pressure': 1012},{'precipitation': 0},"{'velocity': 13, 'direction': 'NW', 'direction...",{'temperature': 18.9},3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01


In [37]:
spark.sql(f"""
SELECT
    TO_TIMESTAMP(`date`, 'yyyy-MM-dd HH:mm:ss') AS dt
    , CAST(GET_JSON_OBJECT(humidity, '$.humidity') AS DOUBLE) AS humidity
    , CAST(GET_JSON_OBJECT(pressure, '$.pressure') AS DOUBLE) AS pressure
    , CAST(GET_JSON_OBJECT(wind, '$.velocity') AS DOUBLE) AS wind_velocity
    , GET_JSON_OBJECT(wind, '$.direction') AS wind_direction
    , CAST(GET_JSON_OBJECT(wind, '$.directiondegrees') AS DOUBLE) AS wind_direction_degrees
    , CAST(GET_JSON_OBJECT(wind, '$.gust') AS DOUBLE) AS wind_gust
    , CAST(GET_JSON_OBJECT(temperature, '$.temperature') AS DOUBLE) AS temperature
    , CAST(`id` AS BIGINT) AS `id`
    , name
    , state
    , country
    , TO_TIMESTAMP(dt_processed, 'yyyy-MM-dd HH:mm:ss.SSSSSS') AS dt_processed
    , TO_DATE(dt_processed_m, 'yyyy-MM-dd') AS dt_processed_m
FROM {FORECAST_72H_RAW_TB}
""").toPandas()
# "{'velocity': 11, 'direction': 'WSW', 'directiondegrees': 262.2, 'gust': 20.2}"

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,dt,humidity,pressure,wind_velocity,wind_direction,wind_direction_degrees,wind_gust,temperature,id,name,state,country,dt_processed,dt_processed_m
0,2023-04-27 18:00:00,60.9,1010.1,11.0,WSW,262.2,20.2,23.6,3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
1,2023-04-27 19:00:00,71.5,1010.7,9.6,SSW,224.2,18.7,24.5,3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
2,2023-04-27 20:00:00,79.4,1011.1,11.0,S,191.2,15.5,25.3,3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
3,2023-04-27 21:00:00,86.6,1011.6,14.1,S,183.7,15.1,26.0,3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
4,2023-04-27 22:00:00,90.8,1012.2,13.2,S,181.8,9.7,26.0,3477,São Paulo,SP,BR,2023-04-25 03:16:27.198626,2023-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2023-04-27 07:00:00,68.8,1010.8,10.9,NW,323.1,15.8,19.9,3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
284,2023-04-27 08:00:00,70.3,1010.9,10.9,NW,321.7,17.6,19.4,3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
285,2023-04-27 09:00:00,71.6,1011.3,11.6,NW,320.9,15.8,19.2,3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
286,2023-04-27 10:00:00,71.9,1012.0,13.0,NW,318.7,16.6,18.9,3477,São Paulo,SP,BR,2023-04-25 03:15:53.497902,2023-04-01
