# Criar a camada silver
## Vamos começar padronizando o nome de colunas, tipagem etc

In [2]:
import pandas as pd
import duckdb
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import os

# Define os caminhos para as camadas
BRONZE_DIR = "data/bronze"
SILVER_DIR = "data/silver"

# Cria o diretório da camada Silver se não existir
os.makedirs(SILVER_DIR, exist_ok=True)

# Carrega os arquivos Bronze em DataFrames
df_meteo_bronze = pd.read_csv(os.path.join(BRONZE_DIR, 'dados_meteomatics.csv'))
df_wind_bronze = pd.read_csv(os.path.join(BRONZE_DIR, 'surfline_bronze_wind.csv'))

print("Dados da Meteomatics (Bronze):")
display(df_meteo_bronze.head(2))

print("\nDados de Vento (Bronze):")
display(df_wind_bronze.head(2))

Dados da Meteomatics (Bronze):


Unnamed: 0,validdate,t_2m:C,wind_dir_10m:d,wind_speed_10m:ms,mean_wave_direction:d,mean_wave_period:s,precip_1h:mm,pressure_2m:hPa,significant_wave_height:m,datetime_sp,datetime_sp_naive
0,2019-12-31 23:00:00+00:00,25.2,20.2,5.9,63.5,6.0,0.0,1007,1.33,2019-12-31 20:00:00-03:00,2019-12-31 20:00:00
1,2020-01-01 00:00:00+00:00,25.2,17.2,6.4,64.0,6.0,0.07,1008,1.31,2019-12-31 21:00:00-03:00,2019-12-31 21:00:00



Dados de Vento (Bronze):


Unnamed: 0,timestamp,utcOffset,speed,direction,directionType,gust,optimalScore,datetime
0,1577847600,-3,32.57164,3.22368,Offshore,39.26906,0,2020-01-01 00:00:00-03:00
1,1577851200,-3,32.57164,3.22368,Offshore,39.26906,0,2020-01-01 01:00:00-03:00


In [3]:
# Inicia uma conexão com o DuckDB (em memória)
con = duckdb.connect()

# "Registra" os DataFrames do Pandas como tabelas que o DuckDB pode consultar
con.register('bronze_meteomatics_raw', df_meteo_bronze)
con.register('bronze_wind_raw', df_wind_bronze)

# Vamos verificar se o DuckDB consegue ver as tabelas
con.execute("SHOW TABLES;").df()

Unnamed: 0,name
0,bronze_meteomatics_raw
1,bronze_wind_raw


In [4]:
# Query para limpar e padronizar os dados de vento
query_stg_wind = """
SELECT
    -- Converte a string de data para um timestamp com fuso horário
    CAST(datetime AS TIMESTAMPTZ) AS datetime_local,
    
    -- Converte para UTC para padronização
    CAST(datetime AS TIMESTAMPTZ) AT TIME ZONE 'UTC' as datetime_utc,

    -- Renomeia e converte os tipos das colunas
    CAST(speed AS DOUBLE) as wind_speed_kph,
    CAST(gust AS DOUBLE) as wind_gust_kph,
    CAST(direction AS DOUBLE) as wind_direction_deg,
    directionType as wind_direction_type,
    CAST(timestamp AS BIGINT) as timestamp_epoch

FROM bronze_wind_raw
"""

# Executa a query e salva o resultado em um novo DataFrame
df_wind_silver = con.execute(query_stg_wind).fetch_df()

print("Preview da Tabela de Vento (Silver):")
display(df_wind_silver.head())
df_wind_silver.info()

Preview da Tabela de Vento (Silver):


Unnamed: 0,datetime_local,datetime_utc,wind_speed_kph,wind_gust_kph,wind_direction_deg,wind_direction_type,timestamp_epoch
0,2020-01-01 00:00:00-03:00,2020-01-01 03:00:00,32.57164,39.26906,3.22368,Offshore,1577847600
1,2020-01-01 01:00:00-03:00,2020-01-01 04:00:00,32.57164,39.26906,3.22368,Offshore,1577851200
2,2020-01-01 02:00:00-03:00,2020-01-01 05:00:00,32.57164,39.26906,3.22368,Offshore,1577854800
3,2020-01-01 03:00:00-03:00,2020-01-01 06:00:00,32.83786,40.74897,1.37415,Offshore,1577858400
4,2020-01-01 04:00:00-03:00,2020-01-01 07:00:00,32.83786,40.74897,1.37415,Offshore,1577862000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype                            
---  ------               --------------  -----                            
 0   datetime_local       26496 non-null  datetime64[us, America/Sao_Paulo]
 1   datetime_utc         26496 non-null  datetime64[us]                   
 2   wind_speed_kph       26496 non-null  float64                          
 3   wind_gust_kph        26496 non-null  float64                          
 4   wind_direction_deg   26496 non-null  float64                          
 5   wind_direction_type  26496 non-null  object                           
 6   timestamp_epoch      26496 non-null  int64                            
dtypes: datetime64[us, America/Sao_Paulo](1), datetime64[us](1), float64(3), int64(1), object(1)
memory usage: 1.4+ MB


In [5]:
# Define o caminho para a nossa primeira Delta Table
silver_wind_path = os.path.join(SILVER_DIR, 'wind')

# Salva o DataFrame como uma Delta Table
# O 'mode='overwrite'' garante que, se rodarmos de novo, a tabela será substituída
write_deltalake(silver_wind_path, df_wind_silver, mode='overwrite')

print(f"Tabela 'wind' salva com sucesso como Delta Table em: '{silver_wind_path}'")

Tabela 'wind' salva com sucesso como Delta Table em: 'data/silver/wind'


In [6]:
# Lendo dados da Delta Table
dt_wind = DeltaTable(silver_wind_path)

# Convertendo para um DataFrame do Pandas para visualização
df_wind_from_delta = dt_wind.to_pandas()

print("Lendo dados de volta da Delta Table 'wind':")
display(df_wind_from_delta.head())

Lendo dados de volta da Delta Table 'wind':


Unnamed: 0,datetime_local,datetime_utc,wind_speed_kph,wind_gust_kph,wind_direction_deg,wind_direction_type,timestamp_epoch
0,2020-01-01 03:00:00+00:00,2020-01-01 03:00:00,32.57164,39.26906,3.22368,Offshore,1577847600
1,2020-01-01 04:00:00+00:00,2020-01-01 04:00:00,32.57164,39.26906,3.22368,Offshore,1577851200
2,2020-01-01 05:00:00+00:00,2020-01-01 05:00:00,32.57164,39.26906,3.22368,Offshore,1577854800
3,2020-01-01 06:00:00+00:00,2020-01-01 06:00:00,32.83786,40.74897,1.37415,Offshore,1577858400
4,2020-01-01 07:00:00+00:00,2020-01-01 07:00:00,32.83786,40.74897,1.37415,Offshore,1577862000


In [8]:
print("--- Iniciando Passos D e E para os dados da Meteomatics ---")

# ==============================================================================
# PASSO D: TRANSFORMAÇÃO COM SQL
# ==============================================================================

# Query para limpar, renomear e padronizar os dados da Meteomatics.
# Usamos a documentação para dar nomes claros e o padrão snake_case.
query_stg_meteomatics = """
SELECT
    -- A documentação diz que os dados são UTC.
    -- Criamos uma coluna UTC padronizada.
    CAST(validdate AS TIMESTAMPTZ) AS datetime_utc,

    -- Renomeia e converte os tipos das colunas
    CAST("t_2m:C" AS DOUBLE) as temperature_2m_celsius,
    CAST("wind_dir_10m:d" AS DOUBLE) as wind_direction_10m_deg,
    CAST("wind_speed_10m:ms" AS DOUBLE) as wind_speed_10m_ms,
    CAST("mean_wave_direction:d" AS DOUBLE) as mean_wave_direction_deg,
    CAST("mean_wave_period:s" AS DOUBLE) as mean_wave_period_s,
    CAST("precip_1h:mm" AS DOUBLE) as precipitation_1h_mm,
    CAST("pressure_2m:hPa" AS DOUBLE) as pressure_msl_hpa,
    CAST("significant_wave_height:m" AS DOUBLE) as significant_wave_height_m

FROM bronze_meteomatics_raw
"""

# Executa a query no DuckDB e salva o resultado em um novo DataFrame
df_meteo_silver = con.execute(query_stg_meteomatics).fetch_df()

print("\nPreview da Tabela da Meteomatics (Silver):")
display(df_meteo_silver.head())
print("\nInfo do DataFrame Silver:")
df_meteo_silver.info()


# ==============================================================================
# PASSO E: SALVANDO COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a nossa nova Delta Table
silver_meteo_path = os.path.join(SILVER_DIR, 'meteomatics')

# Salva o DataFrame Silver como uma Delta Table
write_deltalake(silver_meteo_path, df_meteo_silver, mode='overwrite')

print(f"\nTabela 'meteomatics' salva com sucesso como Delta Table em: '{silver_meteo_path}'")

# Verificação final lendo os dados de volta da tabela recém-criada
print("\nVerificando... Lendo 5 linhas da nova Delta Table:")
display(DeltaTable(silver_meteo_path).to_pandas())

--- Iniciando Passos D e E para os dados da Meteomatics ---

Preview da Tabela da Meteomatics (Silver):


Unnamed: 0,datetime_utc,temperature_2m_celsius,wind_direction_10m_deg,wind_speed_10m_ms,mean_wave_direction_deg,mean_wave_period_s,precipitation_1h_mm,pressure_msl_hpa,significant_wave_height_m
0,2019-12-31 20:00:00-03:00,25.2,20.2,5.9,63.5,6.0,0.0,1007.0,1.33
1,2019-12-31 21:00:00-03:00,25.2,17.2,6.4,64.0,6.0,0.07,1008.0,1.31
2,2019-12-31 22:00:00-03:00,25.3,7.6,5.9,64.2,6.0,0.25,1008.0,1.31
3,2019-12-31 23:00:00-03:00,25.2,4.5,6.1,64.4,6.0,0.05,1008.0,1.3
4,2020-01-01 00:00:00-03:00,25.2,2.6,6.2,64.6,6.0,0.11,1008.0,1.3



Info do DataFrame Silver:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26305 entries, 0 to 26304
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype                            
---  ------                     --------------  -----                            
 0   datetime_utc               26305 non-null  datetime64[us, America/Sao_Paulo]
 1   temperature_2m_celsius     26305 non-null  float64                          
 2   wind_direction_10m_deg     26305 non-null  float64                          
 3   wind_speed_10m_ms          26305 non-null  float64                          
 4   mean_wave_direction_deg    26305 non-null  float64                          
 5   mean_wave_period_s         26305 non-null  float64                          
 6   precipitation_1h_mm        26305 non-null  float64                          
 7   pressure_msl_hpa           26305 non-null  float64                          
 8   significant_wave_height_m  26305 non-nul

Unnamed: 0,datetime_utc,temperature_2m_celsius,wind_direction_10m_deg,wind_speed_10m_ms,mean_wave_direction_deg,mean_wave_period_s,precipitation_1h_mm,pressure_msl_hpa,significant_wave_height_m
0,2019-12-31 23:00:00+00:00,25.2,20.2,5.9,63.5,6.0,0.00,1007.0,1.33
1,2020-01-01 00:00:00+00:00,25.2,17.2,6.4,64.0,6.0,0.07,1008.0,1.31
2,2020-01-01 01:00:00+00:00,25.3,7.6,5.9,64.2,6.0,0.25,1008.0,1.31
3,2020-01-01 02:00:00+00:00,25.2,4.5,6.1,64.4,6.0,0.05,1008.0,1.30
4,2020-01-01 03:00:00+00:00,25.2,2.6,6.2,64.6,6.0,0.11,1008.0,1.30
...,...,...,...,...,...,...,...,...,...
26300,2022-12-31 19:00:00+00:00,25.5,24.5,6.3,76.0,6.4,2.39,1013.0,1.02
26301,2022-12-31 20:00:00+00:00,25.2,16.5,5.2,75.8,6.4,0.22,1013.0,1.01
26302,2022-12-31 21:00:00+00:00,24.8,21.0,4.3,75.7,6.4,0.00,1013.0,1.00
26303,2022-12-31 22:00:00+00:00,24.3,25.8,4.2,75.1,6.4,0.00,1013.0,1.00


# ETAPA 3: RATING - Classificação do Surf
## Transformando os dados de rating/classificação das condições do surf

In [38]:
# ==============================================================================
# CARREGAMENTO DOS DADOS DE RATING
# ==============================================================================

# Carrega os dados de rating do arquivo parquet
df_rating_bronze = pd.read_parquet(os.path.join(BRONZE_DIR, 'surfline_bronze_rating.parquet'))

print("=== ANÁLISE INICIAL DOS DADOS DE RATING ===")
print(f"Shape: {df_rating_bronze.shape}")
print(f"Período: {df_rating_bronze['datetime'].min()} até {df_rating_bronze['datetime'].max()}")
print("\nPrimeiras linhas:")
display(df_rating_bronze.head())

print("\nTipos de dados:")
print(df_rating_bronze.dtypes)

print("\nValores únicos de rating:")
print(df_rating_bronze[['rating.key', 'rating.value']].value_counts().sort_index())

=== ANÁLISE INICIAL DOS DADOS DE RATING ===
Shape: (26304, 5)
Período: 2020-01-01 00:00:00-03:00 até 2022-12-31 23:00:00-03:00

Primeiras linhas:


Unnamed: 0,timestamp,utcOffset,rating.key,rating.value,datetime
0,1577847600,-3,POOR,1,2020-01-01 00:00:00-03:00
1,1577851200,-3,POOR,1,2020-01-01 01:00:00-03:00
2,1577854800,-3,POOR,1,2020-01-01 02:00:00-03:00
3,1577858400,-3,POOR,1,2020-01-01 03:00:00-03:00
4,1577862000,-3,POOR,1,2020-01-01 04:00:00-03:00



Tipos de dados:
timestamp                                   int64
utcOffset                                   int64
rating.key                                 object
rating.value                                int64
datetime        datetime64[ns, America/Sao_Paulo]
dtype: object

Valores únicos de rating:
rating.key    rating.value
FAIR          3               4912
FAIR_TO_GOOD  4                337
POOR          1               9499
POOR_TO_FAIR  2               8107
VERY_POOR     0               3449
Name: count, dtype: int64


In [None]:
# ==============================================================================
# TRANSFORMAÇÃO DOS DADOS DE RATING COM SQL
# ==============================================================================

# Registra a tabela de rating no DuckDB
con.register('bronze_rating_raw', df_rating_bronze)

# Query para limpar e padronizar os dados de rating
query_stg_rating = """
SELECT
    -- Padroniza timestamp para UTC
    -- Os dados já vêm com fuso horário, convertemos para UTC
    CAST(datetime AT TIME ZONE 'UTC' AS TIMESTAMPTZ) AS datetime_utc,
    
    -- Mantém timestamp original para referência
    CAST(timestamp AS BIGINT) as timestamp_epoch,
    
    -- Padroniza nomes das colunas de rating
    "rating.key" as rating_category,
    CAST("rating.value" AS INTEGER) as rating_score,
    
    -- Adiciona informações úteis
    CASE 
        WHEN "rating.key" = 'POOR' THEN 'Ruim'
        WHEN "rating.key" = 'POOR_TO_FAIR' THEN 'Ruim para Regular'
        WHEN "rating.key" = 'FAIR' THEN 'Regular'
        WHEN "rating.key" = 'FAIR_TO_GOOD' THEN 'Regular para Bom'
        WHEN "rating.key" = 'GOOD' THEN 'Bom'
        WHEN "rating.key" = 'VERY_GOOD' THEN 'Muito Bom'
        WHEN "rating.key" = 'GOOD_TO_EPIC' THEN 'Bom para Épico'
        WHEN "rating.key" = 'EPIC' THEN 'Épico'
        ELSE "rating.key"
    END as rating_description_pt,
    
    -- Converte para escala 0-10 (assumindo que 1-5 é a escala original)
    CAST("rating.value" AS DOUBLE) * 2.0 as rating_score_normalized

FROM bronze_rating_raw
ORDER BY datetime_utc
"""

# Executa a query e salva o resultado
df_rating_silver = con.execute(query_stg_rating).fetch_df()

print("=== DADOS DE RATING TRANSFORMADOS (SILVER) ===")
print(f"Shape: {df_rating_silver.shape}")
display(df_rating_silver.head())

print("\nInfo do DataFrame:")
df_rating_silver.info()

print("\nDistribuição dos ratings:")
print(df_rating_silver[['rating_category', 'rating_score', 'rating_description_pt']].value_counts().sort_index())

ParserException: Parser Error: syntax error at or near ")"

In [12]:
# ==============================================================================
# SALVANDO RATING COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a Delta Table de rating
silver_rating_path = os.path.join(SILVER_DIR, 'rating')

# Salva o DataFrame como uma Delta Table
write_deltalake(silver_rating_path, df_rating_silver, mode='overwrite')

print(f"✅ Tabela 'rating' salva com sucesso como Delta Table em: '{silver_rating_path}'")

# Verificação: lendo dados de volta da Delta Table
dt_rating = DeltaTable(silver_rating_path)
df_rating_verification = dt_rating.to_pandas()

print("\n=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===")
print(f"Registros: {len(df_rating_verification)}")
display(df_rating_verification.head())

print("\nEstatísticas dos ratings:")
print(df_rating_verification['rating_score'].describe())

✅ Tabela 'rating' salva com sucesso como Delta Table em: 'data/silver/rating'

=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===
Registros: 26304


Unnamed: 0,datetime_utc,timestamp_epoch,rating_category,rating_score
0,2020-01-01 06:00:00+00:00,1577847600,POOR,1
1,2020-01-01 07:00:00+00:00,1577851200,POOR,1
2,2020-01-01 08:00:00+00:00,1577854800,POOR,1
3,2020-01-01 09:00:00+00:00,1577858400,POOR,1
4,2020-01-01 10:00:00+00:00,1577862000,POOR,1



Estatísticas dos ratings:
count    26304.000000
mean         1.588998
std          0.977091
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: rating_score, dtype: float64


# ETAPA 4: SURF - Altura das Ondas
## Transformando os dados de altura das ondas (min/max)

In [13]:
# ==============================================================================
# CARREGAMENTO DOS DADOS DE SURF
# ==============================================================================

# Carrega os dados de surf do arquivo parquet
df_surf_bronze = pd.read_parquet(os.path.join(BRONZE_DIR, 'surfline_bronze_surf.parquet'))

print("=== ANÁLISE INICIAL DOS DADOS DE SURF ===")
print(f"Shape: {df_surf_bronze.shape}")
print(f"Período: {df_surf_bronze['datetime'].min()} até {df_surf_bronze['datetime'].max()}")
print("\nPrimeiras linhas:")
display(df_surf_bronze.head())

print("\nTipos de dados:")
print(df_surf_bronze.dtypes)

print("\nEstatísticas das alturas:")
print("Surf Min (ft):", df_surf_bronze['surf.min'].describe())
print("\nSurf Max (ft):", df_surf_bronze['surf.max'].describe())

print("\nInfo sobre dados brutos vs processados:")
print("Raw Min:", df_surf_bronze['surf.raw.min'].describe())
print("\nRaw Max:", df_surf_bronze['surf.raw.max'].describe())

=== ANÁLISE INICIAL DOS DADOS DE SURF ===
Shape: (26304, 9)
Período: 2020-01-01 00:00:00-03:00 até 2022-12-31 23:00:00-03:00

Primeiras linhas:


Unnamed: 0,timestamp,utcOffset,surf.min,surf.max,surf.plus,surf.humanRelation,surf.raw.min,surf.raw.max,datetime
0,1579230000,-3,1.2,1.8,False,Chest to overhead,1.21067,1.89166,2020-01-17 00:00:00-03:00
1,1579233600,-3,1.2,1.8,False,Chest to overhead,1.21067,1.89166,2020-01-17 01:00:00-03:00
2,1579237200,-3,1.2,1.8,False,Chest to overhead,1.21067,1.89166,2020-01-17 02:00:00-03:00
3,1579240800,-3,1.2,1.8,False,Chest to overhead,1.14997,1.79683,2020-01-17 03:00:00-03:00
4,1579244400,-3,1.2,1.8,False,Chest to overhead,1.14997,1.79683,2020-01-17 04:00:00-03:00



Tipos de dados:
timestamp                                         int64
utcOffset                                         int64
surf.min                                        float64
surf.max                                        float64
surf.plus                                          bool
surf.humanRelation                               object
surf.raw.min                                    float64
surf.raw.max                                    float64
datetime              datetime64[ns, America/Sao_Paulo]
dtype: object

Estatísticas das alturas:
Surf Min (ft): count    26304.000000
mean         0.854596
std          0.359468
min          0.000000
25%          0.600000
50%          0.900000
75%          0.900000
max          2.400000
Name: surf.min, dtype: float64

Surf Max (ft): count    26304.000000
mean         1.339393
std          0.529009
min          0.300000
25%          0.900000
50%          1.200000
75%          1.500000
max          3.700000
Name: surf.max, dtype: f

In [17]:
# ==============================================================================
# TRANSFORMAÇÃO DOS DADOS DE SURF COM SQL
# ==============================================================================

# Registra a tabela de surf no DuckDB
con.register('bronze_surf_raw', df_surf_bronze)

# Query para limpar e padronizar os dados de surf
query_stg_surf = """
SELECT
    -- Padroniza timestamp para UTC
    CAST(datetime AT TIME ZONE 'UTC' AS TIMESTAMPTZ) AS datetime_utc,
    
    -- Mantém timestamp original para referência
    CAST(timestamp AS BIGINT) as timestamp_epoch,
    
    -- Dados de altura das ondas processados (em metros)
    CAST("surf.min" AS DOUBLE) as wave_height_min_m,
    CAST("surf.max" AS DOUBLE) as wave_height_max_m,
    
    -- Calcula altura média e amplitude
    (CAST("surf.min" AS DOUBLE) + CAST("surf.max" AS DOUBLE)) / 2.0 as wave_height_avg_m,

    CAST("surf.max" AS DOUBLE) - CAST("surf.min" AS DOUBLE) as wave_height_range_m,

    -- Dados brutos originais (sem processamento)
    CAST("surf.raw.min" AS DOUBLE) as wave_height_raw_min_m,
    CAST("surf.raw.max" AS DOUBLE) as wave_height_raw_max_m,
    
    -- Informações adicionais
    CAST("surf.plus" AS BOOLEAN) as has_plus_conditions,
    "surf.humanRelation" as size_description,
    
    -- Categorização da altura das ondas (em metros)
    CASE 
        WHEN CAST("surf.max" AS DOUBLE) <= 0.6 THEN 'Meio Metrinho'
        WHEN CAST("surf.max" AS DOUBLE) < 1 THEN 'Meio Metro'
        WHEN CAST("surf.max" AS DOUBLE) < 1.5 THEN 'Meio Metrão'
        WHEN CAST("surf.max" AS DOUBLE) < 2 THEN 'Um Metro'
        WHEN CAST("surf.max" AS DOUBLE) < 5 THEN '2 Conto'
        ELSE 'Gigante'
    END as wave_size_category

FROM bronze_surf_raw
ORDER BY datetime_utc
"""

# Executa a query e salva o resultado
df_surf_silver = con.execute(query_stg_surf).fetch_df()

print("=== DADOS DE SURF TRANSFORMADOS (SILVER) ===")
print(f"Shape: {df_surf_silver.shape}")
display(df_surf_silver.head())

print("\nInfo do DataFrame:")
df_surf_silver.info()

print("\nDistribuição das categorias de tamanho:")
print(df_surf_silver['wave_size_category'].value_counts())

=== DADOS DE SURF TRANSFORMADOS (SILVER) ===
Shape: (26304, 11)


Unnamed: 0,datetime_utc,timestamp_epoch,wave_height_min_m,wave_height_max_m,wave_height_avg_m,wave_height_range_m,wave_height_raw_min_m,wave_height_raw_max_m,has_plus_conditions,size_description,wave_size_category
0,2020-01-01 03:00:00-03:00,1577847600,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
1,2020-01-01 04:00:00-03:00,1577851200,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
2,2020-01-01 05:00:00-03:00,1577854800,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
3,2020-01-01 06:00:00-03:00,1577858400,0.6,0.9,0.75,0.3,0.57381,0.89657,False,Thigh to waist,Meio Metro
4,2020-01-01 07:00:00-03:00,1577862000,0.6,0.9,0.75,0.3,0.57381,0.89657,False,Thigh to waist,Meio Metro



Info do DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26304 entries, 0 to 26303
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype                            
---  ------                 --------------  -----                            
 0   datetime_utc           26304 non-null  datetime64[us, America/Sao_Paulo]
 1   timestamp_epoch        26304 non-null  int64                            
 2   wave_height_min_m      26304 non-null  float64                          
 3   wave_height_max_m      26304 non-null  float64                          
 4   wave_height_avg_m      26304 non-null  float64                          
 5   wave_height_range_m    26304 non-null  float64                          
 6   wave_height_raw_min_m  26304 non-null  float64                          
 7   wave_height_raw_max_m  26304 non-null  float64                          
 8   has_plus_conditions    26304 non-null  bool                             
 9   size_des

In [19]:
# ==============================================================================
# SALVANDO SURF COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a Delta Table de surf
silver_surf_path = os.path.join(SILVER_DIR, 'surf')

# Salva o DataFrame como uma Delta Table
write_deltalake(silver_surf_path, df_surf_silver, mode='overwrite')

print(f"✅ Tabela 'surf' salva com sucesso como Delta Table em: '{silver_surf_path}'")

# Verificação: lendo dados de volta da Delta Table
dt_surf = DeltaTable(silver_surf_path)
df_surf_verification = dt_surf.to_pandas()

print("\n=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===")
print(f"Registros: {len(df_surf_verification)}")
display(df_surf_verification.head())

print("\nEstatísticas das alturas das ondas:")
print("Altura média (metros):")
print(df_surf_verification['wave_height_avg_m'].describe())

print("\nDistribuição por categoria de tamanho:")
print(df_surf_verification['wave_size_category'].value_counts())

print("\nComparação dados processados vs raw:")
print("Diferença média (min):", (df_surf_verification['wave_height_min_m'] - df_surf_verification['wave_height_raw_min_m']).mean())
print("Diferença média (max):", (df_surf_verification['wave_height_max_m'] - df_surf_verification['wave_height_raw_max_m']).mean())

✅ Tabela 'surf' salva com sucesso como Delta Table em: 'data/silver/surf'

=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===
Registros: 26304


Unnamed: 0,datetime_utc,timestamp_epoch,wave_height_min_m,wave_height_max_m,wave_height_avg_m,wave_height_range_m,wave_height_raw_min_m,wave_height_raw_max_m,has_plus_conditions,size_description,wave_size_category
0,2020-01-01 06:00:00+00:00,1577847600,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
1,2020-01-01 07:00:00+00:00,1577851200,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
2,2020-01-01 08:00:00+00:00,1577854800,0.6,0.9,0.75,0.3,0.52846,0.82572,False,Thigh to waist,Meio Metro
3,2020-01-01 09:00:00+00:00,1577858400,0.6,0.9,0.75,0.3,0.57381,0.89657,False,Thigh to waist,Meio Metro
4,2020-01-01 10:00:00+00:00,1577862000,0.6,0.9,0.75,0.3,0.57381,0.89657,False,Thigh to waist,Meio Metro



Estatísticas das alturas das ondas:
Altura média (metros):
count    26304.000000
mean         1.096995
std          0.441119
min          0.150000
25%          0.750000
50%          1.050000
75%          1.200000
max          3.050000
Name: wave_height_avg_m, dtype: float64

Distribuição por categoria de tamanho:
wave_size_category
Meio Metrão      9501
Um Metro         5712
Meio Metro       4641
2 Conto          3675
Meio Metrinho    2775
Name: count, dtype: int64

Comparação dados processados vs raw:
Diferença média (min): -0.0061551072080292014
Diferença média (max): -0.005530846259124084


# ETAPA 5: SWELLS - Ondulações Detalhadas
## Transformando dados de múltiplos swells por timestamp (altura, período, direção)

In [20]:
# ==============================================================================
# CARREGAMENTO DOS DADOS DE SWELLS
# ==============================================================================

# Carrega os dados de swells do arquivo parquet
df_swells_bronze = pd.read_parquet(os.path.join(BRONZE_DIR, 'surfline_bronze_swells.parquet'))

print("=== ANÁLISE INICIAL DOS DADOS DE SWELLS ===")
print(f"Shape: {df_swells_bronze.shape}")
print(f"Período: {df_swells_bronze['datetime'].min()} até {df_swells_bronze['datetime'].max()}")
print("\nPrimeiras linhas:")
display(df_swells_bronze.head())

print("\nTipos de dados:")
print(df_swells_bronze.dtypes)

print("\nEstatísticas das ondulações:")
print("Altura (metros):", df_swells_bronze['height'].describe())
print("\nPeríodo (segundos):", df_swells_bronze['period'].describe())
print("\nDireção (graus):", df_swells_bronze['direction'].describe())

print("\nContagem de swells por timestamp:")
swells_per_timestamp = df_swells_bronze.groupby('datetime').size()
print(f"Mínimo: {swells_per_timestamp.min()}, Máximo: {swells_per_timestamp.max()}, Média: {swells_per_timestamp.mean():.1f}")

print(f"\nDistribuição de número de swells por timestamp:")
print(swells_per_timestamp.value_counts().sort_index().head(10))

=== ANÁLISE INICIAL DOS DADOS DE SWELLS ===
Shape: (157824, 10)
Período: 2020-01-01 00:00:00-03:00 até 2022-12-31 23:00:00-03:00

Primeiras linhas:


Unnamed: 0,height,period,impact,power,direction,directionMin,total_timestamp,total_utcOffset,total_power,datetime
0,0.0,0,0.0,0.0,0.0,0.0,1577847600,-3,79.52864,2020-01-01 00:00:00-03:00
1,0.06612,7,0.0887,0.03522,148.76266,143.044125,1577847600,-3,79.52864,2020-01-01 00:00:00-03:00
2,1.13461,6,0.28,25.00685,51.30005,34.12848,1577847600,-3,79.52864,2020-01-01 00:00:00-03:00
3,0.81898,9,0.5516,54.48657,83.8667,78.374705,1577847600,-3,79.52864,2020-01-01 00:00:00-03:00
4,0.0,0,0.0,0.0,0.0,0.0,1577847600,-3,79.52864,2020-01-01 00:00:00-03:00



Tipos de dados:
height                                       float64
period                                         int64
impact                                       float64
power                                        float64
direction                                    float64
directionMin                                 float64
total_timestamp                                int64
total_utcOffset                                int64
total_power                                  float64
datetime           datetime64[ns, America/Sao_Paulo]
dtype: object

Estatísticas das ondulações:
Altura (metros): count    157824.000000
mean          0.396676
std           0.569184
min           0.000000
25%           0.000000
50%           0.089040
75%           0.657920
max           4.840630
Name: height, dtype: float64

Período (segundos): count    157824.000000
mean          4.823791
std           5.088782
min           0.000000
25%           0.000000
50%           4.000000
75%           9.0000

In [25]:
# ==============================================================================
# TRANSFORMAÇÃO DOS DADOS DE SWELLS COM SQL - SWELLS INDIVIDUAIS POR TIMESTAMP
# ==============================================================================

# Registra a tabela de swells no DuckDB
con.register('bronze_swells_raw', df_swells_bronze)

# Query para preservar swells individuais ordenados por altura (maior para menor)
query_stg_swells = """
WITH ranked_swells AS (
    SELECT 
        datetime,
        total_timestamp,
        height,
        period,
        direction,
        power,
        impact,
        -- Ranqueia os swells por altura (1 = maior swell)
        ROW_NUMBER() OVER (PARTITION BY datetime ORDER BY height DESC) as swell_rank
    FROM bronze_swells_raw
),
pivoted_swells AS (
    SELECT
        datetime,
        total_timestamp,
        
        -- Swell 1 (dominante - maior altura)
        MAX(CASE WHEN swell_rank = 1 THEN height END) as swell_1_height_m,
        MAX(CASE WHEN swell_rank = 1 THEN period END) as swell_1_period_s,
        MAX(CASE WHEN swell_rank = 1 THEN direction END) as swell_1_direction_deg,
        MAX(CASE WHEN swell_rank = 1 THEN power END) as swell_1_power,
        MAX(CASE WHEN swell_rank = 1 THEN impact END) as swell_1_impact,
        
        -- Swell 2 (segundo maior)
        MAX(CASE WHEN swell_rank = 2 THEN height END) as swell_2_height_m,
        MAX(CASE WHEN swell_rank = 2 THEN period END) as swell_2_period_s,
        MAX(CASE WHEN swell_rank = 2 THEN direction END) as swell_2_direction_deg,
        MAX(CASE WHEN swell_rank = 2 THEN power END) as swell_2_power,
        MAX(CASE WHEN swell_rank = 2 THEN impact END) as swell_2_impact,
        
        -- Swell 3 (terceiro maior)
        MAX(CASE WHEN swell_rank = 3 THEN height END) as swell_3_height_m,
        MAX(CASE WHEN swell_rank = 3 THEN period END) as swell_3_period_s,
        MAX(CASE WHEN swell_rank = 3 THEN direction END) as swell_3_direction_deg,
        MAX(CASE WHEN swell_rank = 3 THEN power END) as swell_3_power,
        MAX(CASE WHEN swell_rank = 3 THEN impact END) as swell_3_impact,
        
        -- Swell 4 (quarto maior)
        MAX(CASE WHEN swell_rank = 4 THEN height END) as swell_4_height_m,
        MAX(CASE WHEN swell_rank = 4 THEN period END) as swell_4_period_s,
        MAX(CASE WHEN swell_rank = 4 THEN direction END) as swell_4_direction_deg,
        MAX(CASE WHEN swell_rank = 4 THEN power END) as swell_4_power,
        MAX(CASE WHEN swell_rank = 4 THEN impact END) as swell_4_impact,
        
        -- Swell 5 (quinto maior)
        MAX(CASE WHEN swell_rank = 5 THEN height END) as swell_5_height_m,
        MAX(CASE WHEN swell_rank = 5 THEN period END) as swell_5_period_s,
        MAX(CASE WHEN swell_rank = 5 THEN direction END) as swell_5_direction_deg,
        MAX(CASE WHEN swell_rank = 5 THEN power END) as swell_5_power,
        MAX(CASE WHEN swell_rank = 5 THEN impact END) as swell_5_impact,
        
        -- Swell 6 (menor)
        MAX(CASE WHEN swell_rank = 6 THEN height END) as swell_6_height_m,
        MAX(CASE WHEN swell_rank = 6 THEN period END) as swell_6_period_s,
        MAX(CASE WHEN swell_rank = 6 THEN direction END) as swell_6_direction_deg,
        MAX(CASE WHEN swell_rank = 6 THEN power END) as swell_6_power,
        MAX(CASE WHEN swell_rank = 6 THEN impact END) as swell_6_impact
        
    FROM ranked_swells
    GROUP BY datetime, total_timestamp
)
SELECT
    -- Padroniza timestamp para UTC
    CAST(datetime AT TIME ZONE 'UTC' AS TIMESTAMPTZ) AS datetime_utc,
    CAST(total_timestamp AS BIGINT) as timestamp_epoch,
    
    -- Todos os swells individuais (preservando valores originais)
    swell_1_height_m, swell_1_period_s, swell_1_direction_deg, swell_1_power, swell_1_impact,
    swell_2_height_m, swell_2_period_s, swell_2_direction_deg, swell_2_power, swell_2_impact,
    swell_3_height_m, swell_3_period_s, swell_3_direction_deg, swell_3_power, swell_3_impact,
    swell_4_height_m, swell_4_period_s, swell_4_direction_deg, swell_4_power, swell_4_impact,
    swell_5_height_m, swell_5_period_s, swell_5_direction_deg, swell_5_power, swell_5_impact,
    swell_6_height_m, swell_6_period_s, swell_6_direction_deg, swell_6_power, swell_6_impact,
    
    -- Métricas derivadas baseadas no swell dominante
    CASE 
        WHEN swell_1_height_m < 0.5 THEN 'Muito Pequeno'
        WHEN swell_1_height_m < 1.0 THEN 'Pequeno'
        WHEN swell_1_height_m < 1.5 THEN 'Médio'
        WHEN swell_1_height_m < 2.0 THEN 'Grande'
        WHEN swell_1_height_m < 3.0 THEN 'Muito Grande'
        ELSE 'Gigante'
    END as primary_swell_category,
    
    -- Direção do swell dominante
    CASE 
        WHEN swell_1_direction_deg BETWEEN 0 AND 45 THEN 'Norte'
        WHEN swell_1_direction_deg BETWEEN 46 AND 135 THEN 'Leste'
        WHEN swell_1_direction_deg BETWEEN 136 AND 225 THEN 'Sul'
        WHEN swell_1_direction_deg BETWEEN 226 AND 315 THEN 'Oeste'
        ELSE 'Norte'
    END as primary_swell_direction,
    
    -- Poder total de todos os swells
    COALESCE(swell_1_power, 0) + COALESCE(swell_2_power, 0) + COALESCE(swell_3_power, 0) + 
    COALESCE(swell_4_power, 0) + COALESCE(swell_5_power, 0) + COALESCE(swell_6_power, 0) as total_swell_power

FROM pivoted_swells
ORDER BY datetime_utc
"""

# Executa a query e salva o resultado
df_swells_silver = con.execute(query_stg_swells).fetch_df()

print("=== DADOS DE SWELLS TRANSFORMADOS (SILVER) - SWELLS INDIVIDUAIS ===")
print(f"Shape: {df_swells_silver.shape}")
display(df_swells_silver.head())

print("\nInfo do DataFrame:")
df_swells_silver.info()

print("\nDistribuição das categorias do swell dominante:")
print(df_swells_silver['primary_swell_category'].value_counts())

print("\nDistribuição das direções do swell dominante:")
print(df_swells_silver['primary_swell_direction'].value_counts())

=== DADOS DE SWELLS TRANSFORMADOS (SILVER) - SWELLS INDIVIDUAIS ===
Shape: (26304, 35)


Unnamed: 0,datetime_utc,timestamp_epoch,swell_1_height_m,swell_1_period_s,swell_1_direction_deg,swell_1_power,swell_1_impact,swell_2_height_m,swell_2_period_s,swell_2_direction_deg,...,swell_5_power,swell_5_impact,swell_6_height_m,swell_6_period_s,swell_6_direction_deg,swell_6_power,swell_6_impact,primary_swell_category,primary_swell_direction,total_swell_power
0,2020-01-01 03:00:00-03:00,1577847600,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
1,2020-01-01 04:00:00-03:00,1577851200,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
2,2020-01-01 05:00:00-03:00,1577854800,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
3,2020-01-01 06:00:00-03:00,1577858400,1.3966,8,63.56409,204.45764,0.7611,0.05794,7,150.1218,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,204.48899
4,2020-01-01 07:00:00-03:00,1577862000,1.3966,8,63.56409,204.45764,0.7611,0.05794,7,150.1218,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,204.48899



Info do DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26304 entries, 0 to 26303
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype                            
---  ------                   --------------  -----                            
 0   datetime_utc             26304 non-null  datetime64[us, America/Sao_Paulo]
 1   timestamp_epoch          26304 non-null  int64                            
 2   swell_1_height_m         26304 non-null  float64                          
 3   swell_1_period_s         26304 non-null  int64                            
 4   swell_1_direction_deg    26304 non-null  float64                          
 5   swell_1_power            26304 non-null  float64                          
 6   swell_1_impact           26304 non-null  float64                          
 7   swell_2_height_m         26304 non-null  float64                          
 8   swell_2_period_s         26304 non-null  int64                    

In [27]:
# ==============================================================================
# SALVANDO SWELLS COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a Delta Table de swells
silver_swells_path = os.path.join(SILVER_DIR, 'swells')

# Salva o DataFrame como uma Delta Table
write_deltalake(silver_swells_path, df_swells_silver, mode='overwrite')

print(f"✅ Tabela 'swells' salva com sucesso como Delta Table em: '{silver_swells_path}'")

# Verificação: lendo dados de volta da Delta Table
dt_swells = DeltaTable(silver_swells_path)
df_swells_verification = dt_swells.to_pandas()

print("\n=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===")
print(f"Registros: {len(df_swells_verification)}")
display(df_swells_verification.head())

print("\nEstatísticas dos swells individuais:")
print("Swell 1 (dominante) - Altura (metros):")
print(df_swells_verification['swell_1_height_m'].describe())

print("\nSwell 1 (dominante) - Período (segundos):")
print(df_swells_verification['swell_1_period_s'].describe())

print("\nSwell 2 (secundário) - Altura (metros):")
print(df_swells_verification['swell_2_height_m'].describe())

print("\nPoder total de todos os swells:")
print(df_swells_verification['total_swell_power'].describe())

print("\nDistribuição das categorias do swell dominante:")
print(df_swells_verification['primary_swell_category'].value_counts())

print("\nDistribuição das direções do swell dominante:")
print(df_swells_verification['primary_swell_direction'].value_counts())

print("\nTop 10 maiores swells dominantes registrados:")
top_swells = df_swells_verification.nlargest(10, 'swell_1_height_m')[['datetime_utc', 'swell_1_height_m', 'swell_1_period_s', 'primary_swell_direction', 'primary_swell_category']]
display(top_swells)

print("\nCorrelação entre altura e período do swell dominante:")
correlation = df_swells_verification['swell_1_height_m'].corr(df_swells_verification['swell_1_period_s'])
print(f"Correlação altura x período (swell 1): {correlation:.3f}")

print("\nComparação entre swells:")
print(f"Altura média - Swell 1: {df_swells_verification['swell_1_height_m'].mean():.2f}m")
print(f"Altura média - Swell 2: {df_swells_verification['swell_2_height_m'].mean():.2f}m")
print(f"Altura média - Swell 3: {df_swells_verification['swell_3_height_m'].mean():.2f}m")

✅ Tabela 'swells' salva com sucesso como Delta Table em: 'data/silver/swells'

=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===
Registros: 26304


Unnamed: 0,datetime_utc,timestamp_epoch,swell_1_height_m,swell_1_period_s,swell_1_direction_deg,swell_1_power,swell_1_impact,swell_2_height_m,swell_2_period_s,swell_2_direction_deg,...,swell_5_power,swell_5_impact,swell_6_height_m,swell_6_period_s,swell_6_direction_deg,swell_6_power,swell_6_impact,primary_swell_category,primary_swell_direction,total_swell_power
0,2020-01-01 06:00:00+00:00,1577847600,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
1,2020-01-01 07:00:00+00:00,1577851200,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
2,2020-01-01 08:00:00+00:00,1577854800,1.13461,6,51.30005,25.00685,0.28,0.81898,9,83.8667,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,79.52864
3,2020-01-01 09:00:00+00:00,1577858400,1.3966,8,63.56409,204.45764,0.7611,0.05794,7,150.1218,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,204.48899
4,2020-01-01 10:00:00+00:00,1577862000,1.3966,8,63.56409,204.45764,0.7611,0.05794,7,150.1218,...,0.0,0.0,0.0,0,0.0,0.0,0.0,Médio,Leste,204.48899



Estatísticas dos swells individuais:
Swell 1 (dominante) - Altura (metros):
count    26304.000000
mean         1.386435
std          0.546920
min          0.415600
25%          0.998000
50%          1.274130
75%          1.647933
max          4.840630
Name: swell_1_height_m, dtype: float64

Swell 1 (dominante) - Período (segundos):
count    26304.000000
mean         8.746807
std          2.080644
min          3.000000
25%          7.000000
50%          8.000000
75%         10.000000
max         16.000000
Name: swell_1_period_s, dtype: float64

Swell 2 (secundário) - Altura (metros):
count    26304.000000
mean         0.650505
std          0.317828
min          0.000000
25%          0.436925
50%          0.635785
75%          0.843337
max          2.594870
Name: swell_2_height_m, dtype: float64

Poder total de todos os swells:
count    26304.000000
mean       304.787407
std        436.954435
min          9.870890
25%         81.600593
50%        159.205785
75%        340.729658
max    

Unnamed: 0,datetime_utc,swell_1_height_m,swell_1_period_s,primary_swell_direction,primary_swell_category
22857,2022-08-10 15:00:00+00:00,4.84063,10,Leste,Gigante
22858,2022-08-10 16:00:00+00:00,4.84063,10,Leste,Gigante
22859,2022-08-10 17:00:00+00:00,4.84063,10,Leste,Gigante
22860,2022-08-10 18:00:00+00:00,4.69253,10,Leste,Gigante
22861,2022-08-10 19:00:00+00:00,4.69253,10,Leste,Gigante
22862,2022-08-10 20:00:00+00:00,4.69253,10,Leste,Gigante
22854,2022-08-10 12:00:00+00:00,4.33076,9,Leste,Gigante
22855,2022-08-10 13:00:00+00:00,4.33076,9,Leste,Gigante
22856,2022-08-10 14:00:00+00:00,4.33076,9,Leste,Gigante
22863,2022-08-10 21:00:00+00:00,4.32496,10,Leste,Gigante



Correlação entre altura e período do swell dominante:
Correlação altura x período (swell 1): 0.351

Comparação entre swells:
Altura média - Swell 1: 1.39m
Altura média - Swell 2: 0.65m
Altura média - Swell 3: 0.24m


# ETAPA 6: TIDES - Dados de Maré
## Transformando os dados de altura da maré e tipos de maré

In [28]:
# ==============================================================================
# CARREGAMENTO DOS DADOS DE TIDES
# ==============================================================================

# Carrega os dados de tides do arquivo parquet
df_tides_bronze = pd.read_parquet(os.path.join(BRONZE_DIR, 'surfline_bronze_tides.parquet'))

print("=== ANÁLISE INICIAL DOS DADOS DE TIDES ===")
print(f"Shape: {df_tides_bronze.shape}")
print(f"Período: {df_tides_bronze['datetime'].min()} até {df_tides_bronze['datetime'].max()}")
print("\nPrimeiras linhas:")
display(df_tides_bronze.head())

print("\nTipos de dados:")
print(df_tides_bronze.dtypes)

print("\nEstatísticas da altura da maré:")
print("Altura (metros):", df_tides_bronze['height'].describe())

print("\nTipos de maré:")
print(df_tides_bronze['type'].value_counts())

print("\nVariação da maré por dia:")
daily_tide_range = df_tides_bronze.groupby(df_tides_bronze['datetime'].dt.date)['height'].agg(['min', 'max', 'mean'])
daily_tide_range['range'] = daily_tide_range['max'] - daily_tide_range['min']
print("Amplitude média diária da maré:", daily_tide_range['range'].mean())
print("Maior amplitude registrada:", daily_tide_range['range'].max())

=== ANÁLISE INICIAL DOS DADOS DE TIDES ===
Shape: (32885, 5)
Período: 2020-01-01 00:00:00-03:00 até 2022-12-31 23:00:00-03:00

Primeiras linhas:


Unnamed: 0,timestamp,utcOffset,type,height,datetime
0,1579230000,-3,NORMAL,0.4,2020-01-17 00:00:00-03:00
1,1579233600,-3,NORMAL,0.36,2020-01-17 01:00:00-03:00
2,1579237200,-3,NORMAL,0.32,2020-01-17 02:00:00-03:00
3,1579240181,-3,LOW,0.3,2020-01-17 02:49:41-03:00
4,1579240800,-3,NORMAL,0.3,2020-01-17 03:00:00-03:00



Tipos de dados:
timestamp                                int64
utcOffset                                int64
type                                    object
height                                 float64
datetime     datetime64[ns, America/Sao_Paulo]
dtype: object

Estatísticas da altura da maré:
Altura (metros): count    32885.000000
mean         0.464787
std          0.210126
min         -0.100000
25%          0.310000
50%          0.470000
75%          0.630000
max          0.950000
Name: height, dtype: float64

Tipos de maré:
type
NORMAL    28004
HIGH       2442
LOW        2439
Name: count, dtype: int64

Variação da maré por dia:
Amplitude média diária da maré: 0.6086496350364963
Maior amplitude registrada: 0.97


In [30]:
# ==============================================================================
# TRANSFORMAÇÃO DOS DADOS DE TIDES COM SQL
# ==============================================================================

# Registra a tabela de tides no DuckDB
con.register('bronze_tides_raw', df_tides_bronze)

# Query para limpar e padronizar os dados de maré
query_stg_tides = """
SELECT
    -- Padroniza timestamp para UTC
    CAST(datetime AT TIME ZONE 'UTC' AS TIMESTAMPTZ) AS datetime_utc,
    
    -- Mantém timestamp original para referência
    CAST(timestamp AS BIGINT) as timestamp_epoch,
    
    -- Dados da maré padronizados
    CAST(height AS DOUBLE) as tide_height_m,
    type as tide_type,
    
    -- Classificação da altura da maré
    CASE 
        WHEN CAST(height AS DOUBLE) < -0.5 THEN 'Maré Muito Baixa'
        WHEN CAST(height AS DOUBLE) < 0.0 THEN 'Maré Baixa'
        WHEN CAST(height AS DOUBLE) < 0.5 THEN 'Maré Média Baixa'
        WHEN CAST(height AS DOUBLE) < 1.0 THEN 'Maré Média'
        WHEN CAST(height AS DOUBLE) < 1.5 THEN 'Maré Média Alta'
        WHEN CAST(height AS DOUBLE) < 2.0 THEN 'Maré Alta'
        ELSE 'Maré Muito Alta'
    END as tide_category,
    
    -- Classificação binária para surf (maré favorável ou não)
    CASE 
        WHEN CAST(height AS DOUBLE) BETWEEN 0.2 AND 1.2 THEN 'Favorável'
        ELSE 'Desfavorável'
    END as surf_condition,
    
    -- Tendência da maré (calculada com LAG)
    CASE 
        WHEN CAST(height AS DOUBLE) > LAG(CAST(height AS DOUBLE)) OVER (ORDER BY datetime) THEN 'Enchendo'
        WHEN CAST(height AS DOUBLE) < LAG(CAST(height AS DOUBLE)) OVER (ORDER BY datetime) THEN 'Secando'
        ELSE 'Estável'
    END as tide_trend,
    
    -- Velocidade de mudança da maré (metros por hora)
    CAST(height AS DOUBLE) - LAG(CAST(height AS DOUBLE)) OVER (ORDER BY datetime) as tide_change_rate_mh

FROM bronze_tides_raw
ORDER BY datetime_utc
"""

# Executa a query e salva o resultado
df_tides_silver = con.execute(query_stg_tides).fetch_df()

print("=== DADOS DE TIDES TRANSFORMADOS (SILVER) ===")
print(f"Shape: {df_tides_silver.shape}")
display(df_tides_silver.head())

print("\nInfo do DataFrame:")
df_tides_silver.info()

print("\nDistribuição das categorias de maré:")
print(df_tides_silver['tide_category'].value_counts())

print("\nDistribuição das condições para surf:")
print(df_tides_silver['surf_condition'].value_counts())

print("\nDistribuição das tendências de maré:")
print(df_tides_silver['tide_trend'].value_counts())

=== DADOS DE TIDES TRANSFORMADOS (SILVER) ===
Shape: (32885, 8)


Unnamed: 0,datetime_utc,timestamp_epoch,tide_height_m,tide_type,tide_category,surf_condition,tide_trend,tide_change_rate_mh
0,2020-01-01 03:00:00-03:00,1577847600,0.29,NORMAL,Maré Média Baixa,Favorável,Estável,
1,2020-01-01 04:00:00-03:00,1577851200,0.26,NORMAL,Maré Média Baixa,Favorável,Secando,-0.03
2,2020-01-01 04:41:56-03:00,1577853716,0.25,LOW,Maré Média Baixa,Favorável,Secando,-0.01
3,2020-01-01 05:00:00-03:00,1577854800,0.25,NORMAL,Maré Média Baixa,Favorável,Estável,0.0
4,2020-01-01 06:00:00-03:00,1577858400,0.33,NORMAL,Maré Média Baixa,Favorável,Enchendo,0.08



Info do DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32885 entries, 0 to 32884
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype                            
---  ------               --------------  -----                            
 0   datetime_utc         32885 non-null  datetime64[us, America/Sao_Paulo]
 1   timestamp_epoch      32885 non-null  int64                            
 2   tide_height_m        32885 non-null  float64                          
 3   tide_type            32885 non-null  object                           
 4   tide_category        32885 non-null  object                           
 5   surf_condition       32885 non-null  object                           
 6   tide_trend           32885 non-null  object                           
 7   tide_change_rate_mh  32884 non-null  float64                          
dtypes: datetime64[us, America/Sao_Paulo](1), float64(2), int64(1), object(4)
memory usage: 2.0+ MB

Distribuiçã

In [31]:
# ==============================================================================
# SALVANDO TIDES COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a Delta Table de tides
silver_tides_path = os.path.join(SILVER_DIR, 'tides')

# Salva o DataFrame como uma Delta Table
write_deltalake(silver_tides_path, df_tides_silver, mode='overwrite')

print(f"✅ Tabela 'tides' salva com sucesso como Delta Table em: '{silver_tides_path}'")

# Verificação: lendo dados de volta da Delta Table
dt_tides = DeltaTable(silver_tides_path)
df_tides_verification = dt_tides.to_pandas()

print("\n=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===")
print(f"Registros: {len(df_tides_verification)}")
display(df_tides_verification.head())

print("\nEstatísticas da altura da maré:")
print(df_tides_verification['tide_height_m'].describe())

print("\nTipos de maré:")
print(df_tides_verification['tide_type'].value_counts())

print("\nCategorias de maré:")
print(df_tides_verification['tide_category'].value_counts())

print("\nCondições para surf:")
print(df_tides_verification['surf_condition'].value_counts())

print("\nTendências de maré:")
print(df_tides_verification['tide_trend'].value_counts())

print("\nEstatísticas da velocidade de mudança da maré:")
print("Taxa de mudança (m/h):")
print(df_tides_verification['tide_change_rate_mh'].describe())

print("\nMarés mais altas registradas:")
top_tides = df_tides_verification.nlargest(10, 'tide_height_m')[['datetime_utc', 'tide_height_m', 'tide_category', 'tide_trend', 'surf_condition']]
display(top_tides)

print("\nMarés mais baixas registradas:")
low_tides = df_tides_verification.nsmallest(10, 'tide_height_m')[['datetime_utc', 'tide_height_m', 'tide_category', 'tide_trend', 'surf_condition']]
display(low_tides)

✅ Tabela 'tides' salva com sucesso como Delta Table em: 'data/silver/tides'

=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===
Registros: 32885


Unnamed: 0,datetime_utc,timestamp_epoch,tide_height_m,tide_type,tide_category,surf_condition,tide_trend,tide_change_rate_mh
0,2020-01-01 06:00:00+00:00,1577847600,0.29,NORMAL,Maré Média Baixa,Favorável,Estável,
1,2020-01-01 07:00:00+00:00,1577851200,0.26,NORMAL,Maré Média Baixa,Favorável,Secando,-0.03
2,2020-01-01 07:41:56+00:00,1577853716,0.25,LOW,Maré Média Baixa,Favorável,Secando,-0.01
3,2020-01-01 08:00:00+00:00,1577854800,0.25,NORMAL,Maré Média Baixa,Favorável,Estável,0.0
4,2020-01-01 09:00:00+00:00,1577858400,0.33,NORMAL,Maré Média Baixa,Favorável,Enchendo,0.08



Estatísticas da altura da maré:
count    32885.000000
mean         0.464787
std          0.210126
min         -0.100000
25%          0.310000
50%          0.470000
75%          0.630000
max          0.950000
Name: tide_height_m, dtype: float64

Tipos de maré:
tide_type
NORMAL    28004
HIGH       2442
LOW        2439
Name: count, dtype: int64

Categorias de maré:
tide_category
Maré Média Baixa    17367
Maré Média          15242
Maré Baixa            276
Name: count, dtype: int64

Condições para surf:
surf_condition
Favorável       29087
Desfavorável     3798
Name: count, dtype: int64

Tendências de maré:
tide_trend
Secando     16030
Enchendo    11170
Estável      5685
Name: count, dtype: int64

Estatísticas da velocidade de mudança da maré:
Taxa de mudança (m/h):
count    32884.000000
mean         0.000009
std          0.087746
min         -0.190000
25%         -0.060000
50%          0.000000
75%          0.040000
max          0.290000
Name: tide_change_rate_mh, dtype: float64

Marés m

Unnamed: 0,datetime_utc,tide_height_m,tide_category,tide_trend,surf_condition
13583,2021-03-29 21:10:44+00:00,0.95,Maré Média,Enchendo,Favorável
24235,2022-03-19 21:24:51+00:00,0.95,Maré Média,Enchendo,Favorável
12702,2021-02-28 21:26:54+00:00,0.94,Maré Média,Enchendo,Favorável
13582,2021-03-29 21:00:00+00:00,0.94,Maré Média,Enchendo,Favorável
24206,2022-03-18 20:51:58+00:00,0.94,Maré Média,Enchendo,Favorável
24207,2022-03-18 21:00:00+00:00,0.94,Maré Média,Estável,Favorável
2070,2020-03-10 21:12:02+00:00,0.93,Maré Média,Enchendo,Favorável
2098,2020-03-11 21:43:54+00:00,0.93,Maré Média,Enchendo,Favorável
12730,2021-03-01 21:59:02+00:00,0.93,Maré Média,Enchendo,Favorável
12731,2021-03-01 22:00:00+00:00,0.93,Maré Média,Estável,Favorável



Marés mais baixas registradas:


Unnamed: 0,datetime_utc,tide_height_m,tide_category,tide_trend,surf_condition
7755,2020-09-16 15:00:00+00:00,-0.1,Maré Baixa,Secando,Desfavorável
7756,2020-09-16 15:04:12+00:00,-0.1,Maré Baixa,Estável,Desfavorável
7784,2020-09-17 15:40:48+00:00,-0.1,Maré Baixa,Secando,Desfavorável
29459,2022-09-09 14:57:48+00:00,-0.1,Maré Baixa,Secando,Desfavorável
29460,2022-09-09 15:00:00+00:00,-0.1,Maré Baixa,Estável,Desfavorável
29488,2022-09-10 15:34:05+00:00,-0.1,Maré Baixa,Secando,Desfavorável
7785,2020-09-17 16:00:00+00:00,-0.09,Maré Baixa,Enchendo,Desfavorável
28603,2022-08-12 15:54:47+00:00,-0.09,Maré Baixa,Secando,Desfavorável
28604,2022-08-12 16:00:00+00:00,-0.09,Maré Baixa,Estável,Desfavorável
17939,2021-08-22 15:33:51+00:00,-0.08,Maré Baixa,Secando,Desfavorável


# ETAPA 7: SUNLIGHT - Dados de Luz Solar ☀️
## Transformando os dados de nascer/pôr do sol e duração da luz solar (ÚLTIMA ETAPA!)

In [41]:
# ==============================================================================
# CARREGAMENTO DOS DADOS DE SUNLIGHT
# ==============================================================================

# Carrega os dados de sunlight do arquivo parquet
df_sunlight_bronze = pd.read_parquet(os.path.join(BRONZE_DIR, 'surfline_bronze_sunlight.parquet'))

print("=== ANÁLISE INICIAL DOS DADOS DE SUNLIGHT ===")
print(f"Shape: {df_sunlight_bronze.shape}")
print("\nPrimeiras linhas:")
display(df_sunlight_bronze.head())

print("\nTipos de dados:")
print(df_sunlight_bronze.dtypes)

print("\nColunas disponíveis:")
print(list(df_sunlight_bronze.columns))

# Análise das variações sazonais
print("\nAnálise temporal:")
df_sunlight_bronze['date'] = pd.to_datetime(df_sunlight_bronze['midnight']).dt.date
df_sunlight_bronze['month'] = pd.to_datetime(df_sunlight_bronze['midnight']).dt.month
print("Período dos dados:", df_sunlight_bronze['date'].min(), "até", df_sunlight_bronze['date'].max())
print("Registros por mês:", df_sunlight_bronze['month'].value_counts().sort_index())

=== ANÁLISE INICIAL DOS DADOS DE SUNLIGHT ===
Shape: (1096, 10)

Primeiras linhas:


Unnamed: 0,midnight,midnightUTCOffset,dawn,dawnUTCOffset,sunrise,sunriseUTCOffset,sunset,sunsetUTCOffset,dusk,duskUTCOffset
0,2020-01-17 00:00:00-03:00,-3,2020-01-17 05:08:52-03:00,-3,2020-01-17 05:34:44-03:00,-3,2020-01-17 19:14:24-03:00,-3,2020-01-17 19:40:16-03:00,-3
1,2020-01-18 00:00:00-03:00,-3,2020-01-18 05:09:44-03:00,-3,2020-01-18 05:35:33-03:00,-3,2020-01-18 19:14:15-03:00,-3,2020-01-18 19:40:04-03:00,-3
2,2020-01-19 00:00:00-03:00,-3,2020-01-19 05:10:37-03:00,-3,2020-01-19 05:36:22-03:00,-3,2020-01-19 19:14:04-03:00,-3,2020-01-19 19:39:50-03:00,-3
3,2020-01-20 00:00:00-03:00,-3,2020-01-20 05:11:30-03:00,-3,2020-01-20 05:37:12-03:00,-3,2020-01-20 19:13:52-03:00,-3,2020-01-20 19:39:34-03:00,-3
4,2020-01-21 00:00:00-03:00,-3,2020-01-21 05:12:23-03:00,-3,2020-01-21 05:38:02-03:00,-3,2020-01-21 19:13:38-03:00,-3,2020-01-21 19:39:18-03:00,-3



Tipos de dados:
midnight             datetime64[ns, America/Sao_Paulo]
midnightUTCOffset                                int64
dawn                 datetime64[ns, America/Sao_Paulo]
dawnUTCOffset                                    int64
sunrise              datetime64[ns, America/Sao_Paulo]
sunriseUTCOffset                                 int64
sunset               datetime64[ns, America/Sao_Paulo]
sunsetUTCOffset                                  int64
dusk                 datetime64[ns, America/Sao_Paulo]
duskUTCOffset                                    int64
dtype: object

Colunas disponíveis:
['midnight', 'midnightUTCOffset', 'dawn', 'dawnUTCOffset', 'sunrise', 'sunriseUTCOffset', 'sunset', 'sunsetUTCOffset', 'dusk', 'duskUTCOffset']

Análise temporal:
Período dos dados: 2020-01-01 até 2022-12-31
Registros por mês: month
1     93
2     85
3     93
4     90
5     93
6     90
7     93
8     93
9     90
10    93
11    90
12    93
Name: count, dtype: int64


In [42]:
# ==============================================================================
# TRANSFORMAÇÃO DOS DADOS DE SUNLIGHT COM SQL
# ==============================================================================

# Registra a tabela de sunlight no DuckDB
con.register('bronze_sunlight_raw', df_sunlight_bronze)

# Query para limpar e padronizar os dados de luz solar
query_stg_sunlight = """
SELECT
    -- Data de referência (meia-noite como referência do dia)
    CAST(midnight AS TIMESTAMPTZ) AS date_utc,
    CAST(midnight AS DATE) as date_local,
    
    -- Timestamps de eventos solares (já estão em timezone local)
    CAST(dawn AS TIMESTAMPTZ) AS dawn_utc,
    CAST(sunrise AS TIMESTAMPTZ) AS sunrise_utc,
    CAST(sunset AS TIMESTAMPTZ) AS sunset_utc,
    CAST(dusk AS TIMESTAMPTZ) AS dusk_utc,
    
    -- Mantém horários locais para referência
    dawn as dawn_local,
    sunrise as sunrise_local,
    sunset as sunset_local,
    dusk as dusk_local,
    
    -- Calcula durações em horas
    EXTRACT(EPOCH FROM (sunset - sunrise)) / 3600.0 as daylight_duration_hours,
    EXTRACT(EPOCH FROM (dusk - dawn)) / 3600.0 as civil_twilight_duration_hours,
    
    -- Calcula horários do meio-dia solar (ponto médio entre sunrise e sunset)
    sunrise + (sunset - sunrise) / 2 as solar_noon_utc,
    
    -- Classificação da duração do dia
    CASE 
        WHEN EXTRACT(EPOCH FROM (sunset - sunrise)) / 3600.0 < 10 THEN 'Dia Curto'
        WHEN EXTRACT(EPOCH FROM (sunset - sunrise)) / 3600.0 < 12 THEN 'Dia Normal'
        WHEN EXTRACT(EPOCH FROM (sunset - sunrise)) / 3600.0 < 14 THEN 'Dia Longo'
        ELSE 'Dia Muito Longo'
    END as daylight_category,
    
    -- Condições para surf baseadas na luz
    CASE 
        WHEN EXTRACT(EPOCH FROM (sunset - sunrise)) / 3600.0 > 11 THEN 'Bom para Surf'
        ELSE 'Limitado para Surf'
    END as surf_light_condition,
    
    -- Mês para análises sazonais
    EXTRACT(MONTH FROM midnight) as month,
    
    -- Estação do ano (hemisfério sul)
    CASE 
        WHEN EXTRACT(MONTH FROM midnight) IN (12, 1, 2) THEN 'Verão'
        WHEN EXTRACT(MONTH FROM midnight) IN (3, 4, 5) THEN 'Outono'
        WHEN EXTRACT(MONTH FROM midnight) IN (6, 7, 8) THEN 'Inverno'
        ELSE 'Primavera'
    END as season

FROM bronze_sunlight_raw
ORDER BY date_utc
"""

# Executa a query e salva o resultado
df_sunlight_silver = con.execute(query_stg_sunlight).fetch_df()

print("=== DADOS DE SUNLIGHT TRANSFORMADOS (SILVER) ===")
print(f"Shape: {df_sunlight_silver.shape}")
display(df_sunlight_silver.head())

print("\nInfo do DataFrame:")
df_sunlight_silver.info()

print("\nDistribuição por estação:")
print(df_sunlight_silver['season'].value_counts())

print("\nDistribuição da duração da luz:")
print(df_sunlight_silver['daylight_category'].value_counts())

print("\nCondições para surf:")
print(df_sunlight_silver['surf_light_condition'].value_counts())

=== DADOS DE SUNLIGHT TRANSFORMADOS (SILVER) ===
Shape: (1096, 17)


Unnamed: 0,date_utc,date_local,dawn_utc,sunrise_utc,sunset_utc,dusk_utc,dawn_local,sunrise_local,sunset_local,dusk_local,daylight_duration_hours,civil_twilight_duration_hours,solar_noon_utc,daylight_category,surf_light_condition,month,season
0,2020-01-01 00:00:00-03:00,2020-01-01,2020-01-01 04:56:10-03:00,2020-01-01 05:22:38-03:00,2020-01-01 19:13:25-03:00,2020-01-01 19:39:53-03:00,2020-01-01 04:56:10-03:00,2020-01-01 05:22:38-03:00,2020-01-01 19:13:25-03:00,2020-01-01 19:39:53-03:00,13.846389,14.728611,2020-01-01 12:18:01.500000-03:00,Dia Longo,Bom para Surf,1,Verão
1,2020-01-02 00:00:00-03:00,2020-01-02,2020-01-02 04:56:52-03:00,2020-01-02 05:23:18-03:00,2020-01-02 19:13:40-03:00,2020-01-02 19:40:06-03:00,2020-01-02 04:56:52-03:00,2020-01-02 05:23:18-03:00,2020-01-02 19:13:40-03:00,2020-01-02 19:40:06-03:00,13.839444,14.720556,2020-01-02 12:18:29-03:00,Dia Longo,Bom para Surf,1,Verão
2,2020-01-03 00:00:00-03:00,2020-01-03,2020-01-03 04:57:34-03:00,2020-01-03 05:23:59-03:00,2020-01-03 19:13:53-03:00,2020-01-03 19:40:18-03:00,2020-01-03 04:57:34-03:00,2020-01-03 05:23:59-03:00,2020-01-03 19:13:53-03:00,2020-01-03 19:40:18-03:00,13.831667,14.712222,2020-01-03 12:18:56-03:00,Dia Longo,Bom para Surf,1,Verão
3,2020-01-04 00:00:00-03:00,2020-01-04,2020-01-04 04:58:18-03:00,2020-01-04 05:24:41-03:00,2020-01-04 19:14:05-03:00,2020-01-04 19:40:28-03:00,2020-01-04 04:58:18-03:00,2020-01-04 05:24:41-03:00,2020-01-04 19:14:05-03:00,2020-01-04 19:40:28-03:00,13.823333,14.702778,2020-01-04 12:19:23-03:00,Dia Longo,Bom para Surf,1,Verão
4,2020-01-05 00:00:00-03:00,2020-01-05,2020-01-05 04:59:03-03:00,2020-01-05 05:25:24-03:00,2020-01-05 19:14:16-03:00,2020-01-05 19:40:37-03:00,2020-01-05 04:59:03-03:00,2020-01-05 05:25:24-03:00,2020-01-05 19:14:16-03:00,2020-01-05 19:40:37-03:00,13.814444,14.692778,2020-01-05 12:19:50-03:00,Dia Longo,Bom para Surf,1,Verão



Info do DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype                            
---  ------                         --------------  -----                            
 0   date_utc                       1096 non-null   datetime64[us, America/Sao_Paulo]
 1   date_local                     1096 non-null   datetime64[us]                   
 2   dawn_utc                       1096 non-null   datetime64[us, America/Sao_Paulo]
 3   sunrise_utc                    1096 non-null   datetime64[us, America/Sao_Paulo]
 4   sunset_utc                     1096 non-null   datetime64[us, America/Sao_Paulo]
 5   dusk_utc                       1096 non-null   datetime64[us, America/Sao_Paulo]
 6   dawn_local                     1096 non-null   datetime64[us, America/Sao_Paulo]
 7   sunrise_local                  1096 non-null   datetime64[us, America/Sao_Paulo]
 8   sunset_l

In [44]:
# ==============================================================================
# SALVANDO SUNLIGHT COMO DELTA TABLE
# ==============================================================================

# Define o caminho para a Delta Table de sunlight
silver_sunlight_path = os.path.join(SILVER_DIR, 'sunlight')

# Salva o DataFrame como uma Delta Table
write_deltalake(silver_sunlight_path, df_sunlight_silver, mode='overwrite')

print(f"✅ Tabela 'sunlight' salva com sucesso como Delta Table em: '{silver_sunlight_path}'")

# Verificação: lendo dados de volta da Delta Table
dt_sunlight = DeltaTable(silver_sunlight_path)
df_sunlight_verification = dt_sunlight.to_pandas()

print("\n=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===")
print(f"Registros: {len(df_sunlight_verification)}")
display(df_sunlight_verification.head())

print("\nEstatísticas da duração da luz solar:")
print("Duração da luz do dia (horas):")
print(df_sunlight_verification['daylight_duration_hours'].describe())

print("\nDuração do crepúsculo civil (horas):")
print(df_sunlight_verification['civil_twilight_duration_hours'].describe())

print("\nDistribuição por estação:")
print(df_sunlight_verification['season'].value_counts())

print("\nCategorias de duração da luz:")
print(df_sunlight_verification['daylight_category'].value_counts())

print("\nCondições para surf baseadas na luz:")
print(df_sunlight_verification['surf_light_condition'].value_counts())

print("\nVariação sazonal da duração da luz:")
seasonal_light = df_sunlight_verification.groupby('season')['daylight_duration_hours'].agg(['mean', 'min', 'max'])
display(seasonal_light)

print("\nDias com maior duração de luz (verão):")
longest_days = df_sunlight_verification.nlargest(5, 'daylight_duration_hours')[['date_local', 'daylight_duration_hours', 'season', 'daylight_category']]
display(longest_days)

print("\nDias com menor duração de luz (inverno):")
shortest_days = df_sunlight_verification.nsmallest(5, 'daylight_duration_hours')[['date_local', 'daylight_duration_hours', 'season', 'daylight_category']]
display(shortest_days)

✅ Tabela 'sunlight' salva com sucesso como Delta Table em: 'data/silver/sunlight'

=== VERIFICAÇÃO - LENDO DA DELTA TABLE ===
Registros: 1096


Unnamed: 0,date_utc,date_local,dawn_utc,sunrise_utc,sunset_utc,dusk_utc,dawn_local,sunrise_local,sunset_local,dusk_local,daylight_duration_hours,civil_twilight_duration_hours,solar_noon_utc,daylight_category,surf_light_condition,month,season
0,2020-01-01 03:00:00+00:00,2020-01-01,2020-01-01 07:56:10+00:00,2020-01-01 08:22:38+00:00,2020-01-01 22:13:25+00:00,2020-01-01 22:39:53+00:00,2020-01-01 07:56:10+00:00,2020-01-01 08:22:38+00:00,2020-01-01 22:13:25+00:00,2020-01-01 22:39:53+00:00,13.846389,14.728611,2020-01-01 15:18:01.500000+00:00,Dia Longo,Bom para Surf,1,Verão
1,2020-01-02 03:00:00+00:00,2020-01-02,2020-01-02 07:56:52+00:00,2020-01-02 08:23:18+00:00,2020-01-02 22:13:40+00:00,2020-01-02 22:40:06+00:00,2020-01-02 07:56:52+00:00,2020-01-02 08:23:18+00:00,2020-01-02 22:13:40+00:00,2020-01-02 22:40:06+00:00,13.839444,14.720556,2020-01-02 15:18:29+00:00,Dia Longo,Bom para Surf,1,Verão
2,2020-01-03 03:00:00+00:00,2020-01-03,2020-01-03 07:57:34+00:00,2020-01-03 08:23:59+00:00,2020-01-03 22:13:53+00:00,2020-01-03 22:40:18+00:00,2020-01-03 07:57:34+00:00,2020-01-03 08:23:59+00:00,2020-01-03 22:13:53+00:00,2020-01-03 22:40:18+00:00,13.831667,14.712222,2020-01-03 15:18:56+00:00,Dia Longo,Bom para Surf,1,Verão
3,2020-01-04 03:00:00+00:00,2020-01-04,2020-01-04 07:58:18+00:00,2020-01-04 08:24:41+00:00,2020-01-04 22:14:05+00:00,2020-01-04 22:40:28+00:00,2020-01-04 07:58:18+00:00,2020-01-04 08:24:41+00:00,2020-01-04 22:14:05+00:00,2020-01-04 22:40:28+00:00,13.823333,14.702778,2020-01-04 15:19:23+00:00,Dia Longo,Bom para Surf,1,Verão
4,2020-01-05 03:00:00+00:00,2020-01-05,2020-01-05 07:59:03+00:00,2020-01-05 08:25:24+00:00,2020-01-05 22:14:16+00:00,2020-01-05 22:40:37+00:00,2020-01-05 07:59:03+00:00,2020-01-05 08:25:24+00:00,2020-01-05 22:14:16+00:00,2020-01-05 22:40:37+00:00,13.814444,14.692778,2020-01-05 15:19:50+00:00,Dia Longo,Bom para Surf,1,Verão



Estatísticas da duração da luz solar:
Duração da luz do dia (horas):
count    1096.000000
mean       12.105051
std         1.201856
min        10.399167
25%        10.934028
50%        12.075139
75%        13.272083
max        13.881111
Name: daylight_duration_hours, dtype: float64

Duração do crepúsculo civil (horas):
count    1096.000000
mean       12.926851
std         1.211882
min        11.254722
25%        11.746806
50%        12.852083
75%        14.098889
max        14.767222
Name: civil_twilight_duration_hours, dtype: float64

Distribuição por estação:
season
Outono       276
Inverno      276
Primavera    273
Verão        271
Name: count, dtype: int64

Categorias de duração da luz:
daylight_category
Dia Longo     564
Dia Normal    532
Name: count, dtype: int64

Condições para surf baseadas na luz:
surf_light_condition
Bom para Surf         803
Limitado para Surf    293
Name: count, dtype: int64

Variação sazonal da duração da luz:


Unnamed: 0_level_0,mean,min,max
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Inverno,10.732735,10.399167,11.535278
Outono,11.475293,10.518611,12.656389
Primavera,12.721747,11.5475,13.735833
Verão,13.522819,12.669444,13.881111



Dias com maior duração de luz (verão):


Unnamed: 0,date_local,daylight_duration_hours,season,daylight_category
720,2021-12-21,13.881111,Verão,Dia Longo
355,2020-12-21,13.880833,Verão,Dia Longo
356,2020-12-22,13.880833,Verão,Dia Longo
721,2021-12-22,13.880833,Verão,Dia Longo
1085,2022-12-21,13.880833,Verão,Dia Longo



Dias com menor duração de luz (inverno):


Unnamed: 0,date_local,daylight_duration_hours,season,daylight_category
171,2020-06-20,10.399167,Inverno,Dia Normal
537,2021-06-21,10.399167,Inverno,Dia Normal
902,2022-06-21,10.399167,Inverno,Dia Normal
172,2020-06-21,10.399444,Inverno,Dia Normal
538,2021-06-22,10.399444,Inverno,Dia Normal


# 🎉 CAMADA SILVER COMPLETA! 
## Resumo das 7 tabelas criadas na camada silver

In [46]:
# ==============================================================================
# RESUMO FINAL DA CAMADA SILVER
# ==============================================================================

print("🎉" * 50)
print("               CAMADA SILVER COMPLETA!")
print("🎉" * 50)

print("\n✅ TABELAS DELTA CRIADAS NA CAMADA SILVER:")
print("=" * 60)

# Lista todas as tabelas criadas
import os
silver_tables = []
for item in os.listdir(SILVER_DIR):
    if os.path.isdir(os.path.join(SILVER_DIR, item)):
        # Verifica se é uma Delta Table válida
        delta_log_path = os.path.join(SILVER_DIR, item, '_delta_log')
        if os.path.exists(delta_log_path):
            table_path = os.path.join(SILVER_DIR, item)
            dt = DeltaTable(table_path)
            df = dt.to_pandas()
            silver_tables.append({
                'Tabela': item,
                'Registros': len(df),
                'Colunas': len(df.columns),
                'Caminho': table_path
            })

# Exibe o resumo
for i, table in enumerate(silver_tables, 1):
    print(f"{i}. 📊 {table['Tabela'].upper()}")
    print(f"   Registros: {table['Registros']:,}")
    print(f"   Colunas: {table['Colunas']}")
    print(f"   Caminho: {table['Caminho']}")
    print()

print("=" * 60)
print("🏄‍♂️ DADOS PRONTOS PARA ANÁLISES DE SURF!")
print("📈 PRÓXIMOS PASSOS: Criar a camada GOLD com métricas de negócio")
print("=" * 60)

# Estatísticas gerais
total_records = sum([table['Registros'] for table in silver_tables])
total_columns = sum([table['Colunas'] for table in silver_tables])

print(f"\n📊 ESTATÍSTICAS GERAIS:")
print(f"   Total de tabelas: {len(silver_tables)}")
print(f"   Total de registros: {total_records:,}")
print(f"   Total de colunas: {total_columns}")
print(f"   Período dos dados: 2020-01-01 até 2022-12-31")
print(f"   Formato: Delta Lake")
print(f"   Granularidade temporal: Hora em hora")

print("\n🌊 READY TO SURF THE DATA! 🏄‍♂️")

🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉
               CAMADA SILVER COMPLETA!
🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉

✅ TABELAS DELTA CRIADAS NA CAMADA SILVER:
1. 📊 RATING
   Registros: 26,304
   Colunas: 4
   Caminho: data/silver/rating

2. 📊 SURF
   Registros: 26,304
   Colunas: 11
   Caminho: data/silver/surf

3. 📊 WIND
   Registros: 26,496
   Colunas: 7
   Caminho: data/silver/wind

4. 📊 SUNLIGHT
   Registros: 1,096
   Colunas: 17
   Caminho: data/silver/sunlight

5. 📊 TIDES
   Registros: 32,885
   Colunas: 8
   Caminho: data/silver/tides

6. 📊 METEOMATICS
   Registros: 26,305
   Colunas: 9
   Caminho: data/silver/meteomatics

7. 📊 SWELLS
   Registros: 26,304
   Colunas: 35
   Caminho: data/silver/swells

🏄‍♂️ DADOS PRONTOS PARA ANÁLISES DE SURF!
📈 PRÓXIMOS PASSOS: Criar a camada GOLD com métricas de negócio

📊 ESTATÍSTICAS GERAIS:
   Total de tabelas: 7
   Total de registros: 165,694
   Total de colunas: 91
   Período dos dados: 2020-01-01 até 2022-12-31
  

# 🏗️ ARQUITETURA DE DADOS: Silver vs Gold
## Por que não fizemos Star Schema na Silver?

### ✅ **SILVER LAYER (O que fizemos - CORRETO!)**
- **Objetivo**: Dados limpos e padronizados por fonte
- **Estrutura**: Uma tabela por sistema source (7 tabelas)
- **Características**:
  - Dados normalizados e tipados
  - Limpeza de dados aplicada
  - Mantém granularidade original
  - "Source-centric" (centrado na fonte)

### 🌟 **GOLD LAYER (Próximo passo - Star Schema!)**
- **Objetivo**: Dados modelados para análise de negócio
- **Estrutura**: Star Schema otimizado para BI
- **Modelo sugerido**:
  - `fact_surf_conditions` (tabela fato com métricas horárias)
  - `dim_date` (calendário, estações, feriados)
  - `dim_location` (praia, região, coordenadas)
  - `dim_weather` (categorias climáticas)
  - `dim_swell` (classificações de ondulação)
  
### 🎯 **Por que essa separação?**
1. **Silver**: Facilita manutenção e troubleshooting por fonte
2. **Gold**: Otimizado para consultas analíticas e BI
3. **Flexibilidade**: Podemos criar múltiplos marts na Gold para diferentes casos de uso

# 🌟 PROPOSTA: Modelo Star Schema para Camada GOLD

## 📊 **FACT_SURF_CONDITIONS** (Tabela Fato Principal)
```sql
-- Granularidade: 1 registro por hora
-- Fonte: JOIN de todas as 7 tabelas Silver
COLUMNS:
- datetime_utc (PK)
- location_key (FK -> dim_location)
- date_key (FK -> dim_date) 
- weather_key (FK -> dim_weather)
- swell_key (FK -> dim_swell)

-- MÉTRICAS DE SURF
- wave_height_avg_m
- wave_height_max_m  
- rating_score_normalized
- primary_swell_height_m
- primary_swell_period_s
- tide_height_m
- wind_speed_kph
- temperature_celsius
- daylight_duration_hours

-- MÉTRICAS CALCULADAS
- surf_quality_index (0-100)
- optimal_surf_flag (boolean)
- session_potential_score
```

## 🗓️ **DIM_DATE** (Dimensão Temporal)
```sql
COLUMNS:
- date_key (PK)
- full_date
- year, month, day
- quarter, week_of_year
- day_of_week, day_name
- is_weekend, is_holiday
- season (hemisfério sul)
- surf_season_category
```

## 🌊 **DIM_SWELL** (Dimensão Ondulação)
```sql
COLUMNS:
- swell_key (PK)
- primary_height_category
- primary_direction_category  
- period_range_category
- swell_combination_type
- surf_potential_rating
```

## 🌤️ **DIM_WEATHER** (Dimensão Clima)
```sql
COLUMNS:
- weather_key (PK)
- wind_condition_category
- temperature_category
- pressure_category
- weather_combination_rating
```

# 🎯 **VANTAGENS DA NOSSA ABORDAGEM**

## ✅ **Silver Layer (Source-Centric)**
- **Troubleshooting**: Fácil identificar problemas por fonte
- **Manutenção**: Updates independentes por sistema
- **Auditoria**: Rastreabilidade completa dos dados
- **Flexibilidade**: Base sólida para múltiplos modelos Gold

## 🌟 **Gold Layer (Business-Centric)**  
- **Performance**: Otimizado para consultas analíticas
- **Usabilidade**: Modelo intuitivo para analistas
- **BI Ready**: Direto para Tableau, Power BI, etc.
- **Métricas**: KPIs de negócio pré-calculados

---

# 🚀 **PRÓXIMOS PASSOS**

## 1. **Criar notebook `gold.ipynb`**
- Implementar o Star Schema proposto
- JOINs das 7 tabelas Silver
- Cálculo de métricas de negócio

## 2. **Métricas de Surf a Desenvolver**
- 🏄‍♂️ **Surf Quality Index**: Score 0-100 baseado em múltiplas variáveis
- ⭐ **Session Rating**: Classificação da sessão (Ruim/Bom/Épico)
- 🌊 **Optimal Conditions**: Flag para condições ideais
- 📊 **Trend Analysis**: Identificação de padrões sazonais

## 3. **Data Marts Especializados**
- `mart_daily_summary`: Resumos diários
- `mart_surf_forecast`: Dados para modelo de ML
- `mart_analytics_dashboard`: Dados para BI

**Quer que eu crie o notebook Gold agora?** 🏄‍♂️