## Processamento de dados coletados

### Teste local

In [115]:
# == importando bibliotecas ==

import pandas as pd
import json
from datetime import timedelta

In [116]:
# == opening the json file ==

# Decode explicitly as UTF-8 (the standard JSON encoding) so the parsed text
# does not depend on the platform's default locale encoding.
with open('sao-carlos-2023-10-15-15-30.json', encoding='utf-8') as file:
    data = json.load(file)

In [117]:
# == building the "real" and "forecast" dataframes ==

results = data['results']
forecast = results['forecast']

# Output column name -> key read from the current-conditions payload.
real_fields = {
    'date': 'date',
    'time': 'time',
    'temperature': 'temp',
    'description': 'description',
    'humidity': 'humidity',
    'cloudiness': 'cloudiness',
    'rain': 'rain',
    'wind_speedy': 'wind_speedy',
}
# Single-row frame: each value is wrapped in a one-element list.
df_real = pd.DataFrame(
    {column: [results[key]] for column, key in real_fields.items()}
)

# Output column name -> key read from each entry of the forecast list.
forecast_fields = {
    'max_temperature': 'max',
    'min_temperature': 'min',
    'cloudiness': 'cloudiness',
    'rain': 'rain',
    'rain_probability': 'rain_probability',
    'wind_speedy': 'wind_speedy',
    'description': 'description',
    'condition': 'condition',
}
forecast_columns = {
    # Scalars: the same collection date is repeated on every forecast row.
    'real_date': results['date'],
    'date': results['date'],
}
forecast_columns.update(
    (column, [day[key] for day in forecast])
    for column, key in forecast_fields.items()
)
df_forecast = pd.DataFrame(forecast_columns)

In [118]:
# == cleaning up the wind_speedy columns ==

# Keep only the numeric part of each wind_speedy string. The fractional part
# is optional so that integer readings are captured as well instead of
# silently turning into NaN. expand=False makes extract() return a Series.
wind_speed_pattern = r'(\d+(?:\.\d+)?)'
df_real['wind_speedy'] = df_real['wind_speedy'].str.extract(wind_speed_pattern, expand=False)
df_forecast['wind_speedy'] = df_forecast['wind_speedy'].str.extract(wind_speed_pattern, expand=False)

In [119]:
# == converting column types ==

# Current conditions: join the date and time strings into one timestamp
# column (day comes first in the source strings), then cast the numerics.
timestamp_text = df_real['date'] + ' ' + df_real['time']
df_real['datetime'] = pd.to_datetime(timestamp_text, dayfirst=True)
for column in ('temperature', 'humidity', 'wind_speedy'):
    df_real[column] = df_real[column].astype(float)

# Forecast: both date columns are day/month/year strings.
for column in ('real_date', 'date'):
    df_forecast[column] = pd.to_datetime(df_forecast[column], format='%d/%m/%Y')
for column in ('max_temperature', 'min_temperature', 'rain_probability', 'wind_speedy'):
    df_forecast[column] = df_forecast[column].astype(float)

In [120]:
# == fixing the date column ==

# Every forecast row starts out with the same date; row i must become that
# date plus i days. Done as a single vectorised assignment instead of a
# per-row .loc loop. The guard avoids an IndexError from iloc[0] when the
# forecast frame has no rows.
if not df_forecast.empty:
    first_date = df_forecast['date'].iloc[0]
    day_offsets = pd.to_timedelta(list(range(len(df_forecast))), unit='D')
    df_forecast['date'] = first_date + day_offsets

In [121]:
# == dropping columns that are no longer needed ==

# 'date' and 'time' were already combined into the 'datetime' column.
df_real.drop(labels=['date', 'time'], axis='columns', inplace=True)

In [122]:
df_real

Unnamed: 0,temperature,description,humidity,cloudiness,rain,wind_speedy,datetime
0,28.0,Tempo limpo,52.0,10.0,0.0,3.3,2023-10-15 12:09:00


In [123]:
df_forecast

Unnamed: 0,real_date,date,max_temperature,min_temperature,cloudiness,rain,rain_probability,wind_speedy,description,condition
0,2023-10-15,2023-10-15,30.0,17.0,6.0,1.23,73.0,5.19,Chuvas esparsas,rain
1,2023-10-15,2023-10-16,33.0,19.0,0.0,0.62,66.0,5.18,Chuvas esparsas,rain
2,2023-10-15,2023-10-17,29.0,21.0,100.0,0.15,32.0,3.98,Chuvas esparsas,rain
3,2023-10-15,2023-10-18,33.0,18.0,17.0,9.13,98.0,9.64,Chuva,rain
4,2023-10-15,2023-10-19,28.0,17.0,57.0,5.76,99.0,8.49,Chuvas esparsas,rain
5,2023-10-15,2023-10-20,24.0,16.0,100.0,2.0,98.0,3.62,Chuvas esparsas,rain
6,2023-10-15,2023-10-21,31.0,16.0,0.0,0.0,0.0,6.13,Tempo limpo,clear_day
7,2023-10-15,2023-10-22,34.0,17.0,0.0,0.0,0.0,4.38,Tempo limpo,clear_day
8,2023-10-15,2023-10-23,37.0,21.0,65.0,0.23,30.0,6.12,Chuvas esparsas,rain
9,2023-10-15,2023-10-24,30.0,20.0,85.0,15.83,81.0,4.1,Chuva,rain


### Cloud Function

In [None]:
# REQUIREMENTS
# functions-framework==3.*
# pandas==2.0.2
# gcsfs==2023.6.0

In [None]:
import functions_framework
import pandas as pd
import json
from datetime import timedelta
from google.cloud import storage

# Numeric part of a wind_speedy string. The fractional part is optional so
# that integer readings are captured too instead of becoming NaN.
_WIND_SPEED_PATTERN = r'(\d+(?:\.\d+)?)'


def _build_weather_frames(data):
    """Build the cleaned "real" and "forecast" DataFrames from the payload.

    Args:
        data: Parsed JSON document. Reads ``data['results']`` (keys ``date``,
            ``time``, ``temp``, ``description``, ``humidity``, ``cloudiness``,
            ``rain``, ``wind_speedy``) and the list ``results['forecast']``
            (keys ``max``, ``min``, ``cloudiness``, ``rain``,
            ``rain_probability``, ``wind_speedy``, ``description``,
            ``condition`` on each entry).

    Returns:
        Tuple ``(df_real, df_forecast)``: a one-row frame with the current
        conditions and a frame with one row per forecast entry.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    results = data['results']
    forecast = results['forecast']

    df_real = pd.DataFrame({
        'date': [results['date']],
        'time': [results['time']],
        'temperature': [results['temp']],
        'description': [results['description']],
        'humidity': [results['humidity']],
        'cloudiness': [results['cloudiness']],
        'rain': [results['rain']],
        'wind_speedy': [results['wind_speedy']],
    })

    df_forecast = pd.DataFrame({
        # Scalars: the collection date is repeated on every forecast row.
        'real_date': results['date'],
        'date': results['date'],
        'max_temperature': [item['max'] for item in forecast],
        'min_temperature': [item['min'] for item in forecast],
        'cloudiness': [item['cloudiness'] for item in forecast],
        'rain': [item['rain'] for item in forecast],
        'rain_probability': [item['rain_probability'] for item in forecast],
        'wind_speedy': [item['wind_speedy'] for item in forecast],
        'description': [item['description'] for item in forecast],
        'condition': [item['condition'] for item in forecast],
    })

    # == wind_speedy: keep only the number, then cast ==
    for frame in (df_real, df_forecast):
        frame['wind_speedy'] = (
            frame['wind_speedy']
            .str.extract(_WIND_SPEED_PATTERN, expand=False)
            .astype(float)
        )

    # == types of columns ==
    df_real['datetime'] = pd.to_datetime(
        df_real['date'] + ' ' + df_real['time'], dayfirst=True
    )
    for column in ('temperature', 'humidity'):
        df_real[column] = df_real[column].astype(float)
    for column in ('max_temperature', 'min_temperature', 'rain_probability'):
        df_forecast[column] = df_forecast[column].astype(float)
    df_forecast['real_date'] = pd.to_datetime(
        df_forecast['real_date'], format='%d/%m/%Y'
    )

    # == date: row i is the collection date plus i days ==
    # Vectorised, and derived from 'real_date' (which holds the same value the
    # 'date' column was filled with), so no iloc[0] lookup is needed and an
    # empty forecast list does not raise IndexError.
    day_offsets = pd.to_timedelta(list(range(len(df_forecast))), unit='D')
    df_forecast['date'] = df_forecast['real_date'] + day_offsets

    # == drop columns already merged into 'datetime' ==
    df_real = df_real.drop(columns=['date', 'time'])

    return df_real, df_forecast


def _csv_file_name(object_id):
    """Return the CSV file name for a bucket object path.

    Only a trailing ``.json`` extension is swapped for ``.csv``; a "json"
    substring elsewhere in the name is left untouched. Names without that
    extension simply get ``.csv`` appended.
    """
    base_name = object_id.rsplit('/', 1)[-1]
    if base_name.endswith('.json'):
        base_name = base_name[:-len('.json')]
    return base_name + '.csv'


# Triggered from a message on a Cloud Pub/Sub topic.
@functions_framework.cloud_event
def process_sao_carlos_weather(cloud_event=None):
    """Convert a raw weather JSON object into "real" and "forecast" CSV files.

    Reads the bucket and object names from the message attributes
    ``bucketId`` and ``objectId``, downloads and parses the JSON object, and
    writes two CSV files under ``silver/real/`` and ``silver/forecast/`` in
    the same bucket.

    Args:
        cloud_event: Event whose ``data['message']['attributes']`` mapping
            carries ``bucketId`` and ``objectId``.

    Returns:
        A ``{'Message': ...}`` dict when the object path does not contain
        ``bronze/sao-carlos``; otherwise ``None``.
    """
    attributes = cloud_event.data['message']['attributes']
    bucket_name = attributes['bucketId']
    obj = attributes['objectId']

    # If for some reason function is triggered with another file abort
    if 'bronze/sao-carlos' not in obj:
        print('Not the expected file')
        return {'Message': 'Not the expected file'}

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(obj)
    data = json.loads(blob.download_as_text())

    df_real, df_forecast = _build_weather_frames(data)

    # == save dataframes ==
    name_file = _csv_file_name(obj)
    df_real.to_csv(f'gs://{bucket_name}/silver/real/real-{name_file}', index=False)
    df_forecast.to_csv(f'gs://{bucket_name}/silver/forecast/forecast-{name_file}', index=False)


In [None]:
# gcloud storage buckets notifications create gs://dados-api --topic=process-data-api --event-types=OBJECT_FINALIZE --object-prefix=bronze/sao-carlos