<a href="https://colab.research.google.com/github/anscch/ATPAD-UNAM/blob/main/METEO_ATPAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#***METEO_ATPAD***

#***DATASET CONSTRUCTION***

The Parquet files contain all the data from each station. These files will need to be updated as new information becomes available. Updated versions can be built using PREP_ATPAT; the new data must be placed in the METEO_RAW directory.

The first cleaning steps are carried out after concatenating all stations' files into a single dataset, since all stations report the same parameters.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
# DuckDB connection used further down to run the SQL validation query over the
# pandas DataFrames; no database path is given, so DuckDB uses its default in-memory database.
connection = duckdb.connect()
import os
import datetime
import calendar

# Hourly meteorological records, one Parquet file per station.
mor_data_raw = pd.read_parquet(
    '/content/drive/MyDrive/ATPAD_COLAB/PARQUET_FILES/RUOA_MORE_1h_Meteo_2024.parquet'
).reset_index(drop=True)
sal_data_raw = pd.read_parquet(
    '/content/drive/MyDrive/ATPAD_COLAB/PARQUET_FILES/RUOA_SLLO_1h_Meteo_2024.parquet'
).reset_index(drop=True)
agu_data_raw = pd.read_parquet(
    '/content/drive/MyDrive/ATPAD_COLAB/PARQUET_FILES/RUOA_AGSC_1h_Meteo_2024.parquet'
).reset_index(drop=True)

# Fill the gaps in each station's timestamps: reindex on a complete hourly range
# between the first and last record, so missing hours become all-NaN rows.
dfs = []
for station_raw in (mor_data_raw, sal_data_raw, agu_data_raw):
    hourly_index = pd.date_range(
        start=station_raw['Time'].min(),
        end=station_raw['Time'].max(),
        freq='h',
    )
    dfs.append(station_raw.set_index('Time').reindex(hourly_index))

# Tag every row with its station so the frames can be stacked.
for station_df, station_name in zip(dfs, ('Morelia', 'Saltillo', 'Aguascalientes')):
    station_df['region'] = station_name

# All regions together. The reindexed index has no name, so reset_index() exposes
# it as a column called 'index', which is renamed back to 'Time'.
meteo_raw = (
    pd.concat(dfs)
    .reset_index(drop=False)
    .rename(columns={'index': 'Time'})
)
meteo_raw

Unnamed: 0,Time,Temp_Avg,RH_Avg,WSpeed_Avg,WSpeed_Max,WDir_Avg,Rain_Tot,Press_Avg,Rad_Avg,region
0,2015-08-01 00:00:00,,,,,,,,,Morelia
1,2015-08-01 01:00:00,,,,,,,,,Morelia
2,2015-08-01 02:00:00,,,,,,,,,Morelia
3,2015-08-01 03:00:00,,,,,,,,,Morelia
4,2015-08-01 04:00:00,,,,,,,,,Morelia
...,...,...,...,...,...,...,...,...,...,...
248494,2024-11-10 23:00:00,21.63,37.71,1.317,3.25,3.0,0.0,817.400,-1.165,Aguascalientes
248495,2024-11-11 00:00:00,20.20,42.13,1.414,3.34,8.6,0.0,817.702,-0.268,Aguascalientes
248496,2024-11-11 01:00:00,19.39,44.20,2.281,4.22,28.3,0.0,817.713,-0.553,Aguascalientes
248497,2024-11-11 02:00:00,19.43,43.02,1.840,3.55,39.6,0.0,817.628,-0.272,Aguascalientes


#***PARAMETERS FOR VALIDATION***

Statistics are calculated by region for further data validation and outlier detection.

In [None]:
from scipy import stats

def q005(arr):
    """Return the 0.005 quantile (0.5th percentile) of ``arr``."""
    lower_tail = arr.quantile(q=0.005)
    return lower_tail

def q01(arr):
    """Return the 0.01 quantile (1st percentile) of ``arr``."""
    lower_tail = arr.quantile(q=0.01)
    return lower_tail

def q995(arr):
    """Return the 0.995 quantile (99.5th percentile) of ``arr``.

    ``arr`` must expose a pandas-style ``quantile`` method (e.g. a Series).
    This helper was previously defined twice with identical bodies; a single
    definition is kept so there is no silent shadowing.
    """
    return arr.quantile(0.995)

def q98(arr):
    """Return the 0.98 quantile (98th percentile) of ``arr``."""
    upper_tail = arr.quantile(q=0.98)
    return upper_tail

def q99(arr):
    """Return the 0.99 quantile (99th percentile) of ``arr``."""
    upper_tail = arr.quantile(q=0.99)
    return upper_tail


# Per-region reference statistics used by the validation step:
# output column name -> (source column, aggregation).
validation_stats = {
    'WSpeed_AVG_qh': ('WSpeed_Avg', q99),   # upper quantile of mean wind speed
    'WSpeed_Max_qh': ('WSpeed_Max', q98),   # upper quantile of wind gusts
    'press_avg': ('Press_Avg', 'mean'),
    'press_std': ('Press_Avg', 'std'),
    'wspeed_avg': ('WSpeed_Avg', 'mean'),
    'wspeed_std': ('WSpeed_Avg', 'std'),
    'temp_avg': ('Temp_Avg', 'mean'),
    'temp_std': ('Temp_Avg', 'std'),
}

region_groups = meteo_raw.groupby('region')
dft = region_groups.agg(**validation_stats).reset_index()

dft

Unnamed: 0,region,WSpeed_AVG_qh,WSpeed_Max_qh,press_avg,press_std,wspeed_avg,wspeed_std,temp_avg,temp_std
0,Aguascalientes,6.18982,11.57,816.784258,2.371658,2.431338,1.209265,19.263423,5.737975
1,Morelia,5.11529,10.8,806.024384,1.955767,1.684273,1.22846,17.619809,5.023288
2,Saltillo,7.5275,13.48,823.902007,2.970401,3.102591,7.03704,18.29896,6.17848


#***CLEANING DATASET***

Z-scores are calculated for each data point in meteo_raw using the per-region statistics stored in the dft DataFrame. The validation criteria are then evaluated, and any value that fails its criterion is replaced with a null.

In [None]:
# Validate every measurement against its region's statistics and blank out
# (set to NULL) any value that fails its criterion. Rows are never dropped, and
# values that are already NULL stay NULL.
#
# The CTE selects meteo_raw.* plus only the one dft column the outer query needs.
# A plain `select *` over the join would also carry dft's region / temp_avg /
# press_avg / wspeed_avg, whose names collide (case-insensitively) with
# meteo_raw's columns, so the unqualified references in the outer query would
# depend on how duplicate names are resolved.
meteo_clean = connection.execute('''
with qz as (
    select
        meteo_raw.*,
        dft.WSpeed_Max_qh,
        (meteo_raw.Press_Avg - dft.press_avg)/dft.press_std as press_zvalue,
        (meteo_raw.Temp_Avg - dft.temp_avg)/dft.temp_std as temp_zvalue,
        (meteo_raw.WSpeed_Avg - dft.wspeed_avg)/dft.wspeed_std as wspeed_zvalue

    from meteo_raw

    left join dft on dft.region = meteo_raw.region)

select
    Time,
    region,

    -- temperature: within 3 standard deviations of the regional mean
    CASE when
        (
            (temp_zvalue between -3 and 3) or (Temp_Avg is NULL)
        )
        then Temp_Avg else NULL
    END as Temp_Avg,

    -- mean wind speed: within 4 standard deviations of the regional mean
    CASE when
        (
            (wspeed_zvalue between -4 and 4) or (WSpeed_AVG is NULL)
        )
        then WSpeed_AVG else NULL
    END as WSpeed_AVG,

    -- wind gust: non-negative and not above the regional upper quantile
    CASE when
        (
            (WSpeed_Max between 0 and WSpeed_Max_qh) or (WSpeed_Max is NULL)
        )
        then WSpeed_Max else NULL
    END as WSpeed_Max,

    -- rain: non-negative
    CASE when
        (
            (Rain_Tot >= 0) or (Rain_Tot is NULL)
        )
        then Rain_Tot else NULL
    END as Rain_Tot,

    -- pressure: within 3 standard deviations of the regional mean
    CASE when
        (
            (press_zvalue between -3 and 3) or (Press_Avg is NULL)
        )
        then Press_Avg else NULL
    END as Press_Avg,

    -- radiation: strictly positive (zero and negative readings become NULL)
    CASE when
        (
            (Rad_Avg > 0.001) or (Rad_Avg is NULL)
        )
        then Rad_Avg else NULL
    END as Rad_Avg,

    -- relative humidity: between 1 and 99
    CASE when
        (
            (RH_Avg between 1 and 99) or (RH_Avg is NULL)
        )
        then RH_Avg else NULL
    END as RH_Avg,

    -- wind direction: between 0.001 and 359.999 degrees
    CASE when
        (
            (WDir_AVG between 0.001 and 359.999) or (WDir_AVG is NULL)
        )
        then WDir_AVG else NULL
    END as WDir_AVG

from qz
''').df()

meteo_clean

Unnamed: 0,Time,region,Temp_Avg,WSpeed_AVG,WSpeed_Max,Rain_Tot,Press_Avg,Rad_Avg,RH_Avg,WDir_AVG
0,2015-08-01 00:00:00,Morelia,,,,,,,,
1,2015-08-01 01:00:00,Morelia,,,,,,,,
2,2015-08-01 02:00:00,Morelia,,,,,,,,
3,2015-08-01 03:00:00,Morelia,,,,,,,,
4,2015-08-01 04:00:00,Morelia,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
248494,2024-11-10 23:00:00,Aguascalientes,21.63,1.317,3.25,0.0,817.400,,37.71,3.0
248495,2024-11-11 00:00:00,Aguascalientes,20.20,1.414,3.34,0.0,817.702,,42.13,8.6
248496,2024-11-11 01:00:00,Aguascalientes,19.39,2.281,4.22,0.0,817.713,,44.20,28.3
248497,2024-11-11 02:00:00,Aguascalientes,19.43,1.840,3.55,0.0,817.628,,43.02,39.6




**SORTING DATA BY DATE**

Finally, the data is sorted by date. Pay attention to the region column, since each timestamp is repeated once per region. If you want to save this DataFrame as a CSV file, run the last section, SAVING CLEANED DATASET AS CSV FILE.


In [None]:
# Sort chronologically. Every timestamp occurs once per region, so 'region' is
# used as a tie-break: sorting on 'Time' alone leaves the rows that share a
# timestamp in an arbitrary order. The original row index is kept.
meteo_clean = meteo_clean.sort_values(by=['Time', 'region'])

meteo_clean


Unnamed: 0,Time,region,Temp_Avg,WSpeed_AVG,WSpeed_Max,Rain_Tot,Press_Avg,Rad_Avg,RH_Avg,WDir_AVG
126015,2015-05-01 00:00:00,Aguascalientes,,,,,,,,
161228,2015-05-01 00:00:00,Saltillo,,,,,,,,
126016,2015-05-01 01:00:00,Aguascalientes,,,,,,,,
161229,2015-05-01 01:00:00,Saltillo,,,,,,,,
126017,2015-05-01 02:00:00,Aguascalientes,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
126013,2024-11-11 01:00:00,Saltillo,13.49,2.295,3.85,0.0,825.039,,86.58,168.9
248496,2024-11-11 01:00:00,Aguascalientes,19.39,2.281,4.22,0.0,817.713,,44.20,28.3
248497,2024-11-11 02:00:00,Aguascalientes,19.43,1.840,3.55,0.0,817.628,,43.02,39.6
126014,2024-11-11 02:00:00,Saltillo,12.98,1.680,3.09,0.0,824.802,,79.42,124.2


#***SPLIT DATA BY REGION***

Uncomment the line for the region you want.

In [None]:
# Keep the rows of a single station: leave exactly one of these lines active.
meteo_clean = meteo_clean.loc[meteo_clean['region'].eq('Morelia')]
# meteo_clean = meteo_clean.loc[meteo_clean['region'].eq('Saltillo')]
# meteo_clean = meteo_clean.loc[meteo_clean['region'].eq('Aguascalientes')]

meteo_clean


Unnamed: 0,Time,region,Temp_Avg,WSpeed_AVG,WSpeed_Max,Rain_Tot,Press_Avg,Rad_Avg,RH_Avg,WDir_AVG
0,2015-08-01 00:00:00,Morelia,,,,,,,,
1,2015-08-01 01:00:00,Morelia,,,,,,,,
2,2015-08-01 02:00:00,Morelia,,,,,,,,
3,2015-08-01 03:00:00,Morelia,,,,,,,,
4,2015-08-01 04:00:00,Morelia,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
161223,2024-11-10 15:00:00,Morelia,25.56,3.455,8.380,0.0,804.232,779.620,32.8,32.5
161224,2024-11-10 16:00:00,Morelia,26.17,3.771,8.220,0.0,803.846,590.218,31.0,33.6
161225,2024-11-10 17:00:00,Morelia,26.30,3.392,7.506,0.0,803.720,350.195,31.1,30.7
161226,2024-11-10 18:00:00,Morelia,25.69,1.824,5.622,0.0,804.013,102.876,32.6,39.2


#***SELECT SPECIFIC TIME PERIOD***

Insert specific start and end dates, including the time, in the format YYYY-mm-dd HH:MM:SS.

In [None]:
# Bounds of the analysis window; both ends are included.
start_date = '2016-01-01 00:00:00'
end_date = '2023-12-31 23:00:00'

# Boolean mask of the rows whose timestamp falls inside [start_date, end_date].
period = meteo_clean['Time'].between(start_date, end_date)

meteo_clean = meteo_clean.loc[period]

meteo_clean

Unnamed: 0,Time,region,Temp_Avg,WSpeed_AVG,WSpeed_Max,Rain_Tot,Press_Avg,Rad_Avg,RH_Avg,WDir_AVG
5720,2016-01-01 00:00:00,Morelia,14.87,0.744,,0.0,806.0,,79.24,162.9
5721,2016-01-01 01:00:00,Morelia,13.91,0.705,,0.0,806.0,,83.08,79.8
5722,2016-01-01 02:00:00,Morelia,13.02,0.805,,0.0,805.0,,87.56,141.6
5723,2016-01-01 03:00:00,Morelia,12.20,0.779,,0.0,805.0,,90.83,150.9
5724,2016-01-01 04:00:00,Morelia,11.70,0.949,,0.0,805.0,,92.28,199.7
...,...,...,...,...,...,...,...,...,...,...
147523,2023-12-31 19:00:00,Morelia,,,,,,,,
147524,2023-12-31 20:00:00,Morelia,,,,,,,,
147525,2023-12-31 21:00:00,Morelia,,,,,,,,
147526,2023-12-31 22:00:00,Morelia,,,,,,,,


#***24h AVERAGE AND DIURNAL DATA***

#24-hour Average

The calculation considers a completeness criterion to preserve data representativeness. The 24-hour mean is calculated only if there is a minimum of 18 hours of data available for each day. Additionally, a special method is applied to calculate the mean wind direction, accounting for its circular nature. The results are stored in a new DataFrame called daily_mean, which can be saved as a CSV file using the code in the next cell.

#Diurnal Data
For better data representation, the diurnal values are computed as the median of each parameter.


In [None]:
# 24h average

# Add a date-only column used as the daily grouping key. assign() returns a new
# DataFrame, so nothing is written into a slice of an earlier frame (the in-place
# .loc write here raised SettingWithCopyWarning).
meteo_clean = meteo_clean.assign(Date=meteo_clean.Time.dt.date)

def calculate_daily_mean(group, is_circular=False, min_valid=18):
  """Mean of one day's values, or NaN when the day is too incomplete.

  Parameters
  ----------
  group : pandas.Series
      Values of a single variable for one day; NaN marks missing hours.
  is_circular : bool, default False
      Treat the values as angles in degrees (e.g. wind direction) and average
      them through their sine and cosine components.
  min_valid : int, default 18
      Minimum number of non-missing values required to compute the mean.

  Returns
  -------
  float
      The mean rounded to 2 decimals (circular means fall in [0, 360)), or NaN
      when fewer than ``min_valid`` values are present.
  """
  # Completeness criterion: too few valid values means no daily value.
  if group.notna().sum() < min_valid:
    return np.nan

  if not is_circular:
    return np.round(group.mean(), 2)

  # Circular mean: average the unit-vector components, then recover the angle.
  rad = np.deg2rad(group)
  mean_angle = np.arctan2(np.sin(rad).mean(), np.cos(rad).mean())
  # arctan2 yields an angle in [-180, 180] degrees; round first and wrap
  # afterwards so the result stays between 0 and 360 and can never be 360.0.
  return np.round(np.rad2deg(mean_angle), 2) % 360

def custom_mean(group):
  """Build the daily summary of one (region, date) group as a dict.

  Every variable goes through calculate_daily_mean; wind direction is the only
  one averaged as a circular quantity.
  NOTE(review): Rain_Tot is averaged like the other variables, not summed -
  confirm this is the intended daily rain statistic.
  """
  linear_columns = (
      'Temp_Avg',
      'WSpeed_AVG',
      'WSpeed_Max',
      'Rain_Tot',
      'Press_Avg',
      'Rad_Avg',
      'RH_Avg',
  )
  daily = {column: calculate_daily_mean(group[column]) for column in linear_columns}
  daily['WDir_AVG'] = calculate_daily_mean(group['WDir_AVG'], is_circular=True)
  return daily

# Daily summary per region and date. Only the measurement columns are handed to
# custom_mean: applying over the whole frame also passes the grouping columns to
# the function, which recent pandas versions warn about (deprecated behaviour).
value_columns = ['Temp_Avg', 'WSpeed_AVG', 'WSpeed_Max', 'Rain_Tot',
                 'Press_Avg', 'Rad_Avg', 'RH_Avg', 'WDir_AVG']

daily_mean = (
    meteo_clean.groupby(['region', 'Date'])[value_columns]
    .apply(custom_mean)     # one dict of daily values per (region, Date)
    .apply(pd.Series)       # expand the dicts into one column per variable
)

print(daily_mean)

# hourly average (output: 24 lines) diurnal cycles taking the median for better representation

# meteo_clean = meteo_clean.groupby(['region', meteo_clean['Time'].dt.hour]).median().reset_index(drop=True)

# The helper Date column is only needed for the daily grouping; remove it again
# so meteo_clean keeps its original columns.
meteo_clean = meteo_clean.drop(columns='Date')
meteo_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meteo_clean.loc[:,'Date'] = meteo_clean.Time.dt.date   # Create column with dates only
  daily_mean = meteo_clean.groupby(['region', 'Date']).apply(custom_mean).apply(pd.Series)


                    Temp_Avg  WSpeed_AVG  WSpeed_Max  Rain_Tot  Press_Avg  \
region  Date                                                                
Morelia 2016-01-01     16.96        1.32         NaN       0.0     806.33   
        2016-01-02     16.12        1.85         NaN       0.0     807.04   
        2016-01-03     16.40        1.38         NaN       0.0     806.29   
        2016-01-04     14.08        1.11         NaN       0.0     805.50   
        2016-01-05     13.90        1.79         NaN       0.0     804.92   
...                      ...         ...         ...       ...        ...   
        2023-10-27     17.82        1.08        2.92       0.2     805.27   
        2023-10-28     18.43        0.78        2.72       0.0     805.51   
        2023-10-29     17.98        0.71        2.44       0.0     804.24   
        2023-10-30     17.82        1.03        3.00       0.0     804.15   
        2023-10-31       NaN         NaN         NaN       NaN        NaN   

Unnamed: 0,Time,region,Temp_Avg,WSpeed_AVG,WSpeed_Max,Rain_Tot,Press_Avg,Rad_Avg,RH_Avg,WDir_AVG
5720,2016-01-01 00:00:00,Morelia,14.87,0.744,,0.0,806.0,,79.24,162.9
5721,2016-01-01 01:00:00,Morelia,13.91,0.705,,0.0,806.0,,83.08,79.8
5722,2016-01-01 02:00:00,Morelia,13.02,0.805,,0.0,805.0,,87.56,141.6
5723,2016-01-01 03:00:00,Morelia,12.20,0.779,,0.0,805.0,,90.83,150.9
5724,2016-01-01 04:00:00,Morelia,11.70,0.949,,0.0,805.0,,92.28,199.7
...,...,...,...,...,...,...,...,...,...,...
146059,2023-10-31 19:00:00,Morelia,,,,,,,,
146060,2023-10-31 20:00:00,Morelia,,,,,,,,
146061,2023-10-31 21:00:00,Morelia,,,,,,,,
146062,2023-10-31 22:00:00,Morelia,,,,,,,,


#***SAVING CLEANED DATASET AS CSV FILE***

In [None]:
# Timestamp used in both file names: the current clock time shifted back six hours.
# NOTE(review): presumably this converts a UTC host clock to local (UTC-6) time - confirm.
shifted_now = datetime.datetime.now() - datetime.timedelta(hours=6)
date_now = shifted_now.strftime('%Y-%m-%d %H:%M')

file_name = 'meteo_clean_' + date_now + '.csv'

# Hourly cleaned dataset (row index included).
meteo_clean.to_csv('/content/drive/MyDrive/ATPAD_COLAB/OUTPUT_FILES/' + file_name)

# 24-hour means, indexed by region and date.
daily_mean.to_csv('/content/drive/MyDrive/ATPAD_COLAB/OUTPUT_FILES/daily_mean_meteo_' + date_now + '.csv')