In [4]:
# Basic setting for Jupyter_notebook to import utils
import os
import sys

notebook_path = os.path.abspath("")
project_root = os.path.abspath(os.path.join(notebook_path, "../../"))

sys.path.append(project_root)

In [6]:
import os
import sys
import pandas as pd
from utils import folder_utils
from tqdm import tqdm

In [7]:
def extract_data_to_df(country, data_folder, data_category, output_folder):
    # Specify the folder path
    input_folder_path = folder_utils.find_folder(
        country, data_folder, data_category, output_folder
    )
    # Initialize a dataframe to store all the data from the same country
    raw_df = pd.DataFrame()
    # Get the filenames of all CSV files under the folder except the station network file
    csv_files = [
        f
        for f in os.listdir(input_folder_path)
        if f.endswith('.csv') and "asos_station_network" not in f
    ]
    # Read and merge the csv files in queue
    for csv_file in tqdm(csv_files):
        csv_file_path = os.path.join(input_folder_path, csv_file)
        df = pd.read_csv(csv_file_path)
        raw_df = pd.concat([raw_df, df], ignore_index=True)

    return raw_df

In [37]:
def process_asos_rawdata(df):
    # Split "valid" column into "date" and "time" columns
    df['date'] = (
        df['valid'].str.split(' ', expand=True)[0].str.replace('-', '').astype(int)
    )
    df['time'] = df['valid'].str.split(' ', expand=True)[1]

    # Convert Fahrenheit to Celsius for "tmpf", "feel" and "dwpf" columns
    df['tmpf'] = (df['tmpf'] - 32) * 5 / 9
    df['tmpf'] = df['tmpf'].round(1)

    df['dwpf'] = (df['dwpf'] - 32) * 5 / 9
    df['dwpf'] = df['dwpf'].round(1)

    # Convert Fahrenheit to Celsius for "feel" column
    df['feel'] = (df['feel'] - 32) * 5 / 9
    df['feel'] = df['feel'].round(1)

    # Convert knots to m/s for "sknt" and "gust" columns
    df['sknt'] = df['sknt'] * 0.514444
    df['gust'] = df['gust'] * 0.514444

    # Convert inches to meters for "p01i" and "alti" columns
    df['p01i'] = pd.to_numeric(df['p01i'], errors='coerce') * 0.0254
    df['alti'] = df['alti'] * 0.0254

    # Convert millibar to Pa for "mslp" column
    df['mslp'] = df['mslp'] * 100

    # Drop columns
    columns_to_drop = [
        # 'lon',
        # 'lat',
        # 'elevation',
        'valid',
        'skyc1',
        'skyc2',
        'skyc3',
        'skyc4',
        'skyl1',
        'skyl2',
        'skyl3',
        'skyl4',
        'wxcodes',
        'ice_accretion_1hr',
        'ice_accretion_3hr',
        'ice_accretion_6hr',
        'peak_wind_gust',
        'peak_wind_drct',
        'peak_wind_time',
        'metar',
        'snowdepth',
    ]
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    return df

In [9]:
def save_asos_processed_data(
    processed_df, country, data_folder, data_category, output_folder
):
    output_directory = folder_utils.create_folder(
        country, data_folder, data_category, output_folder
    )
    output_filename = f"{country}_ASOS_processed_data.csv"
    output_filepath = os.path.join(output_directory, output_filename)
    processed_df.to_csv(output_filepath, index=False, encoding="utf-8")
    print(f'{output_filename} done!')

In [29]:
# Example usage

country = "GB"
data_folder = "data"
data_test_category = "test_data"
data_read_category = "raw_data"
data_save_category = "processed_data"
output_folder = "ASOS_DATA"

In [34]:
raw_df = extract_data_to_df(country, data_folder, data_read_category , output_folder)

  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
100%|█████████████████████████████████████████████| 112/112 [00:16<00:00,  6.65it/s]


In [35]:
raw_df

Unnamed: 0,station,valid,lon,lat,elevation,tmpf,dwpf,relh,drct,sknt,...,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,snowdepth
0,EGUN,2022-01-01 00:26,0.4864,52.3619,10.0,53.6,53.6,100.00,210.0,9.0,...,,,,,,,,53.6,EGUN 010026Z AUTO 21009KT 9999 BKN013 12/12 A3...,
1,EGUN,2022-01-01 00:56,0.4864,52.3619,10.0,54.0,54.0,100.00,210.0,10.0,...,,,,,,,,54.0,EGUN 010056Z AUTO 21010KT 9999 SCT012 BKN035 1...,
2,EGUN,2022-01-01 01:26,0.4864,52.3619,10.0,55.4,55.4,100.00,200.0,9.0,...,,,,,,,,55.4,EGUN 010126Z AUTO 20009KT 9999 SCT034 13/13 A3...,
3,EGUN,2022-01-01 01:56,0.4864,52.3619,10.0,54.7,54.7,100.00,210.0,11.0,...,,,,,,,,54.7,EGUN 010156Z AUTO 21011KT 9999 CLR 13/13 A3008...,
4,EGUN,2022-01-01 02:56,0.4864,52.3619,10.0,55.6,55.6,100.00,200.0,9.0,...,,,,,,,,55.6,EGUN 010256Z AUTO 20009KT 9999 CLR 13/13 A3007...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038806,EGKB,2023-08-01 19:20,0.0325,51.3308,183.0,60.8,57.2,87.92,210.0,11.0,...,,,,,,,,60.8,EGKB 011920Z 21011KT 170V240 9999 FEW015 16/14...,
2038807,EGKB,2023-08-01 19:50,0.0325,51.3308,183.0,60.8,55.4,82.37,210.0,10.0,...,,,,,,,,60.8,EGKB 011950Z 21010KT 9999 FEW015 16/13 Q1001,
2038808,EGKB,2023-08-01 20:20,0.0325,51.3308,183.0,59.0,55.4,87.83,200.0,11.0,...,,,,,,,,59.0,EGKB 012020Z 20011KT 9999 FEW025 15/13 Q1001,
2038809,EGKB,2023-08-01 20:50,0.0325,51.3308,183.0,59.0,55.4,87.83,210.0,11.0,...,RA,,,,,,,59.0,EGKB 012050Z 21011KT 9999 RA SCT050 15/13 Q1001,


In [38]:
processed_df = process_asos_rawdata(raw_df)

In [39]:
processed_df

Unnamed: 0,station,lon,lat,elevation,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,feel,date,time
0,EGUN,0.4864,52.3619,10.0,-11.1,-11.1,100.00,210.0,2.381874,0.0,0.764032,101890.0,6.21,,-11.1,20220101,00:26
1,EGUN,0.4864,52.3619,10.0,-11.0,-11.0,100.00,210.0,2.646526,0.0,0.764032,101890.0,6.21,,-11.0,20220101,00:56
2,EGUN,0.4864,52.3619,10.0,-10.6,-10.6,100.00,200.0,2.381874,0.0,0.763778,101860.0,6.21,,-10.6,20220101,01:26
3,EGUN,0.4864,52.3619,10.0,-10.8,-10.8,100.00,210.0,2.911179,0.0,0.764032,101890.0,6.21,,-10.8,20220101,01:56
4,EGUN,0.4864,52.3619,10.0,-10.5,-10.5,100.00,200.0,2.381874,0.0,0.763778,101860.0,6.21,,-10.5,20220101,02:56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038806,EGKB,0.0325,51.3308,183.0,-8.9,-10.0,87.92,210.0,2.911179,0.0,0.750824,,6.21,,-8.9,20230801,19:20
2038807,EGKB,0.0325,51.3308,183.0,-8.9,-10.6,82.37,210.0,2.646526,0.0,0.750824,,6.21,,-8.9,20230801,19:50
2038808,EGKB,0.0325,51.3308,183.0,-9.4,-10.6,87.83,200.0,2.911179,0.0,0.750824,,6.21,,-9.4,20230801,20:20
2038809,EGKB,0.0325,51.3308,183.0,-9.4,-10.6,87.83,210.0,2.911179,0.0,0.750824,,6.21,,-9.4,20230801,20:50


In [27]:
save_asos_processed_data(
    processed_df, country, data_folder, data_save_category, output_folder
)

Folder '/Users/ww721/JupyterNotebookPath/IRP_20220602/irp_ww721_bakcup/data/processed_data/ASOS_DATA/GB_ASOS_DATA' created successfully.
GB_ASOS_processed_data.csv done!


In [None]:
/Users/ww721/JupyterNotebookPath/IRP_20220602/irp_ww721_bakcup/data/test_data/ASOS_DATA
/Users/ww721/JupyterNotebookPath/IRP_20220602/irp_ww721_bakcup/data/test_data/ASOS_DATA/GB_ASOS_DATA'