In [30]:
import pandas as pd
import numpy as np
import os

In [31]:
def get_station_meta(station_number,file_prefix='hly'):
    """Read a station's header CSV and extract its coordinates.

    The file ``data/{file_prefix}{station_number}_header.csv`` is loaded with
    ``pd.read_csv`` and the first column is scanned for a cell containing
    ``'Latitude'``. That cell is expected to hold comma-separated
    ``Label:value`` fields, e.g. ``"Latitude:52.690  ,Longitude: -6.940"``.

    Parameters
    ----------
    station_number : int or str
        Station identifier used to build the header file name.
    file_prefix : str, default 'hly'
        File name prefix placed before the station number.

    Returns
    -------
    dict
        ``{'latitude': float, 'longitude': float}``. A key is absent when the
        header has no matching field, so callers should be ready for a missing
        key. If several rows match, the last one wins.

    Raises
    ------
    ValueError
        If a latitude/longitude value cannot be converted to ``float``.
    """
    # Header label -> key used in the returned dict.
    wanted_fields = {'Latitude': 'latitude', 'Longitude': 'longitude'}

    station_meta = dict()
    station_header_file = f'data/{file_prefix}{station_number}_header.csv'
    df_header = pd.read_csv(station_header_file)
    for row in df_header[df_header.columns[0]]:
        # Empty cells are read as NaN (a float); `in` on a float raises TypeError.
        if not isinstance(row, str) or 'Latitude' not in row:
            continue
        # Look fields up by label rather than by position so the result does
        # not depend on longitude being the last comma-separated field.
        for part in row.split(','):
            label, _, value = part.partition(':')
            key = wanted_fields.get(label.strip())
            if key is not None:
                # float() tolerates the surrounding whitespace in the value.
                station_meta[key] = float(value)
    return station_meta

In [88]:
# Collect the raw station files in data/. The processing loop below takes the
# first three characters of each name as the prefix and parses the remainder
# (minus '.gz.parquet') as an integer station number, so match the prefix and
# suffix exactly instead of by substring. os.listdir returns names in
# arbitrary order; sort them so every run processes files in the same order.
raw_data_files = sorted(
    file for file in os.listdir('data/')
    if file.startswith(('hly', 'dly')) and file.endswith('.gz.parquet')
)
# Uncomment to process a random sample of 15 files while debugging:
#raw_data_files = list(np.random.choice(raw_data_files,15))
#raw_data_files

In [None]:
# Aggregate every raw station file to one row per calendar date, tag the rows
# with the station number and coordinates, then combine all stations and write
# the result to data/daily_by_station.gz.parquet.
#
# Per-file frames are collected in a list and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies all accumulated rows on every iteration.
station_frames = []
rows_so_far = 0          # running row count, for the progress log only
columns_so_far = set()   # union of column names seen, for the progress log only
for file in raw_data_files:
    station_number = int(file.replace('.gz','').replace('.parquet','').replace('hly','').replace('dly',''))
    station_file = f'data/{file}'
    file_prefix = file[0:3]
    station_meta = get_station_meta(station_number,file_prefix)
    print(station_file)
    df = pd.read_parquet(station_file)
    print(df.shape)
    # only process data if we have some
    if len(df)>0:
        df = df.set_index(pd.to_datetime(df.index))
        # Prefix columns with 'hly'/'dly' so the two sources stay distinguishable.
        df.columns = [f'{file_prefix}_{col}' for col in df.columns.values]
        # Reduce timestamps to calendar dates so groupby yields one row per day.
        df = df.set_index(df.index.to_series().dt.date)
        # NOTE(review): pandas >= 2.1 emits a FutureWarning when NumPy callables
        # are passed to agg. Switching to the string names 'mean'/'min'/'max'/'std'
        # would silence it, but may rename the resulting columns (e.g. *_amin vs
        # *_min, depending on the NumPy version) - confirm downstream readers first.
        if file_prefix == 'dly':
            df = df.groupby(by=df.index).agg([np.mean])
        elif file_prefix == 'hly':
            df = df.groupby(by=df.index).agg([np.mean, np.min, np.max, np.std])
        # Flatten the (column, statistic) MultiIndex into 'column_statistic'.
        df.columns = ['_'.join(col).strip() for col in df.columns.values]
        df = df.dropna(how='all')
        df = df.round(2)
        df['station_number'] = station_number
        df['latitude'] = station_meta['latitude']
        df['longitude'] = station_meta['longitude']
        print(df.shape)
        station_frames.append(df)
        rows_so_far += len(df)
        columns_so_far.update(df.columns)
        # Same (rows, columns) shape the combined frame would have at this point.
        print((rows_so_far, len(columns_so_far)))
# sort=True keeps the combined columns in sorted order, as the per-file appends did.
df_processed = pd.concat(station_frames, sort=True) if station_frames else pd.DataFrame()
print(df_processed.shape)
# Collapse rows sharing the same (date, station_number) by taking the column-wise
# max; presumably this merges the hly and dly rows of one station - verify.
df_processed = df_processed.groupby([df_processed.index,'station_number']).max()
print(df_processed.shape)
df_processed.to_parquet('data/daily_by_station.gz.parquet',compression='gzip')
df_processed.head()

data/dly1024.gz.parquet
(27426, 2)
(27426, 5)
(27426, 5)
data/dly1042.gz.parquet
(24136, 2)
(24136, 5)
(51562, 5)
data/dly1043.gz.parquet
(16985, 2)
(16985, 5)
(68547, 5)
data/dly1075.gz.parquet
(19455, 23)
(19455, 26)
(88002, 26)
data/dly108.gz.parquet
(28579, 2)
(28579, 5)
(116581, 26)
data/dly1103.gz.parquet
(22767, 2)


In [64]:
#df_processed.index.to_series().dt.date