In [1]:
import pandas as pd
import numpy as np
import os

In [13]:
def get_station_meta(station_number):
    station_meta = dict()
    station_header_file = f'data/hly{station_number}_header.csv'
    df_header = pd.read_csv(station_header_file)
    for row in df_header[df_header.columns[0]]:
        if 'Latitude' in row:
            split_vals = row.split(',')
            station_meta['latitude'] = float(split_vals[0].replace('Latitude:',''))
            station_meta['longitude'] = float(split_vals[-1].replace('Longitude:',''))
    return station_meta

In [2]:
raw_data_files = [file for file in os.listdir('data/') if 'gz.parquet' in file and ('hly' in file or 'dly' in file)]
raw_data_files

['hly1075.gz.parquet',
 'hly1175.gz.parquet',
 'hly1275.gz.parquet',
 'hly1375.gz.parquet',
 'hly1475.gz.parquet',
 'hly175.gz.parquet',
 'hly1775.gz.parquet',
 'hly1875.gz.parquet',
 'hly1975.gz.parquet',
 'hly2075.gz.parquet',
 'hly2175.gz.parquet',
 'hly2275.gz.parquet',
 'hly2375.gz.parquet',
 'hly275.gz.parquet',
 'hly3723.gz.parquet',
 'hly375.gz.parquet',
 'hly3904.gz.parquet',
 'hly518.gz.parquet',
 'hly532.gz.parquet',
 'hly575.gz.parquet',
 'hly675.gz.parquet',
 'hly775.gz.parquet',
 'hly875.gz.parquet']

In [3]:
# get station info df
df_station_details = pd.read_csv('data/station_details.csv')
df_station_details = df_station_details.set_index('Station Number')

In [14]:
df_processed = pd.DataFrame()
for file in raw_data_files:
    print(df_processed.shape)
    station_number = int(file.replace('.gz','').replace('.parquet','').replace('hly','').replace('dly',''))
    station_file = f'data/{file}'
    station_meta = get_station_meta(station_number)
    print(station_file)
    df = pd.read_parquet(station_file,columns=['temp','rain','wdsp','rhum'])
    print(df.shape)
    df = df.dropna()
    df = df.set_index(pd.to_datetime(df.index))
    df = df.resample('1d').apply(['mean','max','min','std'])
    df = df.round(2)
    df.columns = ['_'.join(col).strip() for col in df.columns.values]
    df['station_number'] = station_number
    df['latitide'] = station_meta['latitude']
    df['longitude'] = station_meta['longitude']
    #df['latitide'] = df_station_details.iloc[station_number]['Latitude']
    #df['longitude'] = df_station_details.iloc[station_number]['Longitude']
    print(df.shape)
    df_processed = df_processed.append(df)
print(df_processed.shape)
df_processed.to_parquet('data/data_daily_by_station.gz.parquet',compression='gzip')

(0, 0)
data/hly1075.gz.parquet
(247992, 4)
(11139, 19)
(11139, 19)
data/hly1175.gz.parquet
(127080, 4)
(5268, 19)
(16407, 19)
data/hly1275.gz.parquet
(100824, 3)
(4334, 15)
(20741, 19)
data/hly1375.gz.parquet
(100823, 4)
(4566, 19)
(25307, 19)
data/hly1475.gz.parquet
(100080, 4)
(4165, 19)
(29472, 19)
data/hly175.gz.parquet
(112080, 3)
(5175, 15)
(34647, 19)
data/hly1775.gz.parquet
(127128, 4)
(5295, 19)
(39942, 19)
data/hly1875.gz.parquet
(70248, 4)
(2928, 19)
(42870, 19)
data/hly1975.gz.parquet
(100128, 4)
(4588, 19)
(47458, 19)
data/hly2075.gz.parquet
(185347, 4)
(6703, 19)
(54161, 19)
data/hly2175.gz.parquet
(267276, 4)
(11139, 19)
(65300, 19)
data/hly2275.gz.parquet
(267312, 4)
(11139, 19)
(76439, 19)
data/hly2375.gz.parquet
(267313, 4)
(11139, 19)
(87578, 19)
data/hly275.gz.parquet
(130416, 4)
(5448, 19)
(93026, 19)
data/hly3723.gz.parquet
(267313, 4)
(11139, 19)
(104165, 19)
data/hly375.gz.parquet
(139512, 4)
(5802, 19)
(109967, 19)
data/hly3904.gz.parquet
(267313, 4)
(11139, 19

  result = infer_dtype(pandas_collection)
