## Filling missing data for each station

In [54]:
import os
import pandas as pd

In [55]:
# Root folder under which every input and output sub-folder used below is
# resolved (unzipped_data, stations_filled_daily, stations_filled_all_data).
# NOTE: hard-coded absolute Windows path; adjust it when running elsewhere.
BASE_DATA_PATH = r'C:\D\Whitireia\ARP\data'

In [56]:
# Path of the 'unzipped_data' sub-folder under BASE_DATA_PATH.
unzipped_data = os.path.join(BASE_DATA_PATH, r'unzipped_data')
# Names of all entries found in that folder.
list_csv_files = os.listdir(unzipped_data)
# Echo the resolved path as a quick sanity check.
print(unzipped_data)

C:\D\Whitireia\ARP\data\unzipped_data


In [57]:
# Station id -> name of the file in 'unzipped_data' that is read for that
# station's relative humidity. File names follow the pattern
# '<station id>__<column group>__<duration code>.csv', so the key is taken
# from the leading id of each name.
rel_hum_parser_dict = {
    int(file_name.split('__')[0]): file_name
    for file_name in (
        '12442__Temperature__H.csv',
        '18234__Temperature__H.csv',
        '25354__Temperature__A.csv',
        '25531__Temperature__A.csv',
        '2592__Temperature__A.csv',
        '2685__Temperature__A.csv',
        '3445__Temperature__A.csv',
        '40750__Temperature__H.csv',
        '40984__Temperature__H.csv',
        '41212__Temperature__H.csv',
        '41229__Temperature__A.csv',
        '41559__Temperature__H.csv',
        '8567__Temperature__A.csv',
        '21938__Temperature__H.csv',
        '3145__Screen_Observations__D.csv',
        '31857__Screen_Observations__D.csv',
    )
}

In [58]:
# File names present in the daily-filled folder and in the final output folder.
list_prefilled_stations = os.listdir(os.path.join(BASE_DATA_PATH, 'stations_filled_daily'))
filled_stations = os.listdir(os.path.join(BASE_DATA_PATH, 'stations_filled_all_data'))

# Daily-filled files that have no counterpart in the final output folder yet.
_already_filled = set(filled_stations)
remain_stations = [name for name in list_prefilled_stations if name not in _already_filled]
print(len(remain_stations), remain_stations)

8 ['31857.csv', '3445.csv', '40750.csv', '40984.csv', '41212.csv', '41229.csv', '41559.csv', '8567.csv']


In [59]:
def get_df(station: int) -> pd.DataFrame:
    """Load one station's table from the ``stations_filled_daily`` folder.

    Reads ``<station>.csv``, discards the ``'Unnamed: 0'`` column and uses
    ``'Observation time UTC'`` as the index. The index values are plain
    ``datetime.date`` objects (the time of day is dropped).
    """
    csv_path = os.path.join(BASE_DATA_PATH, 'stations_filled_daily', f'{station}.csv')
    frame = pd.read_csv(csv_path)
    frame = frame.drop(columns='Unnamed: 0')

    # Parse the timestamps, then keep only their calendar date.
    timestamps = pd.to_datetime(frame['Observation time UTC'])
    frame['Observation time UTC'] = timestamps.dt.date
    return frame.set_index('Observation time UTC')

In [63]:
def get_df_rel_humidity(station: int, station_column: str, duration_param: str) -> pd.DataFrame:
    """Return the daily mean of a station's data read from ``unzipped_data``.

    The file read is ``<station>__<station_column>__<duration_param>``; no
    extension is appended here, so ``duration_param`` must already carry it
    (e.g. ``'H.csv'``).

    For stations 3145 and 31857 the temperature columns are dropped and the
    two remaining columns are renamed positionally to
    ``'Observation time UTC'`` and ``'Mean Relative Humidity [percent]'``.
    For every other station the frequency, temperature and data-source
    columns are dropped and the remaining names are kept as they are.

    The result is averaged per calendar day, ordered from the newest day to
    the oldest, and indexed by ``datetime.date`` objects.
    """
    csv_path = os.path.join(
        BASE_DATA_PATH, 'unzipped_data', f'{station}__{station_column}__{duration_param}'
    )
    raw = pd.read_csv(csv_path, parse_dates=['Observation time UTC'])

    # These two stations come with a different column layout than the rest.
    has_screen_layout = station in (3145, 31857)
    if has_screen_layout:
        unwanted = [
            'Air temperature (dry bulb) [Deg C]',
            'Wet bulb temperature [Deg C]',
            'Dew point temperature [Deg C]',
            'Data Source',
        ]
    else:
        unwanted = [
            'Frequency [D/H/S]',
            'Maximum Temperature [Deg C]',
            'Minimum Temperature [Deg C]',
            'Grass Temperature [Deg C]',
            'Mean Temperature [Deg C]',
            'Data Source',
        ]

    trimmed = raw.drop(columns=unwanted)
    if has_screen_layout:
        # Positional rename: exactly two columns must be left at this point.
        trimmed.columns = ['Observation time UTC', 'Mean Relative Humidity [percent]']

    # Average all readings of each calendar day, newest day first.
    per_day = trimmed.groupby(pd.Grouper(key='Observation time UTC', freq='1D'))
    daily = per_day.mean()
    daily = daily.sort_index(ascending=False)

    # Replace the timestamp index with plain date objects.
    daily.index = pd.to_datetime(daily.index).date
    return daily

In [61]:
# Quick inspection: show the daily-filled file names and the type of one entry.
print(list_prefilled_stations)
print(type(list_prefilled_stations[0]))

['12442.csv', '18234.csv', '21938.csv', '25354.csv', '25531.csv', '2592.csv', '2685.csv', '3145.csv', '31857.csv', '3445.csv', '40750.csv', '40984.csv', '41212.csv', '41229.csv', '41559.csv', '8567.csv']
<class 'str'>


In [64]:
# For every daily-filled station file: look up the matching source file in
# rel_hum_parser_dict, compute its daily means, left-join them onto the
# station table by date, store the joined values in 'Screen_Observations',
# and write the result to 'stations_filled_all_data' under the same name.
for station in list_prefilled_stations:
    station_id = int(station.split('.')[0])

    # Mapped name looks like '<id>__<column group>__<duration>.csv'; split once.
    # duration_param keeps the '.csv' suffix because get_df_rel_humidity
    # builds the file name from these parts without appending an extension.
    name_parts = rel_hum_parser_dict[station_id].split('__')
    station_column = name_parts[1]
    duration_param = name_parts[2]
    print(f"Station Id: '{station_id}', Station column: '{station_column}', Duration: '{duration_param}'")

    station_df = get_df(station=station_id)
    rel_hum_df = get_df_rel_humidity(station=station_id, station_column=station_column, duration_param=duration_param)

    # Left join keeps every row of the station table; days without a daily
    # mean end up as NaN.
    result_df = pd.merge(station_df, rel_hum_df, how='left', left_index=True, right_index=True)

    # Copy the joined values into 'Screen_Observations' (replacing the column
    # if it already exists), then remove the helper column.
    result_df['Screen_Observations'] = result_df['Mean Relative Humidity [percent]']
    result_df.drop('Mean Relative Humidity [percent]', axis=1, inplace=True)
    result_df.to_csv(os.path.join(BASE_DATA_PATH, r'stations_filled_all_data', station))

Station Id: '12442', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '18234', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '21938', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '25354', Station column: 'Temperature', Duration: 'A.csv'
Station Id: '25531', Station column: 'Temperature', Duration: 'A.csv'
Station Id: '2592', Station column: 'Temperature', Duration: 'A.csv'
Station Id: '2685', Station column: 'Temperature', Duration: 'A.csv'
Station Id: '3145', Station column: 'Screen_Observations', Duration: 'D.csv'
Station Id: '31857', Station column: 'Screen_Observations', Duration: 'D.csv'
Station Id: '3445', Station column: 'Temperature', Duration: 'A.csv'
Station Id: '40750', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '40984', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '41212', Station column: 'Temperature', Duration: 'H.csv'
Station Id: '41229', Station column: 'Temperature', Duration: 'A.csv'
Station 