## Filling missing data for each station

In [2]:
import os
import pandas as pd

In [27]:
# Root folder of the project data. Defaults to the author's local Windows path;
# set the ARP_DATA_PATH environment variable to point the notebook at a copy of
# the data on another machine (an empty value falls back to the default).
BASE_DATA_PATH = os.environ.get('ARP_DATA_PATH') or r'C:\D\Whitireia\ARP\data'

In [28]:
# Folder holding the unzipped exports. os.listdir returns the name of every
# entry in it, in arbitrary order and without filtering by extension, so
# list_csv_files may contain non-CSV names too.
unzipped_data = os.path.join(BASE_DATA_PATH, r'unzipped_data')
list_csv_files = os.listdir(unzipped_data)
print(unzipped_data)

C:\D\Whitireia\ARP\data\unzipped_data


In [23]:
# Station id -> file name of the form '<id>__Temperature__<suffix>.csv'.
# Presumably these are the unzipped_data exports read by get_df_rel_humidity;
# the suffix letter (H or A) is kept exactly as it appears in each file name.
rel_hum_parser_dict = {
    station_id: f'{station_id}__Temperature__{suffix}.csv'
    for station_id, suffix in (
        (12442, 'H'),
        (18234, 'H'),
        (25354, 'A'),
        (25531, 'A'),
        (2592, 'A'),
        (2685, 'A'),
        (3445, 'A'),
        (40750, 'H'),
        (40984, 'H'),
        (41212, 'H'),
        (41229, 'A'),
        (41559, 'H'),
        (8567, 'A'),
    )
}

In [33]:
# Files present in stations_filled_daily that have no same-named file in
# stations_filled_all_data yet.
daily_dir = os.path.join(BASE_DATA_PATH, r'stations_filled_daily')
list_prefilled_stations = os.listdir(daily_dir)
all_data_dir = os.path.join(BASE_DATA_PATH, r'stations_filled_all_data')
filled_stations = os.listdir(all_data_dir)
remain_stations = [
    file_name
    for file_name in list_prefilled_stations
    if file_name not in filled_stations
]
print(remain_stations)

['12442.csv', '18234.csv', '21938.csv', '25354.csv', '25531.csv', '2592.csv', '2685.csv', '3145.csv', '31857.csv', '3445.csv', '40750.csv', '40984.csv', '41212.csv', '41229.csv', '41559.csv', '8567.csv']


In [38]:
def get_df(station: int) -> pd.DataFrame:
    """Load one station's file from the stations_filled_daily folder.

    The 'Unnamed: 0' column is discarded, and 'Observation time UTC' is parsed
    and reduced to plain calendar dates (datetime.date objects, not
    timestamps) before it becomes the index of the returned frame.
    """
    csv_path = os.path.join(BASE_DATA_PATH, r'stations_filled_daily', f'{station}.csv')
    daily = pd.read_csv(csv_path)
    daily = daily.drop(columns='Unnamed: 0')
    # .dt.date strips the time component, leaving an object-dtype column of dates.
    daily['Observation time UTC'] = pd.to_datetime(daily['Observation time UTC']).dt.date
    return daily.set_index('Observation time UTC')

In [41]:
def get_df_rel_humidity(station: int, station_column: str, duration_param: str) -> pd.DataFrame:
    """Return one unzipped_data export averaged per calendar day.

    The file read is ``<station>__<station_column>__<duration_param>`` inside
    the unzipped_data folder. When no file with exactly that name exists but
    the same name with a ``.csv`` suffix does, the ``.csv`` file is read
    instead, so ``duration_param`` may be passed as ``'H.csv'`` or just ``'H'``.

    Args:
        station: Station id used as the first part of the file name.
        station_column: Middle part of the file name (e.g. ``'Temperature'``).
        duration_param: Last part of the file name, with or without ``.csv``.

    Returns:
        Frame indexed by 'Observation time UTC' in 1-day bins, newest day
        first, holding the mean of every column left after the frequency,
        temperature and data-source columns are dropped (presumably the
        relative-humidity readings; verify against the export's columns).

    Raises:
        FileNotFoundError: If neither the exact name nor the ``.csv`` variant exists.
        KeyError: If any of the columns to drop is missing from the file.
    """
    csv_path = os.path.join(
        BASE_DATA_PATH, r'unzipped_data', f'{station}__{station_column}__{duration_param}'
    )
    # Prefer the exact name so existing callers are unaffected; only fall back
    # to the .csv variant when the exact name would have failed anyway.
    if not os.path.exists(csv_path) and os.path.exists(csv_path + '.csv'):
        csv_path += '.csv'

    raw = pd.read_csv(csv_path, parse_dates=['Observation time UTC'])

    remaining = raw.drop(
        columns=[
            'Frequency [D/H/S]',
            'Maximum Temperature [Deg C]',
            'Minimum Temperature [Deg C]',
            'Grass Temperature [Deg C]',
            'Mean Temperature [Deg C]',
            'Data Source',
        ]
    )

    return (
        remaining
        .groupby(pd.Grouper(key='Observation time UTC', freq='1D'))
        .mean()
        .sort_index(ascending=False)
    )

In [40]:
# Load station 2592 and let the bare name on the last line render the frame.
df = get_df(station=2592)
df

Unnamed: 0_level_0,station_id,Temperature,Screen_Observations,Rain,Wind
Observation time UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-12-09,2592.0,,,,11.41
2024-12-08,2592.0,,,,14.11
2024-12-07,2592.0,,,,9.39
2024-12-06,2592.0,,,0.0,7.41
2024-12-05,2592.0,,,0.0,14.12
...,...,...,...,...,...
1985-03-19,2592.0,14.5,,,
1985-03-18,2592.0,15.2,,,
1985-03-17,2592.0,16.5,,,
1985-03-16,2592.0,17.0,,,
