# Cleaning NOAA GHCN-D Dataset with Modin

Run all the cells to load the dataset, clean it, and write the result to `csv` files for use with NumS in a separate notebook.

In [None]:
# Notebook setup: bring up Ray, then import Modin's pandas API and helpers.
# ray.init runs before the Modin import; presumably Ray is the engine Modin
# executes on here -- TODO confirm. ignore_reinit_error=True lets this cell be
# re-run while Ray is already initialised without raising.
import ray
ray.init(ignore_reinit_error=True)
import modin.pandas as pd  # pandas-style API provided by Modin, under the usual alias
import warnings
from tqdm.auto import tqdm  # progress bars for the per-year loops
# Install a blanket "ignore" filter so no Python warnings are shown from here on.
warnings.filterwarnings("ignore");

In [None]:
# Fixed-width layouts of the two GHCN-D metadata files, written as
# column name -> field width in characters (dict order is the column order).
_inventory_layout = {
    "ID": 12,
    "LATITUDE": 9,
    "LONGITUDE": 10,
    "ELEMENT": 4,
    "FIRSTYEAR": 5,
    "LASTYEAR": 5,
}
_stations_layout = {
    "ID": 12,
    "LATITUDE": 9,
    "LONGITUDE": 10,
    "ELEVATION": 7,
    "STATE": 3,
    "NAME": 31,
    "GSN FLAG": 4,
    "HCN/CRN FLAG": 4,
    "WMO ID": 6,
}

# Neither file has a header row, so the column names come from the layouts.
inventory = pd.read_fwf(
    's3://noaa-ghcn-pds/ghcnd-inventory.txt',
    widths=list(_inventory_layout.values()),
    header=None,
    names=list(_inventory_layout),
)
stations = pd.read_fwf(
    's3://noaa-ghcn-pds/ghcnd-stations.txt',
    widths=list(_stations_layout.values()),
    header=None,
    names=list(_stations_layout),
)

# Element codes of interest.
elements = ["PRCP", "SNOW", "SNWD", "TMAX", "TMIN"]

In [None]:
def df_loader(year, local=False, local_dir="data"):
    """
    Load one yearly GHCN-D observations file (`<year>.csv`) into a DataFrame.

    year: the year to load; its string form is used as the file name.
    local: when falsy (the default) the file is read from the public
        `s3://noaa-ghcn-pds/csv/` location. When truthy it is read from
        `local_dir` instead. Callers in this notebook pass `local=...` as a
        keyword, which previously raised TypeError because the keyword did
        not exist.
    local_dir: directory that holds already-downloaded `<year>.csv` files.
        NOTE(review): the notebook does not show where local copies are kept;
        "data" is an assumed default -- confirm or pass the real directory.

    Returns a DataFrame with the columns ID, YEAR/MONTH/DAY, ELEMENT,
    DATA VALUE, M-FLAG, Q-FLAG, S-FLAG and OBS-TIME, where YEAR/MONTH/DAY has
    been parsed from `YYYYMMDD` into datetimes.
    """
    base = local_dir if local else 's3://noaa-ghcn-pds/csv'
    df = pd.read_csv(
        f"{base}/{year}.csv",
        header=None,  # the yearly files carry no header row
        names=["ID", "YEAR/MONTH/DAY", "ELEMENT", "DATA VALUE", "M-FLAG", "Q-FLAG", "S-FLAG", "OBS-TIME"],
        quoting=3,  # 3 == csv.QUOTE_NONE: treat quote characters as plain data
    )
    df["YEAR/MONTH/DAY"] = pd.to_datetime(df["YEAR/MONTH/DAY"], format="%Y%m%d")
    return df

In [None]:
def design_matrix(years, elements, target=None, local=False):
    """
    Build a single design matrix from the yearly observation files in `years`.

    Set target to your "y" predictor. If y has NaNs or missing values, we will drop the data row.

    years: iterable of years; each one is loaded with `df_loader`.
    elements: ELEMENT codes to keep; each becomes one column after pivoting.
        The temperature features below require "TMAX" and "TMIN" to be present.
    target: optional list of target column names. Years whose data contain no
        `target[0]` observations are skipped. With the default `None` no year
        is skipped for that reason (previously `None` raised TypeError here).
    local: when truthy it is forwarded to `df_loader` as `local=...`.

    Returns one row per (station, day) with the pivoted element columns,
    LATITUDE / LONGITUDE / ELEVATION from `stations`, and the derived TAVG and
    TRANGE columns. The "YEAR/MONTH/DAY" column keeps its name but holds the
    day of the year. An empty DataFrame is returned when no year contributes.
    """
    frames = []

    for year in tqdm(years):
        # Only pass `local` when it is set, so the default path calls the
        # loader with the year alone.
        if local:
            df = df_loader(year, local=local)
        else:
            df = df_loader(year)

        # Nothing to learn from a year that never observed the target element.
        if target and target[0] not in df["ELEMENT"].unique():
            continue

        df = df[df['ELEMENT'].isin(elements)]
        # Long -> wide: one row per (station, date), one column per element.
        df = pd.pivot_table(df, index=["ID", "YEAR/MONTH/DAY"], columns="ELEMENT", values="DATA VALUE").reset_index(level=[0, 1])
        # Inner join: observations from stations missing in `stations` are dropped.
        df = df.merge(stations[["ID", "LATITUDE", "LONGITUDE", "ELEVATION"]], how='inner', on='ID')

        if target:
            df = df.dropna(subset=target)
        # Rows missing any remaining value are dropped as well.
        df = df.dropna()

        # Replace the date with its day of year (vectorised). The column name
        # is deliberately left as "YEAR/MONTH/DAY": the former rename call
        # discarded its result, so returned frames have always used this name.
        df["YEAR/MONTH/DAY"] = df["YEAR/MONTH/DAY"].dt.dayofyear
        # Presumably converts tenths of a degree to degrees -- confirm against
        # the GHCN-D documentation.
        df["TMAX"] = df["TMAX"] / 10
        df["TMIN"] = df["TMIN"] / 10
        df["TAVG"] = (df["TMAX"] + df["TMIN"]) / 2
        df["TRANGE"] = df["TMAX"] - df["TMIN"]

        frames.append(df)

    if not frames:
        return pd.DataFrame()

    populated = [frame for frame in frames if not frame.empty]
    if not populated:
        # Every year was emptied by the filters: hand back the last (empty)
        # frame so the column layout is still visible to the caller.
        return frames[-1]
    if len(populated) == 1:
        return populated[0]
    # One concat instead of repeated DataFrame.append, which was removed in
    # pandas 2.0 and copies the accumulated frame on every call.
    return pd.concat(populated)


In [None]:
# Years to process: 2010 through 2020 inclusive.
years = [*range(2010, 2021)]

# Design-matrix columns written out as model inputs.
features = [
    'YEAR/MONTH/DAY',
    'TMAX',
    'TMIN',
    'LATITUDE',
    'LONGITUDE',
    'ELEVATION',
    'TAVG',
    'TRANGE',
]

# Design-matrix column(s) written out as the prediction target.
labels = ['PRCP']

In [None]:
import os

# Export one feature file and two label files per year into data/.
# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)

for year in tqdm(years):
    # One year at a time keeps each design matrix (and each output file) small.
    df = design_matrix([year], ['PRCP', 'TMAX', 'TMIN'], target=['PRCP'], local=True)
    if df.empty:
        # Nothing survived cleaning for this year; write no files for it.
        continue

    X = df[features]
    y = df[labels]

    # Binary label: every value above 0 becomes 1.0, everything else is kept.
    # The array comes from its own df[labels] selection rather than from `y`,
    # presumably so the in-place edit cannot alter the raw labels written below.
    y_np = df[labels].to_numpy()
    y_np[y_np > 0.0] = 1.0
    df["PRCP_BIN"] = y_np
    y_bin = df["PRCP_BIN"]

    # index_label=False leaves the index column without a header field.
    X.to_csv(f'data/X_{year}.csv', index_label=False)
    y.to_csv(f'data/y_{year}.csv', index_label=False)
    y_bin.to_csv(f'data/y_bin_{year}.csv', index_label=False)