In [33]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from datetime import timedelta

import data_utils

In [42]:
# Build per-year TEC/SYM-H CSV files.
#
# For every year: load the parsed TEC series, put it on a regular 60 s grid,
# split it into contiguous segments separated by fully-missing rows, and for
# each sufficiently long segment write a CSV with the "tec" and "symh" columns
# to processed_data/<year>/.
years = [2000, 2001, 2002, 2003, 2004, 2005, 2017]
processed_data = Path("processed_data/")

# Segments holding this many TEC samples or fewer are skipped.
# On the 60 s grid used below, 1440 samples make one day, so this is two days.
min_tec_samples = 1440 * 2

for year in years:
    year_folder = processed_data / str(year)
    year_folder.mkdir(parents=True, exist_ok=True)

    # tec
    tec = pd.read_csv(f"parsed_data/tucu-year-{year}.csv", index_col=0, parse_dates=True)

    # Resample to a regular 60 s grid; timestamps without data become NaN rows.
    tec = tec.resample(rule=timedelta(seconds=60)).asfreq()
    # Linearly fill at most 10 consecutive missing samples.
    # NOTE(review): with `limit=10` pandas also fills the first 10 samples of
    # gaps that are longer than 10 minutes instead of leaving those gaps
    # untouched -- confirm this is the intended treatment of long gaps.
    tec = tec.interpolate(method="linear", limit=10)

    # Every fully-missing row bumps the counter, so rows sharing a group id
    # form one contiguous stretch between missing rows.
    tec["group"] = tec.isnull().all(axis=1).cumsum()

    # symh
    symh = data_utils.load_symh_wdc(f"sym/ASY-SYM-WDCformat-{year}.dat")

    # A single groupby pass visits the group ids in ascending order, instead of
    # re-scanning the whole frame once per group id.
    for group, segment in tec.groupby("group"):
        no_of_tecs = segment["tec"].count()  # non-missing TEC samples only

        if no_of_tecs <= min_tec_samples:
            continue
        print(group, no_of_tecs)

        group_tec = segment.dropna()
        start_date = group_tec.index[0].to_pydatetime()
        end_date = group_tec.index[-1].to_pydatetime()

        # Join index-on-index so the TEC timestamps stay the row labels of the
        # result, including rows that have no matching SYM-H sample.
        df = pd.merge(
            left=group_tec,
            right=symh,
            how="left",
            left_index=True,
            right_index=True,
        ).loc[:, ["tec", "symh"]]

        # Format specs avoid nesting double quotes inside the f-string, which
        # is a syntax error on Python versions older than 3.12.
        filename = f"tec-symh-{start_date:%Y-%m-%d}-{end_date:%Y-%m-%d}.csv"
        filepath = year_folder / filename
        df.to_csv(filepath)
        print(filepath, "saved!")
        

1 70993
processed_data/2000/tec-symh-2000-01-01-2000-02-19.csv saved!
129 56534
processed_data/2000/tec-symh-2000-02-19-2000-03-29.csv saved!
154 27849
processed_data/2000/tec-symh-2000-03-29-2000-04-18.csv saved!
1585 21609
processed_data/2000/tec-symh-2000-04-19-2000-05-04.csv saved!
3016 193538
processed_data/2000/tec-symh-2000-05-05-2000-09-16.csv saved!
3018 17230
processed_data/2000/tec-symh-2000-09-16-2000-09-28.csv saved!
3153 14312
processed_data/2000/tec-symh-2000-09-29-2000-10-09.csv saved!
3221 120355
processed_data/2000/tec-symh-2000-10-09-2000-12-31.csv saved!
1 17289
processed_data/2001/tec-symh-2001-01-01-2001-01-13.csv saved!
1432 12969
processed_data/2001/tec-symh-2001-01-14-2001-01-23.csv saved!
33998 6132
processed_data/2001/tec-symh-2001-02-14-2001-02-18.csv saved!
34171 16318
processed_data/2001/tec-symh-2001-02-19-2001-03-02.csv saved!
34591 9822
processed_data/2001/tec-symh-2001-03-04-2001-03-11.csv saved!
34892 5656
processed_data/2001/tec-symh-2001-03-12-2001-

In [60]:
# Downsample every full-resolution CSV found in the year sub-folders of
# processed_data/ to 10-minute and 5-minute cadence. The results are written
# next to the source file with a "-freq-<N>min" suffix appended to its name.
data_path = Path("processed_data/")

for year_folder in data_path.iterdir():
    if year_folder.is_dir():
        print(year_folder)
        for df_path in year_folder.iterdir():
            is_csv = df_path.suffix == ".csv"
            # Files produced by an earlier run of this cell carry the cadence
            # in their name and must not be downsampled again.
            already_resampled = "5min" in df_path.name or "10min" in df_path.name
            if not is_csv or already_resampled:
                continue

            print(df_path, "starting ...")
            df = pd.read_csv(df_path, index_col=0, parse_dates=True)

            # Keep the sample sitting on each grid point, then drop rows with
            # any missing value. Both frames are built before anything is
            # written to disk.
            df_10min = df.resample(rule=timedelta(minutes=10)).asfreq().dropna()
            df_5min = df.resample(rule=timedelta(minutes=5)).asfreq().dropna()

            base_name = df_path.stem
            for frame, suffix in ((df_10min, "-freq-10min.csv"), (df_5min, "-freq-5min.csv")):
                frame.to_csv(df_path.parent / (base_name + suffix))
            print(df_path, "done!")

processed_data/2004
processed_data/2004/tec-symh-2004-07-08-2004-07-11.csv starting ...
processed_data/2004/tec-symh-2004-07-08-2004-07-11.csv done!
processed_data/2004/tec-symh-2004-01-01-2004-03-17.csv starting ...
processed_data/2004/tec-symh-2004-01-01-2004-03-17.csv done!
processed_data/2004/tec-symh-2004-10-27-2004-12-27.csv starting ...
processed_data/2004/tec-symh-2004-10-27-2004-12-27.csv done!
processed_data/2004/tec-symh-2004-10-02-2004-10-05.csv starting ...
processed_data/2004/tec-symh-2004-10-02-2004-10-05.csv done!
processed_data/2004/tec-symh-2004-07-22-2004-07-27.csv starting ...
processed_data/2004/tec-symh-2004-07-22-2004-07-27.csv done!
processed_data/2004/tec-symh-2004-09-24-2004-10-01.csv starting ...
processed_data/2004/tec-symh-2004-09-24-2004-10-01.csv done!
processed_data/2004/tec-symh-2004-04-22-2004-05-31.csv starting ...
processed_data/2004/tec-symh-2004-04-22-2004-05-31.csv done!
processed_data/2004/tec-symh-2004-10-06-2004-10-27.csv starting ...
processed

In [28]:


# Scratch check of nested directory creation. `exist_ok=True` keeps the cell
# re-runnable: without it a second execution raises FileExistsError.
Path("asd/asd/dds").mkdir(parents=True, exist_ok=True)

In [None]:
# Merge the raw (not resampled) TEC series of each year with that year's SYM-H.
# NOTE(review): `df` is rebound on every pass, so only the frame of the last
# year is still available once the loop finishes.
for year in [2000, 2001, 2002, 2003, 2004, 2005, 2017]:
    # load tec
    tec = pd.read_csv(f"parsed_data/tucu-year-{year}.csv", index_col=0, parse_dates=True)
    # load symh
    symh = data_utils.load_symh_wdc(f"sym/ASY-SYM-WDCformat-{year}.dat")

    # Join index-on-index so the TEC timestamps stay the row labels of the
    # result, including rows that have no matching SYM-H sample.
    df = pd.merge(
        left=tec,
        right=symh,
        how="left",
        left_index=True,
        right_index=True,
    ).loc[:, ["tec", "symh"]]
