In [2]:
import json
import os
import pandas as pd
import numpy as np
import glob

print(pd.__version__, np.__version__)


1.4.1 1.19.4


In [61]:
def all_csv(indir_name, df_list):
    for i, f in enumerate(glob.glob(os.path.join(f"{indir_name}","**", "*entries*.csv"), recursive=True)):
        head, tail = os.path.split(f)
        print(i, tail)
        # df = pd.read_csv(f, header=0, parse_dates=[4], index_col=0)
        df = pd.read_csv(f, header=0, index_col=0)
        if len(df)<1:
            print(f"no entries: {head}, {tail}") 
            continue
        #column_type = str(df["dateString"].dtype)
        #if "datetime" not in str(column_type).lower(): 
        #    raise Exception(f"there was an import error: {column_type}")
        """
        df["unix_timestamp"] = df["date"]
        df["datetime"] = pd.to_datetime(df["dateString"])
        df.info()
        print(df)
        
        df["date"] = df["datetime"].dt.date
        df.info()
        print(df)
        """
        
        df["unix_timestamp"] = df["date"]
        # unix_timestamp in ms is a 13 digit number, in s it is a 10 digit number (in 2022)
        if not ((np.log10(df["unix_timestamp"]) > 12) & (np.log10(df["unix_timestamp"]) < 13)).all(): 
            raise ValueError("expected a 13 digit unix timestamp, but got a {} digit number.".format(np.int(np.log10(df["unix_timestamp"][0])) + 1))
        #df[["date","time"]] = df["dateString"].str.split("T", 1, expand=True)
        df["datetime_utc"] = pd.to_datetime(df['unix_timestamp'], unit='ms', utc=True)
        df["date"] = df["datetime_utc"].dt.date
        df_mod = df[["date","sgv"]]
        df2 = df_mod.groupby("date", as_index=False).agg(["mean", "std", "min", "max", "count"])
        # df2.reset_index()
        columns = []
        for col in df2.columns:
            columns.append(f"{col[0]}_{col[1]}")
        df2.columns = columns
        df2.fillna(value=0, inplace=True)  #
        df2.reset_index(inplace=True)
        df2["filename"] = tail
        fn_components = tail.split("_")
        if "OPENonOH" in head: 
            df2["user_id"] = fn_components[0]
            df2["second_id"] = fn_components[1]
        elif "OpenAPS" in head:
            df2["user_id"] = fn_components[0]
            df2["second_id"] = np.nan
        else: 
            raise ValueError("dataset needs to be in file path, it should be either 'OPENonOH' or OpenAPS")
        
        df_list.append(df2)
        


In [62]:
def main(dataset : str):
    indir = f"/home/reinhold/Daten/OPEN/{dataset}_Data/csv_per_measurement"
    outdir = f"/home/reinhold/Daten/OPEN/{dataset}_Data/csv_per_day/"
    outfile_name = f"entries_{dataset}.csv"
    df_list = []
    all_csv(indir, df_list)
    df = pd.concat(df_list, axis=0)
    #print(df)
    #df.info()
    df.to_csv(os.path.join(outdir, outfile_name))
    print(os.path.join(outdir, outfile_name) + " created")


In [63]:
dataset = "OpenAPS"
main(dataset)

0 56568290_entries_2019-05-10_to_2019-08-10.csv
no entries: /home/reinhold/Daten/OPEN/OpenAPS_Data/csv_per_measurement, 56568290_entries_2019-05-10_to_2019-08-10.csv
1 88252802_entries__to_2021-07-31.csv
2 79526193_entries_2017-10-14_to_2018-02-19.csv
3 64024750_entries__to_2018-07-03.csv
no entries: /home/reinhold/Daten/OPEN/OpenAPS_Data/csv_per_measurement, 64024750_entries__to_2018-07-03.csv
4 71236754_entries.csv
5 28608066_entries__to_2018-06-04.csv
6 66019205_entries_2017-10-09_to_2017-11-19.csv
no entries: /home/reinhold/Daten/OPEN/OpenAPS_Data/csv_per_measurement, 66019205_entries_2017-10-09_to_2017-11-19.csv
7 88004055_entries__to_2018-12-06.csv
no entries: /home/reinhold/Daten/OPEN/OpenAPS_Data/csv_per_measurement, 88004055_entries__to_2018-12-06.csv
8 89032650_entries_2020-01-15_to_2020-08-26.csv
9 96805916_entries.csv
no entries: /home/reinhold/Daten/OPEN/OpenAPS_Data/csv_per_measurement, 96805916_entries.csv
10 37948668_entries_2018-03-19_to_2018-06-06.csv
11 67359234_entr

In [64]:
dataset = "OPENonOH"
main(dataset)


0 10284147_23563672_entries__to_2020-10-23.csv
1 77667622_29269_entries__to_2018-01-27.csv
no entries: /home/reinhold/Daten/OPEN/OPENonOH_Data/csv_per_measurement, 77667622_29269_entries__to_2018-01-27.csv
2 08814820_16497270_entries_2020-03-23_to_2020-04-07.csv
3 82464452_21907694_entries__to_2020-09-22.csv
no entries: /home/reinhold/Daten/OPEN/OPENonOH_Data/csv_per_measurement, 82464452_21907694_entries__to_2020-09-22.csv
4 02773391_23772108_entries__to_2020-10-30.csv
5 93698253_24004237_entries__to_2020-11-09.csv
6 12826270_114520_entries__to_2018-09-12.csv
7 16184009_21761038_entries__to_2020-09-16.csv
8 13914709_23796834_entries__to_2020-10-31.csv
no entries: /home/reinhold/Daten/OPEN/OPENonOH_Data/csv_per_measurement, 13914709_23796834_entries__to_2020-10-31.csv
9 74373797_24003753_entries_2020-10-12_to_2020-11-09.csv
10 14844648_481457_entries__to_2018-11-22.csv
11 13788197_23089212_entries__to_2020-10-14.csv
12 77667622_21901055_entries__to_2020-09-22.csv
13 13914709_23796813_e