In [1]:
"""
level 1
Script 4: wristband data (mainly: electrodermal activity. Can be modified later on to add other measures if required)
Reading the raw files from avro to csv
Extracting aggr eda, temp and acc (these can be used for plotting or other analyses as required)
"""

'\nScript 4: wristband data (mainly: electrodermal activity. Can be modified later on to add other measures if required)\nReading the raw files from avro to csv\nExtracting aggr eda, temp and acc (these can be used for plotting or other analyses as required)\n'

In [8]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import os
import warnings
from avro.datafile import DataFileReader
from avro.io import DatumReader

import json
import csv
import os
from datetime import datetime, timedelta
import pytz

In [9]:
"""
Then functions
"""
def utc_cet(utc, target_timezone='Europe/Berlin'):
    """
    Convert a UTC epoch timestamp given in microseconds to an aware local datetime.

    Parameters
    ----------
    utc : int or float
        Microseconds elapsed since 1970-01-01 00:00:00 UTC.
    target_timezone : str or datetime.tzinfo, optional
        Zone to convert into. A string is resolved with ``pytz.timezone``; a
        ``tzinfo`` instance is used as given. Defaults to 'Europe/Berlin'
        (CET/CEST), which is what this notebook used originally. Pass another
        zone here instead of editing the function body.

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime expressed in the target zone.
    """
    # Local import: the notebook's import cell only brings in datetime and timedelta.
    from datetime import timezone

    # Adding a timedelta to an aware epoch keeps the arithmetic in whole
    # microseconds (no float seconds), also works for pre-1970 values, and avoids
    # datetime.utcfromtimestamp, which is deprecated since Python 3.12.
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    utc_time = epoch + timedelta(microseconds=utc)

    # Accept either a zone name or an already constructed tzinfo object.
    if isinstance(target_timezone, str):
        target_timezone = pytz.timezone(target_timezone)

    return utc_time.astimezone(target_timezone)

def _safe_timestamp_prefix(start_us):
    """Return the local start time of a stream as a file-name-safe string."""
    local_start = utc_cet(start_us)
    # Drop the UTC offset, then sanitise ONLY the timestamp text. Doing the
    # replacements before the suffix is appended means the '.' of '.csv' can never
    # be rewritten when the start time has no fractional seconds.
    return (str(local_start.replace(tzinfo=None))
            .replace(":", "_")
            .replace(".", "_")
            .replace(" ", "_"))


def _recording_stem(avro_file_path):
    """Return the Avro file name without directories, cut at its first dot."""
    # Treat both '\\' and '/' as separators so the result never contains a
    # directory part, whichever style of path the caller passes in.
    file_name = avro_file_path.replace("\\", "/").rsplit("/", 1)[-1]
    return file_name.split(".")[0]


def _write_sampled_csv(output_dir, file_suffix, stream, value_headers, value_columns):
    """Write one regularly sampled stream to '<start time>_<file_suffix>' in output_dir."""
    start_us = stream["timestampStart"]
    sampling_frequency = stream["samplingFrequency"]
    file_name = '_'.join([_safe_timestamp_prefix(start_us), file_suffix])

    with open(os.path.join(output_dir, file_name), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["unix_timestamp", "CET_timestamp", *value_headers])
        for i, values in enumerate(zip(*value_columns)):
            # Only the start time is stored, so sample i is reconstructed as
            # start + i * period, with the period in microseconds (1e6 / Hz).
            # round() turns the float result back into whole microseconds.
            ts = round(start_us + i * (1e6 / sampling_frequency))
            writer.writerow([ts, utc_cet(ts), *values])


def _write_event_csv(output_dir, file_name, header, events):
    """Write a single-column CSV holding one event timestamp per row."""
    with open(os.path.join(output_dir, file_name), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([header])
        writer.writerows([event] for event in events)


def read_avro(avro_file_path, output_dir):
    """
    Export the sensor streams of one wristband Avro file to CSV files.

    The first record of the file is read and its "rawData" entry is split into
    one CSV per stream, all written into ``output_dir``:

    * accelerometer, gyroscope, eda, temperature, bvp and steps are named after
      the local start time of the stream and contain the reconstructed
      microsecond timestamp, its local-time conversion and the sample value(s);
    * tags and systolic peaks carry no start time, so they are named after the
      Avro file itself and contain the stored timestamps unchanged (units as
      delivered by the device - presumably micro- and nanoseconds, judging by
      the key names; confirm against the device documentation).

    Parameters
    ----------
    avro_file_path : str
        Path of the Avro file to read.
    output_dir : str
        Existing directory that receives the CSV files.

    Raises
    ------
    StopIteration
        If the Avro file contains no record.
    KeyError
        If an expected stream or field is missing from the record.
    """
    # The context manager closes the file handle even if decoding fails.
    with open(avro_file_path, "rb") as avro_handle:
        reader = DataFileReader(avro_handle, DatumReader())
        data = next(reader)

    raw = data["rawData"]
    recording_stem = _recording_stem(avro_file_path)

    # Accelerometer
    acc = raw["accelerometer"]
    _write_sampled_csv(output_dir, 'accelerometer.csv', acc,
                       ["x", "y", "z"], [acc["x"], acc["y"], acc["z"]])

    # Gyroscope
    gyro = raw["gyroscope"]
    _write_sampled_csv(output_dir, 'gyroscope.csv', gyro,
                       ["x", "y", "z"], [gyro["x"], gyro["y"], gyro["z"]])

    # Eda
    eda = raw["eda"]
    _write_sampled_csv(output_dir, 'eda_cet.csv', eda, ["eda"], [eda["values"]])

    # Temperature
    tmp = raw["temperature"]
    _write_sampled_csv(output_dir, 'temperature_cet.csv', tmp,
                       ["temperature"], [tmp["values"]])

    # Tags
    _write_event_csv(output_dir, '_'.join([recording_stem, 'tags.csv']),
                     "tags_timestamp", raw["tags"]["tagsTimeMicros"])

    # BVP
    bvp = raw["bvp"]
    _write_sampled_csv(output_dir, 'bvp.csv', bvp, ["bvp"], [bvp["values"]])

    # Systolic peaks
    _write_event_csv(output_dir, '_'.join([recording_stem, 'systolic_peaks.csv']),
                     "systolic_peak_timestamp", raw["systolicPeaks"]["peaksTimeNanos"])

    # Steps
    steps = raw["steps"]
    _write_sampled_csv(output_dir, 'steps.csv', steps, ["steps"], [steps["values"]])

    

In [10]:
"""
Code block to reference the necessary folders
"""
# Subject root folder; the cells below scan it for sub-folders ending in '_d'.
# Surrounding whitespace and quotes are stripped because paths pasted from a file
# manager (e.g. Windows "Copy as path") often carry them, which would otherwise
# only show up later as a confusing error from os.listdir.
parentfolder = input('enter the subject folder: ').strip().strip('"\'').strip()
if not os.path.isdir(parentfolder):
    raise FileNotFoundError(f"subject folder not found: {parentfolder!r}")

# Names of the folders used inside every '_d' sub-folder.
folder1 = 'empatica'
folder2 = 'saved_figures'

# Sub-folders of `folder1`.
folder11 = 'aggr_p_min'
folder12 = 'avro_files'
folder13 = 'avro2csv'

enter the subject folder:  C:\Users\Ananya Rao\Documents\CAM_LMU\LMU_Stream_HC_002_4


In [11]:
"""
re-running the essential steps of the code above for files of other days
"""
"""
need to generate files for each and every avro file generated for the day. 
first generate all the new required directories
"""
# Create one output directory per raw file:
# <subject>/<*_d>/empatica/avro2csv/<file name up to its first dot>
for subfolder in os.listdir(parentfolder):
    if not subfolder.endswith('_d'):
        continue
    avro_dir = os.path.join(parentfolder, subfolder, folder1, folder12)
    csv_root = os.path.join(parentfolder, subfolder, folder1, folder13)
    for file in os.listdir(avro_dir):
        # makedirs(exist_ok=True) also creates a missing 'avro2csv' parent and
        # makes the cell safe to re-run; os.mkdir raised FileExistsError on the
        # second run and its return value (always None) was never needed.
        os.makedirs(os.path.join(csv_root, file.split(".")[0]), exist_ok=True)

In [12]:
"""
now generate all the new required files to be stored within the newly created directories
"""
for subfolder in os.listdir(parentfolder):
    if not subfolder.endswith('_d'):
        continue

    avro_dir = os.path.join(parentfolder, subfolder, folder1, folder12)
    csv_root = os.path.join(parentfolder, subfolder, folder1, folder13)

    for file_name in sorted(os.listdir(avro_dir)):
        ipFile = os.path.join(avro_dir, file_name)
        # Derive the output folder from the input file name. Pairing two
        # separate os.listdir() results by index is unsafe: listing order is
        # arbitrary, and names sort differently with and without the extension,
        # so CSVs could be written into another recording's folder.
        opDir = os.path.join(csv_root, file_name.split(".")[0])
        os.makedirs(opDir, exist_ok=True)
        read_avro(ipFile, opDir)
               

OSError: [Errno 28] No space left on device