Insert data into the database and start unifying keys, etc.

Manage Imports

In [2]:
import os
import h5py
from pymongo import MongoClient
from dotenv import load_dotenv
from zoneinfo import ZoneInfo
from datetime import datetime

Get the database URI from the .env file and set up the other variables. Be sure to set DB_URI in the .env file to the URI of a MongoDB database.

In [15]:
# Load variables from the .env file so that DB_URI is available below
load_dotenv()

# MongoDB connection string; a KeyError is raised here if DB_URI is not set
uri = os.environ['DB_URI']

# Directory containing the .h5 files
directory = '../data'

# MongoDB database connection
client = MongoClient(uri)
db = client['rosen']
collection = db['big_data_2']


In [9]:
def calc_average(value1, value2):
    """Return the arithmetic mean of ``value1`` and ``value2``."""
    total = value1 + value2
    return total / 2

In [10]:
def _average_of_neighbours(values, index):
    """Return the mean of the entries directly before and after ``index``.

    Raises ValueError if ``index`` is the first or last position (one
    neighbour is missing) or if a neighbour cannot be parsed as a float.
    """
    if index == 0 or index >= len(values) - 1:
        # values[-1] would silently wrap around to the last element and
        # values[len(values)] would raise IndexError, so reject both edges.
        raise ValueError(f"no neighbour on both sides of index {index}")
    return calc_average(float(values[index - 1]), float(values[index + 1]))


def _fill_with_neighbour_average(values, index, raw):
    """Replace ``values[index]`` with its neighbour average, or the decoded text on failure."""
    try:
        values[index] = _average_of_neighbours(values, index)
    except ValueError:
        print("error on calculating average, leaving as decoded string")
        # on error decode as string
        values[index] = str(raw.decode())


def get_data_from_group(group):
    """Extract all datasets of ``group`` into a dict of cleaned Python lists.

    Parameters
    ----------
    group
        Mapping-like object whose ``items()`` yields ``(name, dataset)`` pairs;
        ``dataset[()].tolist()`` must return the values as a list.

    Returns
    -------
    dict
        Normalised key -> list of values. Keys are lower-cased, keys with a
        leading/trailing underscore have their underscores removed, and
        ``"wallthickness"`` is renamed to ``"wall_thickness"``.

    Binary entries are handled as follows:
    * bytes that parse as a number become a float;
    * ``b'Easteregg :)'`` and any other unparsable bytes are replaced by the
      average of the previous and next value; if that is impossible (entry at
      either end of the list, or a non-numeric neighbour) the decoded string
      is stored instead;
    * for the ``"timestamp"`` key, unparsable bytes are read as a
      ``%Y-%m-%dT%H:%M:%S`` date string in UTC and stored as a Unix timestamp;
      on failure the raw bytes are left in place.
    """
    data = {}
    for key, item in group.items():
        new_key = key.lower()
        if new_key.startswith("_") or new_key.endswith("_"):
            # NOTE(review): this drops every underscore, not only the leading or
            # trailing ones. Kept unchanged because stored key names depend on it.
            new_key = new_key.replace("_", "")
        if new_key == "wallthickness":
            print("fixing \"wallthickness\" name")
            new_key = "wall_thickness"

        values = item[()].tolist()
        data[new_key] = values

        # handle inconsistencies (for now only binary data)
        for index, i in enumerate(values):
            if not isinstance(i, (bytes, bytearray)):
                # non-binary entries are expected to be floats; only report others
                if not isinstance(i, float):
                    print(f"value is not binary and not float: {i}")
                continue

            try:
                # most binary entries are just numbers stored as text
                values[index] = float(i)
            except ValueError:
                if i == b'Easteregg :)':
                    print(f"[{new_key}] hit {i}, taking average between last and next value")
                    _fill_with_neighbour_average(values, index, i)
                elif new_key == "timestamp":
                    try:
                        # date string such as 2014-03-24T03:58:10 -> Unix timestamp (UTC)
                        timezone = ZoneInfo("UTC")
                        date = datetime.strptime(str(i.decode()), '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone)
                        values[index] = date.timestamp()
                    except ValueError:
                        print(f"error converting to timestamp {i}")
                else:
                    print(f"[{new_key}] missed binary data {i}, taking average between last and next value")
                    _fill_with_neighbour_average(values, index, i)
    return data

In [13]:
def process_h5_file(file_path, insert_into_db = False, max_len = 1000):
    """Read the data group(s) of one HDF5 file and optionally store them in MongoDB.

    Every top-level group named ``data`` or ``daten`` (case-insensitive) is
    converted with ``get_data_from_group``; any other group is reported and
    skipped. The resulting document uses the group's ``id`` attribute as
    ``_id`` and also carries its ``instrument`` and ``configuration``
    attributes.

    Parameters
    ----------
    file_path
        Path of the .h5 file; it is opened read-only.
    insert_into_db
        When True, each document is inserted into the module-level
        ``collection``.
    max_len
        Length that shorter value lists are padded to with NaN. Defaults to
        1000 (per the original author's note, the longest series is always
        1000). Pass ``None`` to pad to the longest series found in the group.
    """
    with h5py.File(file_path, 'r') as h5file:
        for name, group in h5file.items():
            if name.lower() not in ('data', 'daten'):
                print(f"Unknown group: {name}")
                continue

            points = get_data_from_group(group)

            # Bring all series to a common length so they line up index by index
            if max_len is None:
                target_len = max((len(point) for point in points.values()), default=0)
            else:
                target_len = max_len
            for point in points.values():
                missing = target_len - len(point)
                if missing > 0:
                    point.extend([float("NaN")] * missing)

            data = {'_id': group.attrs['id'], "instrument": group.attrs['instrument'], "configuration": group.attrs['configuration']}
            data.update(points)
            if insert_into_db:
                collection.insert_one(data)  # insert the document into MongoDB

In [14]:
# Iterate over all .h5 files in the configured directory and insert each one into the database
for filename in os.listdir(directory):
    if filename.endswith('.h5'):
        process_h5_file(os.path.join(directory, filename), insert_into_db = True)
        #break  # uncomment to stop after the first file while debugging
        
print('Verarbeitung abgeschlossen.')

[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', taking average between last and next value
[magnetization] hit b'Easteregg :)', tak