In [None]:
import h5py
import os
import pandas as pd
import numpy as np
from filterServices import filterEasterEggs
import H5toDataframe as readH5

## Dateien in Pandas Dataframe schreiben (und velocity berechnen)

In [None]:
# Read every .h5 file in the dataset folder through readH5.read_hdf5 and store
# the returned (DataFrame, attributes) pair under its file name. Results whose
# frame is None are left out.
all_dataframes = {}
h5_names = (entry for entry in os.listdir('datasetsRosen') if entry.endswith('.h5'))
for file_name in h5_names:
    file_path = os.path.join('datasetsRosen', file_name)
    frame, frame_attrs = readH5.read_hdf5(file_path)
    if frame is None:
        continue
    all_dataframes[file_name] = (frame, frame_attrs)

# Number of files that ended up in the dictionary.
print(len(all_dataframes))

In [None]:
# Preview every loaded DataFrame that has a 'velocity' column and count the
# ones that do not.
velocityMissingCount = 0
for name, (df, attrs) in all_dataframes.items():
    # Explicit column lookup: hasattr(df, 'velocity') only works through
    # pandas' attribute-access fallback and would also be True for any
    # non-column attribute of that name.
    if 'velocity' not in df.columns:
        velocityMissingCount += 1
        continue
    print(f"{name}: DataFrame Shape: {df.shape}, Attributes: {attrs}")
    print("First 5 lines of the columns:")
    # Alternative preview restricted to the measurement columns:
    # print(df[['velocity', 'defect_channel', 'distance', 'magnetization', 'timestamp', 'wall_thickness']].head())
    print(df.head())
    print("\n")
print(f'Fehlende Velocity Werte: {velocityMissingCount}')

# Datenüberprüfung

## Welche Konfigurationen und Instrumente gibt es in den Datensätzen?

In [None]:
# Survey the 'configuration' and 'instrument' attributes stored on the
# top-level 'data' / 'Daten' groups of the HDF5 files in the dataset folder.
confs_directory = 'datasetsRosen'
configurations = []  # distinct configuration names, in order of first appearance
instruments = []     # distinct instrument names, in order of first appearance
configAmount = 0     # groups whose 'configuration' attribute is non-empty
instAmount = 0       # groups whose 'instrument' attribute is non-empty

for filename in os.listdir(confs_directory):
    # Same filter as the loading cell: anything that is not an .h5 file would
    # make h5py.File raise.
    if not filename.endswith('.h5'):
        continue
    # Open read-only; the context manager closes the handle again. Attribute
    # values are read while the file is still open.
    with h5py.File(os.path.join(confs_directory, filename), 'r') as file:
        # The group may be called 'data' or 'Daten'; a file that contains
        # both is evaluated once per group.
        for group_name in ('data', 'Daten'):
            if group_name not in file:
                continue
            group_attrs = file[group_name].attrs
            configName = group_attrs['configuration']
            instrumentName = group_attrs['instrument']
            if configName:
                configAmount += 1
            if instrumentName:
                instAmount += 1
            # Exact membership test. The previous substring comparison against
            # every stored name dropped names that are contained in a longer,
            # already recorded one.
            # NOTE(review): assumes the attributes are scalar strings — confirm.
            if configName not in configurations:
                configurations.append(configName)
            if instrumentName not in instruments:
                instruments.append(instrumentName)

print(f'Menge an Configurations: {configAmount}')
print('Configurations: ', configurations)
print(f'Menge an Instruments: {instAmount}')
print('Instruments: ', instruments)

## Wie viele falsche / fehlende Datensätze gibt es?

Prüft auf:

- negative Werte
- fehlende Datensätze / falsche Key-Bezeichnung

In [None]:
# Scan one measurement attribute across all top-level entries of the HDF5
# files and report negative values, missing datasets and Easteregg markers.
directory = 'datasetsRosen'
datasets = []
# The entries collected in `datasets` are only readable while their file is
# open, so the handles are kept here instead of being dropped. Close them with
# `for f in dataset_files: f.close()` once `datasets` is no longer needed.
dataset_files = []

for filename in os.listdir(directory):
    # Same filter as the loading cell: skip anything that is not an .h5 file.
    if not filename.endswith('.h5'):
        continue
    file = h5py.File(os.path.join(directory, filename), 'r')
    dataset_files.append(file)
    datasets.extend(file[key] for key in file.keys())

print(f'{len(datasets)} Datensätze erfolgreich in datasets-Array eingelesen')


# Replace to test a different attribute:
# defect_channel, distance, magnetization, timestamp, velocity, wall_thickness
attribute = "distance"
count_negativeValue = 0
negativeValuesIdx = []
count_noDataset = 0
noDatasetIdx = []
eastereggCount = 0

for idx, dataset in enumerate(datasets):
    if attribute not in dataset:
        count_noDataset += 1
        noDatasetIdx.append(idx)
        continue
    # Read the values from disk once and reuse them for both checks.
    attribute_values = dataset[attribute][...]
    if b'Easteregg :)' in attribute_values:
        # Entries carrying the marker are only counted and not converted to float.
        eastereggCount += 1
        continue
    if (attribute_values.astype(float) < 0).any():
        count_negativeValue += 1
        negativeValuesIdx.append(idx)

print(f"---------------------------------------------- {attribute} Analyse ----------------------------------------------")
print(f"Anzahl negative Werte: {count_negativeValue}\n Indizes: {negativeValuesIdx}\n")
print(f"Anzahl fehlender {attribute} Datensätze: {count_noDataset}\n Indizes: {noDatasetIdx}")
print("--------------------------------------------------------------------------------------------------------------")
print(f"Anzahl Eastereggs: {eastereggCount}")

# Show the keys of one sample entry; guarded so the cell also runs on folders
# with fewer entries than the sample index.
sample_idx = 242
if sample_idx < len(datasets):
    print(datasets[sample_idx].keys())
else:
    print(f'Kein Datensatz mit Index {sample_idx} vorhanden')

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of magnetization against wall thickness over all downloaded
# DataFrames that contain both columns.
# NOTE(review): `influxServices` is not imported anywhere in the visible part
# of this notebook — confirm an import exists, otherwise this cell fails on a
# fresh kernel.
influx_dataframes = influxServices.download_all_dataframes('')

# influx_dataframes = filterEasterEggs(influx_dataframes)

# Collect data
magnetization_values = []
wall_thickness_values = []

for name, (df, attrs) in influx_dataframes.items():
    if 'magnetization' not in df.columns or 'wall_thickness' not in df.columns:
        continue

    # Report missing values per column before they are removed.
    nan_magnetization = df.loc[df['magnetization'].isnull(), 'magnetization']
    nan_wall_thickness = df.loc[df['wall_thickness'].isnull(), 'wall_thickness']

    if not nan_magnetization.empty:
        print(f"NaN in Magnetization in DataFrame {name}: {nan_magnetization}")
    if not nan_wall_thickness.empty:
        print(f"NaN in wall_thickness in DataFrame {name}: {nan_wall_thickness}")

    # Drop incomplete rows as a whole so both lists stay aligned row by row.
    # Dropping NaNs per column shifts the pairs against each other and can
    # leave the two lists with different lengths, which makes the DataFrame
    # construction below fail.
    complete_rows = df[['magnetization', 'wall_thickness']].dropna()
    magnetization_values.extend(complete_rows['magnetization'])
    wall_thickness_values.extend(complete_rows['wall_thickness'])

# Build the DataFrame
data = {'magnetization': magnetization_values, 'wall_thickness': wall_thickness_values}
merged_df = pd.DataFrame(data, dtype=np.float64)

# Create the seaborn plot
fig, ax = plt.subplots()
sns.scatterplot(x='wall_thickness', y='magnetization', data=merged_df, ax=ax)
ax.set_title('Magnetization vs. Wall Thickness')
plt.show()