In [5]:
import os
import h5py
import re
import zipfile

def extract_zip(zip_file, extract_to):
    """Unpack every member of a ZIP archive into a directory.

    Args:
        zip_file: Path of the archive to open for reading.
        extract_to: Directory that receives the extracted members.
    """
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)
def count_missing_values(folder_path):
    """Sum the ``missing_values`` file attribute over the ``.h5`` files in a folder.

    Every directory entry whose name ends with ``.h5`` is opened read-only
    and the value of its file-level ``missing_values`` attribute is added to
    the running total. Files without that attribute contribute 0.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        The accumulated total; 0 when no ``.h5`` file is present.
    """
    total = 0
    for entry in os.listdir(folder_path):
        if not entry.endswith('.h5'):
            continue
        with h5py.File(os.path.join(folder_path, entry), 'r') as h5_file:
            total += h5_file.attrs.get('missing_values', 0)
    return total

def count_values_containing_string(folder_path, target_string):
    """Count stored string values that contain ``target_string``.

    Every directory entry whose name ends with ``.h5`` is opened read-only
    and each top-level dataset is inspected. Datasets are examined element
    by element along their first axis; scalar datasets contribute their
    single value. Elements that are themselves arrays (rows of a
    multi-dimensional dataset) are not counted.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        target_string: Substring to look for.

    Returns:
        Number of matching values; 0 when no ``.h5`` file is present.
    """
    count = 0
    for filename in os.listdir(folder_path):
        if not filename.endswith('.h5'):
            continue
        file_path = os.path.join(folder_path, filename)
        with h5py.File(file_path, 'r') as f:
            for dataset in f.values():
                # Iterating a Group yields member *names*, which must not be
                # mistaken for stored values.
                if not isinstance(dataset, h5py.Dataset):
                    continue
                # Scalar (and empty-dataspace) datasets cannot be iterated;
                # read them as one value instead of raising TypeError.
                values = dataset if dataset.shape else [dataset[()]]
                for value in values:
                    # h5py >= 3 hands stored strings back as bytes, so
                    # decode before the substring test.
                    if isinstance(value, bytes):
                        value = value.decode('utf-8', errors='replace')
                    if isinstance(value, str) and target_string in value:
                        count += 1
    return count

def count_attributes_with_conditions(folder_path):
    """Count file-level attribute names that start or end with ``_`` or ``A-Z``.

    Every directory entry whose name ends with ``.h5`` is opened read-only
    and the names of its file-level attributes are tested against the
    pattern; only attributes attached to the file itself are looked at.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        Number of matching attribute names across all files.
    """
    name_pattern = re.compile(r'^[_A-Z].*|.*[_A-Z]$')
    matches = 0
    for entry in os.listdir(folder_path):
        if not entry.endswith('.h5'):
            continue
        with h5py.File(os.path.join(folder_path, entry), 'r') as h5_file:
            matches += sum(
                1 for attr_name in h5_file.attrs.keys()
                if name_pattern.match(attr_name)
            )
    return matches

def count_timestamp_formats(folder_path):
    """Count stored values shaped like ``YYYY-MM-DD HH:MM:SS``.

    Every directory entry whose name ends with ``.h5`` is opened read-only
    and each top-level dataset is inspected. Datasets are examined element
    by element along their first axis; scalar datasets contribute their
    single value. Only the digit layout is checked, not whether the value
    is a valid calendar date or time.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        Number of matching values; 0 when no ``.h5`` file is present.
    """
    # Compiled once instead of on every element.
    timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
    count = 0
    for filename in os.listdir(folder_path):
        if not filename.endswith('.h5'):
            continue
        file_path = os.path.join(folder_path, filename)
        with h5py.File(file_path, 'r') as f:
            for dataset in f.values():
                if not isinstance(dataset, h5py.Dataset):
                    continue
                # Scalar (and empty-dataspace) datasets cannot be iterated;
                # read them as one value instead of raising TypeError.
                values = dataset if dataset.shape else [dataset[()]]
                for value in values:
                    # h5py >= 3 hands stored strings back as bytes, so
                    # decode before applying the text pattern.
                    if isinstance(value, bytes):
                        value = value.decode('utf-8', errors='replace')
                    if isinstance(value, str) and timestamp_pattern.match(value):
                        count += 1
    return count

def count_bin_data_formats(folder_path):
    """Count ``bytes`` elements in object-dtype datasets of the ``.h5`` files.

    Every directory entry whose name ends with ``.h5`` is opened read-only.
    Only top-level datasets whose dtype compares equal to ``'object'`` are
    iterated, and each element that is a ``bytes`` instance is counted.

    NOTE(review): h5py >= 3 also returns variable-length strings as bytes,
    so such strings are presumably included in this count - confirm that
    this is intended.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        Number of ``bytes`` elements found; 0 when no ``.h5`` file is present.
    """
    total = 0
    for entry in os.listdir(folder_path):
        if not entry.endswith('.h5'):
            continue
        with h5py.File(os.path.join(folder_path, entry), 'r') as h5_file:
            for node in h5_file.values():
                if isinstance(node, h5py.Dataset) and node.dtype == 'object':
                    total += sum(1 for item in node if isinstance(item, bytes))
    return total

# Driver script: unpack the dataset archive, then run every counting
# function over the extracted folder and print one result line per metric.
zip_file_path = r"C:\Users\golde\Downloads\dataset.zip"
extracted_folder_path = r"C:\Users\golde\Downloads\dataset"

# Extract the ZIP file
extract_zip(zip_file_path, extracted_folder_path)

# Adjust the folder path for the analysis
# (assumes the archive contains a top-level "dataset" directory - TODO confirm)
folder_path = os.path.join(extracted_folder_path, "dataset")

# Sum of the 'missing_values' file attribute across all .h5 files.
missing_values_count = count_missing_values(folder_path)
print(f"Anzahl der fehlenden Werte: {missing_values_count}")

# Number of stored string values containing the text 'Easter Egg'.
easter_egg_count = count_values_containing_string(folder_path, 'Easter Egg')
print(f"Anzahl der Werte, die 'Easter Egg' enthalten: {easter_egg_count}")

# Number of file-level attribute names matching the naming pattern.
attribute_count = count_attributes_with_conditions(folder_path)
print(f"Anzahl der Attribute mit Großbuchstaben am Anfang oder einem Unterstrich davor oder danach: {attribute_count}")

# Number of values formatted as 'YYYY-MM-DD HH:MM:SS'.
timestamp_count = count_timestamp_formats(folder_path)
print(f"Anzahl der Timestamp-Formate: {timestamp_count}")

# Number of bytes elements found in object-dtype datasets.
bin_data_count = count_bin_data_formats(folder_path)
print(f"Anzahl der BinData-Formate: {bin_data_count}")


Anzahl der fehlenden Werte: 0
Anzahl der Werte, die 'Easter Egg' enthalten: 0
