In [34]:
#install packages
#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#!pip install seaborn
#!pip install zarr
# (os is part of the Python standard library - no pip install needed)
#!pip install pyarrow
#!pip install fastparquet
#!pip install scipy


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zarr
import os
from scipy.signal import spectrogram


In [15]:
# Define the data locations.
# NOTE: a raw string literal cannot end in a single backslash (r"...\" is a
# SyntaxError), so the trailing separator is added with os.path.join instead.
# `path` keeps a trailing separator because code in this notebook builds file
# names with `path + file_name`.
path = os.path.join(r"(fill in)raw_data", "")
# Joining the components avoids hard-coding a backslash separator.
zarr_path = os.path.join(r"(fill in) zarr", "data.zarr")


In [16]:
# Sanity check: confirm that a known raw-data file can be found under `path`.
probe_file = path + "vibration_101-20240512102014.txt"
if not os.path.exists(probe_file):
    raise FileNotFoundError(f"The file {probe_file} does not exist")
print("The file exists")

The file exists


# Load all functions

In [17]:
# Load a delimited text file from the raw-data directory
def load_data(file_name, header='infer'):
    """Read a delimited text file located under the global ``path``.

    Parameters:
        file_name (str): File name relative to ``path``.
        header: Forwarded unchanged to ``pd.read_csv`` (``'infer'`` by
            default; pass ``None`` for files without a header row).

    Returns:
        pd.DataFrame: The parsed file contents.
    """
    # os.path.join (as used by load_npy / load_csv) works whether or not
    # `path` ends with a separator; plain string concatenation built a wrong
    # path whenever the separator was missing.
    file_path = os.path.join(path, file_name)
    return pd.read_csv(file_path, header=header)
# Example usage
# df = load_data('vibration_101-20240512102014.txt', header=None)

# Transpose the vibration data
def tranpose_vibration(df, n_rows=100, n_samples=25000):
    """Flatten a wide vibration frame into a long two-column frame.

    Column 0 of ``df`` supplies one timestamp per row, column 1 is not used,
    and the samples are read from column 2 onward. The first ``n_rows`` rows
    are concatenated row after row.

    Parameters:
        df (pd.DataFrame): Wide frame with at least ``n_rows`` rows and
            ``n_samples + 2`` columns.
        n_rows (int): Number of rows to read (default 100).
        n_samples (int): Number of sample columns per row (default 25000).

    Returns:
        pd.DataFrame: Columns ``'Timestamp'`` and ``'Data'`` with
        ``n_rows * n_samples`` rows. Each row's timestamp appears once, on the
        first sample of that row; the remaining entries are ``None``.

    Raises:
        IndexError: If ``df`` is smaller than ``n_rows`` x ``n_samples + 2``.
    """
    # Positional slices never raise on out-of-range bounds, so check the shape
    # explicitly: callers catch IndexError to skip short/truncated files.
    if df.shape[0] < n_rows or df.shape[1] < n_samples + 2:
        raise IndexError(
            f"expected at least {n_rows} rows and {n_samples + 2} columns, "
            f"got shape {df.shape}"
        )

    # One vectorized slice instead of n_rows * n_samples scalar lookups;
    # ravel() walks the block row by row, matching the original ordering.
    data_values = df.iloc[:n_rows, 2:n_samples + 2].to_numpy().ravel()

    timestamps = []
    for row in range(n_rows):
        timestamps.append(df.iloc[row, 0])
        timestamps.extend([None] * (n_samples - 1))

    df_import = pd.DataFrame()
    df_import['Timestamp'] = timestamps
    df_import['Data'] = data_values
    return df_import

# Example usage
# df = tranpose_vibration(load_data('vibration_101-20240512102014.txt', header=None))

# Load a .npy file
def load_npy(file_name):
    """Load ``file_name`` from the global ``path`` with ``np.load`` and return
    the array as a transposed DataFrame."""
    array = np.load(os.path.join(path, file_name))
    return pd.DataFrame(array).T
# Example usage
# load_npy("fibers_1_20240508111511.npy")

# Load a CSV file
def load_csv(file_name):
    """Read ``file_name`` from the global ``path`` directory via ``pd.read_csv``."""
    return pd.read_csv(os.path.join(path, file_name))

# Example usage
# df = load_csv("environment_temperature-1715150570.csv")

# Function to sanitize column names
def sanitize_column_names(df):
    """Replace ``#``, ``[``, ``]`` and whitespace in column labels with ``_``.

    The labels are rewritten on ``df`` itself (in place) and the same object
    is returned.

    Parameters:
        df (pd.DataFrame): Frame whose column labels should be cleaned.

    Returns:
        pd.DataFrame: ``df``, with sanitized string column labels.
    """
    # Cast labels to str first: the .str accessor raises on purely numeric
    # labels (e.g. frames read with header=None) and produces NaN names for
    # non-string entries in a mixed index.
    labels = df.columns.astype(str)
    df.columns = labels.str.replace(r"[#\[\]\s]", "_", regex=True)
    return df



In [39]:
# Inspect the existing Zarr store. This cell sits before any data is written,
# so on a fresh run the store may not exist yet; opening it read-only would
# then fail, hence the existence check.
if os.path.exists(zarr_path):
    store = zarr.open(zarr_path, mode='r')
    print(store.tree())
else:
    store = None
    print(f"Zarr store not found at {zarr_path}; nothing to display yet")

In [18]:
# Function to save a DataFrame to Zarr
def save_to_zarr(dataframe, zarr_path, group_name, sanitize=True):
    """
    Save a DataFrame to a Zarr group, one dataset per column.

    Columns that already exist in the group are appended to; columns that do
    not exist yet are created. The column order and dtypes of ``dataframe``
    are recorded in the group attributes on every call.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame to save.
        zarr_path (str): Path to the Zarr file.
        group_name (str): Name of the group in the Zarr file.
        sanitize (bool): Whether to pass the frame through
            ``sanitize_column_names`` before writing.
    """
    # Ensure the parent directory exists. dirname is '' for a bare relative
    # path and os.makedirs('') raises, so only create it when there is one.
    parent_dir = os.path.dirname(zarr_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Open or create the store; 'a' mode allows appending or creating new
    zarr_store = zarr.open(zarr_path, mode='a')
    group = zarr_store.require_group(group_name)

    # Sanitize column names
    if sanitize:
        dataframe = sanitize_column_names(dataframe)

    # A single loop covers both the empty and the populated group: in an empty
    # group no column is present, so every column takes the create branch.
    for column in dataframe.columns:
        values = dataframe[column].to_numpy()
        if column in group:
            # Append new data to the existing dataset
            group[column].append(values)
        else:
            # Create the dataset. The former `appendable=True` argument was
            # passed in only one of two otherwise identical calls and is not a
            # documented create_dataset option, so it has been dropped.
            group.create_dataset(
                column,
                data=values,
                chunks=True,
                overwrite=False,
            )

    # Save metadata for column order and types
    group.attrs['columns'] = list(dataframe.columns)
    group.attrs['dtypes'] = dataframe.dtypes.apply(str).to_dict()

    print(f"Data saved to Zarr group '{group_name}'")


In [13]:
# Save the environment_rpm, environment_temperature and load_temperature files
# to Zarr. Each job is: (file-name prefix == Zarr group name, file extension,
# loader, whether to sanitize column names).
ingest_jobs = [
    ("environment_rpm", ".csv", load_csv, True),
    ("environment_temperature", ".csv", load_csv, True),
    # The load_temperature text files are read without a header row, so their
    # column names are left untouched.
    ("load_temperature", ".txt", lambda name: load_data(name, header=None), False),
]

# NOTE: save_to_zarr appends to existing datasets, so re-running this cell
# against an already populated store writes the same rows again.
for group_name, extension, loader, sanitize_names in ingest_jobs:
    # os.listdir returns entries in arbitrary order; sorting makes the append
    # order deterministic (the file names embed a timestamp, so lexical order
    # is presumably chronological - verify for your data).
    for file_name in sorted(os.listdir(path)):
        if file_name.startswith(group_name) and file_name.endswith(extension):
            dataframe = loader(file_name)
            print(os.path.splitext(file_name)[0])
            save_to_zarr(dataframe, zarr_path, group_name, sanitize=sanitize_names)


In [19]:
def save_fibers_to_zarr(fiber_number):
    """Append every ``fibers_<fiber_number>_*.npy`` file under ``path`` to Zarr.

    Each matching file is loaded with ``load_npy`` and written with
    ``save_to_zarr`` into the group ``fibers_<fiber_number>`` (column names
    are not sanitized). Progress is printed per file.

    Parameters:
        fiber_number: Identifier used to build the file-name prefix and the
            Zarr group name.
    """
    group_name = f"fibers_{fiber_number}"
    # Include the trailing '_' so that e.g. fiber 1 does not also pick up
    # files of fiber 10, 11, ... (names look like fibers_1_<timestamp>.npy).
    prefix = f"{group_name}_"

    # sorted(): os.listdir order is arbitrary, and the data is appended in
    # iteration order, so make that order deterministic.
    files_to_process = sorted(
        file_name for file_name in os.listdir(path)
        if file_name.startswith(prefix) and file_name.endswith(".npy")
    )

    total_files = len(files_to_process)
    print(f"Total files to process: {total_files}")

    processed_files_count = 0
    for file_name in files_to_process:
        dataframe = load_npy(file_name)

        print(f"Processing file {processed_files_count + 1} of {total_files}: {os.path.splitext(file_name)[0]}")

        save_to_zarr(dataframe, zarr_path, group_name, sanitize=False)

        processed_files_count += 1
    print(f"Processing complete. Total files processed: {processed_files_count}")





In [20]:
# Write the fibers_1 and fibers_2 files to the Zarr store, in that order.
for fiber_id in (1, 2):
    save_fibers_to_zarr(fiber_id)

Total files to process: 2015
Processing file 1 of 2015: fibers_1_20240508084403
Data saved to Zarr group 'fibers_1'
Processing file 2 of 2015: fibers_1_20240508084554
Data saved to Zarr group 'fibers_1'
Processing file 3 of 2015: fibers_1_20240508084745
Data saved to Zarr group 'fibers_1'
Processing file 4 of 2015: fibers_1_20240508084935
Data saved to Zarr group 'fibers_1'
Processing file 5 of 2015: fibers_1_20240508085126
Data saved to Zarr group 'fibers_1'
Processing file 6 of 2015: fibers_1_20240508085316
Data saved to Zarr group 'fibers_1'
Processing file 7 of 2015: fibers_1_20240508085507
Data saved to Zarr group 'fibers_1'
Processing file 8 of 2015: fibers_1_20240508085657
Data saved to Zarr group 'fibers_1'
Processing file 9 of 2015: fibers_1_20240508085848
Data saved to Zarr group 'fibers_1'
Processing file 10 of 2015: fibers_1_20240508090039
Data saved to Zarr group 'fibers_1'
Processing file 11 of 2015: fibers_1_20240508090229
Data saved to Zarr group 'fibers_1'
Processing f

In [11]:
def save_vibration_to_zarr(vibration_number):
    """Append every ``vibration_<vibration_number>-*.txt`` file under ``path`` to Zarr.

    Files ending in ``oct.txt`` are excluded. Each remaining file is read with
    ``load_data(..., header=None)``, reshaped with ``tranpose_vibration`` and
    written with ``save_to_zarr`` into the group
    ``vibration_<vibration_number>`` (column names are not sanitized). Files
    for which loading/reshaping raises ``IndexError`` are reported and skipped.

    Parameters:
        vibration_number: Identifier used to build the file-name prefix and
            the Zarr group name.
    """
    group_name = f"vibration_{vibration_number}"
    # Include the '-' separator so that e.g. sensor 101 cannot also match a
    # sensor 1010 (names look like vibration_101-<timestamp>.txt).
    prefix = f"{group_name}-"

    # sorted(): os.listdir order is arbitrary, and the data is appended in
    # iteration order, so make that order deterministic.
    files_to_process = sorted(
        file_name for file_name in os.listdir(path)
        if file_name.startswith(prefix)
        and file_name.endswith(".txt")
        and not file_name.endswith("oct.txt")
    )

    total_files = len(files_to_process)
    print(f"Total files to process: {total_files}")

    processed_files_count = 0
    for file_name in files_to_process:
        print("Starting to process file: ", file_name)

        # A file that is too short for tranpose_vibration raises IndexError;
        # skip it instead of aborting the whole run.
        try:
            raw_data = load_data(file_name, header=None)
            dataframe = tranpose_vibration(raw_data)
        except IndexError as e:
            print(f"Skipping file '{file_name}' due to IndexError: {e}")
            continue

        print(f"Processing file {processed_files_count + 1} of {total_files}: {os.path.splitext(file_name)[0]}")

        # Save to Zarr only if processing was successful
        save_to_zarr(dataframe, zarr_path, group_name, sanitize=False)
        processed_files_count += 1

    print(f"Processing complete. Total files processed: {processed_files_count}")


In [12]:
# Write the vibration_101, vibration_102 and vibration_103 files to the Zarr
# store, in that order.
for vibration_id in (101, 102, 103):
    save_vibration_to_zarr(vibration_id)


Total files to process: 102
Starting to process file:  vibration_101-20240508084311.txt
Processing file 1 of 102: vibration_101-20240508084311
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration_101-20240508102412.txt
Processing file 2 of 102: vibration_101-20240508102412
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration_101-20240508120511.txt
Processing file 3 of 102: vibration_101-20240508120511
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration_101-20240508134610.txt
Processing file 4 of 102: vibration_101-20240508134610
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration_101-20240508152709.txt
Processing file 5 of 102: vibration_101-20240508152709
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration_101-20240508170808.txt
Processing file 6 of 102: vibration_101-20240508170808
Data saved to Zarr group 'vibration_101'
Starting to process file:  vibration