In [None]:
import os
import pandas as pd
from ruamel.yaml import YAML
from pathlib import Path
import time
import pyarrow.parquet as pq
import pyarrow as pa

# Parser used by the batch conversion loop further down in this cell.
# NOTE(review): typ='safe' is expected to select ruamel.yaml's safe loader and
# pure=True its pure-Python implementation -- confirm against the ruamel.yaml docs.
# A later cell rebinds the name `yaml` to a default YAML() instance.
yaml = YAML(typ='safe', pure=True)
# Duplicate mapping keys are disallowed; presumably a file containing them fails
# to load and is reported in the error log -- TODO confirm.
yaml.allow_duplicate_keys = False

# Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    """Flatten nested dicts/lists into a list of ``(path, value)`` string pairs.

    Dictionary keys are joined with ``.`` and list positions are written as
    ``[index]``, e.g. ``jobs.build.steps[0].run``. Only leaves (anything that
    is neither a dict nor a list) produce an entry; empty dicts and empty
    lists therefore contribute nothing.

    Args:
        data: The parsed document (or a sub-tree of it).
        prefix: Path of ``data`` inside the enclosing document; ``''`` for
            the root.

    Returns:
        A list of ``(path, value)`` tuples in document order, where both
        elements are ``str``. A ``data`` that is neither a dict nor a list
        yields an empty list.
    """
    if isinstance(data, dict):
        # str(key) matters for non-string keys: a raw falsy key such as 0 or
        # False used as the next prefix would make the level below drop it.
        children = [
            (f"{prefix}.{key}" if prefix else str(key), value)
            for key, value in data.items()
        ]
    elif isinstance(data, list):
        children = [(f"{prefix}[{index}]", item) for index, item in enumerate(data)]
    else:
        return []

    paths = []
    for child_path, child in children:
        if isinstance(child, (dict, list)):
            paths.extend(extract_paths(child, child_path))
        else:
            paths.append((child_path, str(child)))
    return paths



# Set the path to the workflows directory and output files
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'
output_file = 'workflows_data.parquet'
log_file = 'error_log.txt'

# Output layout: one row per (workflow, path, value) triple
columns = ['workflow_id', 'path', 'value']
batch_size = 500  # Process files in batches of this size

# All three columns are written as strings. Declaring the schema once means
# every batch handed to the writer has the same types, whatever pandas would
# infer for an individual batch.
output_schema = pa.schema([(name, pa.string()) for name in columns])

# Clear previous log file if it exists
if os.path.exists(log_file):
    os.remove(log_file)

# Start processing files (sorted so that the row order is reproducible)
yaml_files = sorted(os.listdir(extracted_path))
total_files = len(yaml_files)
processed_files = 0
failed_files = 0

start_time = time.time()

# A Parquet file cannot be extended by opening a second writer on the same
# path: that starts the file over. Keep a single writer open for the whole run
# and add every batch to it. Any output from a previous run is replaced.
with pq.ParquetWriter(output_file, output_schema, use_dictionary=True) as writer:
    for i in range(0, total_files, batch_size):
        batch_files = yaml_files[i:i + batch_size]
        batch_data = []

        for file in batch_files:
            file_path = os.path.join(extracted_path, file)
            workflow_id = file.split('.')[0]  # Assuming the file name is the workflow ID

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    workflow_data = yaml.load(f)
                paths_and_values = extract_paths(workflow_data)
            except Exception as e:
                # Best effort: record the problematic file and carry on with the rest
                failed_files += 1
                with open(log_file, 'a', encoding='utf-8') as log:
                    log.write(f"Error processing file {file}: {str(e)}\n")
                continue

            # Add each path-value pair to the batch data
            batch_data.extend(
                {'workflow_id': workflow_id, 'path': path, 'value': value}
                for path, value in paths_and_values
            )

        # A batch in which every file failed (or was empty) has nothing to write
        if batch_data:
            df_batch = pd.DataFrame(batch_data, columns=columns).astype(str)
            table = pa.Table.from_pandas(df_batch, schema=output_schema, preserve_index=False)
            writer.write_table(table)

        processed_files += len(batch_files)
        print(f"Processed {processed_files}/{total_files} files...")

end_time = time.time()
elapsed_time = end_time - start_time

print("Processing completed!")
print(f"Total number of workflows processed: {processed_files}")
print(f"Files skipped because of errors: {failed_files}")
print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Errors logged in: {log_file}")


In [None]:
# Show the kernel's working directory. NOTE(review): bare `pwd` relies on
# IPython automagic; the relative output paths used in this notebook
# ('workflows_data.parquet', 'error_log.txt') resolve against this directory.
pwd

In [None]:
"""
Experimenting: getting number of distinct workflows ...

"""

import pandas as pd

# Read the converted table back from disk
df = pd.read_parquet('workflows_data.parquet')

# Overall dimensions of the table: (rows, columns)
total_rows, total_columns = len(df.index), len(df.columns)

# Number of distinct workflow identifiers present in the table
unique_workflows = df['workflow_id'].nunique()

# Report the summary figures
print(f"Total number of unique workflows: {unique_workflows}")
print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")

# Preview of the first few rows
print(df.head())


In [None]:
import os
from ruamel.yaml import YAML
from pathlib import Path


    # Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    """Flatten nested dicts/lists into a list of ``(path, value)`` string pairs.

    Dictionary keys are joined with ``.`` and list positions are written as
    ``[index]``. Only leaves (values that are neither a dict nor a list)
    produce an entry, so empty containers contribute nothing.

    Note: this notebook also defines ``extract_paths`` in an earlier cell;
    running this cell rebinds the name to the definition below.

    Args:
        data: The parsed document (or a sub-tree of it).
        prefix: Path of ``data`` inside the enclosing document; ``''`` for
            the root.

    Returns:
        A list of ``(path, value)`` tuples in document order, both elements
        being ``str``. A ``data`` that is neither a dict nor a list yields an
        empty list.
    """
    def _walk(node, path):
        # Yield (path, value) pairs for every leaf below `node`, depth first.
        if isinstance(node, dict):
            # Keys are stringified so that paths are always str and so that a
            # falsy key (0, False) is not lost as the prefix of its children.
            children = [
                (f"{path}.{key}" if path else str(key), child)
                for key, child in node.items()
            ]
        elif isinstance(node, list):
            children = [(f"{path}[{position}]", child) for position, child in enumerate(node)]
        else:
            return

        for child_path, child in children:
            if isinstance(child, (dict, list)):
                yield from _walk(child, child_path)
            else:
                yield child_path, str(child)

    return list(_walk(data, prefix))

    

# Set the path to the workflows directory
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'

# List the directory in sorted order so that "first" means the same file on
# every run (os.listdir makes no promise about ordering)
yaml_files = sorted(os.listdir(extracted_path))
if not yaml_files:
    # Raise instead of calling exit(): this stops the cell (and a Run All)
    # with a clear message, without tearing down the notebook kernel session.
    raise FileNotFoundError(f"No YAML files found in the directory: {extracted_path}")

first_file = yaml_files[0]
file_path = os.path.join(extracted_path, first_file)

# Parse the YAML file with a default YAML() instance. Note that this rebinds
# the name `yaml`, which the first cell sets to a safe, pure-Python parser.
yaml = YAML()
with open(file_path, 'r', encoding='utf-8') as stream:
    workflow_data = yaml.load(stream)

# Extract paths and values
paths_and_values = extract_paths(workflow_data)

# Print the results
print(f"Paths and Values for {first_file}:")
for path, value in paths_and_values:
    print(f"{path}: {value}")
