In [1]:
import os
import pandas as pd
from ruamel.yaml import YAML
from pathlib import Path
import time
import pyarrow.parquet as pq
import pyarrow as pa

# Module-level YAML parser shared by the processing loop below.
# typ='safe' selects ruamel.yaml's safe loader and pure=True requests the
# pure-Python implementation.
yaml = YAML(typ='safe', pure=True)
# Disallow duplicate mapping keys explicitly, so a workflow that repeats a key
# is expected to fail to load (and be logged) rather than be silently merged.
# NOTE(review): this appears to match ruamel.yaml's default - confirm.
yaml.allow_duplicate_keys = False

# Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    """Flatten nested dicts/lists into a list of ``(path, value)`` string pairs.

    Dict keys are joined to the running prefix with ``.`` and list positions
    are appended as ``[index]``. Every leaf (any value that is neither a dict
    nor a list) yields one pair, with both the path and the value converted
    via ``str``.

    Args:
        data: The parsed structure to walk.
        prefix: Path accumulated so far; empty for the document root.

    Returns:
        Pairs in traversal order. Empty containers contribute nothing, and so
        does ``data`` itself when it is neither a dict nor a list.
    """
    if isinstance(data, dict):
        # While the prefix is still empty the key alone becomes the path.
        children = (
            (f"{prefix}.{name}" if prefix else name, child)
            for name, child in data.items()
        )
    elif isinstance(data, list):
        children = (
            (f"{prefix}[{position}]", child)
            for position, child in enumerate(data)
        )
    else:
        return []

    flattened = []
    for child_path, child in children:
        if isinstance(child, (dict, list)):
            # Containers are expanded recursively under their own path.
            flattened += extract_paths(child, child_path)
        else:
            flattened.append((str(child_path), str(child)))
    return flattened



# Set the path to the workflows directory and output files
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'
output_file = 'workflows_data.parquet'
log_file = 'error_log.txt'

# Output columns. Every column is stored as a string so that all batches share
# one fixed schema and can go through a single Parquet writer.
columns = ['workflow_id', 'path', 'value']
output_schema = pa.schema([(column, pa.string()) for column in columns])
batch_size = 500  # Process files in batches of this size

# Clear previous log file if it exists
if os.path.exists(log_file):
    os.remove(log_file)

# Start processing files (every directory entry is treated as a workflow file)
yaml_files = os.listdir(extracted_path)
total_files = len(yaml_files)
processed_files = 0
failed_files = 0

start_time = time.time()

# A Parquet file cannot be extended by re-opening it: a new ParquetWriter on an
# existing path starts the file over. One writer is therefore kept open for the
# whole run and each batch is written as additional row group(s). Opening the
# writer also replaces any output left over from a previous run, and leaving
# the `with` block (even on KeyboardInterrupt) finalizes the file so the
# batches written so far stay readable.
with pq.ParquetWriter(output_file, output_schema, use_dictionary=True) as writer:
    # Process files in batches
    for i in range(0, total_files, batch_size):
        batch_files = yaml_files[i:i + batch_size]
        batch_rows = {column: [] for column in columns}

        for file in batch_files:
            file_path = os.path.join(extracted_path, file)
            workflow_id = file.split('.')[0]  # Assuming the file name is the workflow ID

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    workflow_data = yaml.load(f)
                paths_and_values = extract_paths(workflow_data)
            except Exception as e:
                # Best effort: log the problematic file and keep going
                failed_files += 1
                with open(log_file, 'a', encoding='utf-8') as log:
                    log.write(f"Error processing file {file}: {str(e)}\n")
                continue

            # Add each path-value pair to the batch, forcing string columns
            for path, value in paths_and_values:
                batch_rows['workflow_id'].append(workflow_id)
                batch_rows['path'].append(str(path))
                batch_rows['value'].append(str(value))

        # Skip batches in which no file produced any row
        if batch_rows['path']:
            writer.write_table(pa.Table.from_pydict(batch_rows, schema=output_schema))

        processed_files += len(batch_files)
        print(f"Processed {processed_files}/{total_files} files...")

end_time = time.time()
elapsed_time = end_time - start_time

print("Processing completed!")
print(f"Total number of workflows processed: {processed_files}")
print(f"Files that failed to parse: {failed_files}")
print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Errors logged in: {log_file}")


Processed 500/2313606 files...
Processed 1000/2313606 files...
Processed 1500/2313606 files...
Processed 2000/2313606 files...
Processed 2500/2313606 files...
Processed 3000/2313606 files...
Processed 3500/2313606 files...
Processed 4000/2313606 files...
Processed 4500/2313606 files...
Processed 5000/2313606 files...
Processed 5500/2313606 files...
Processed 6000/2313606 files...
Processed 6500/2313606 files...
Processed 7000/2313606 files...
Processed 7500/2313606 files...
Processed 8000/2313606 files...
Processed 8500/2313606 files...
Processed 9000/2313606 files...
Processed 9500/2313606 files...
Processed 10000/2313606 files...
Processed 10500/2313606 files...
Processed 11000/2313606 files...
Processed 11500/2313606 files...
Processed 12000/2313606 files...
Processed 12500/2313606 files...
Processed 13000/2313606 files...
Processed 13500/2313606 files...
Processed 14000/2313606 files...
Processed 14500/2313606 files...
Processed 15000/2313606 files...
Processed 15500/2313606 files.

KeyboardInterrupt: 

In [5]:
pwd

'/Users/aref/Desktop/PhD/Codes/github-workflows-analysis'

In [2]:
import os
from ruamel.yaml import YAML
from pathlib import Path


    # Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    """Flatten nested YAML data into a list of ``(path, value)`` string pairs.

    Dict keys are appended to the prefix with ``.`` and list items with
    ``[index]``. Each leaf value (anything that is not a dict or list) produces
    one pair.

    Args:
        data: Parsed YAML structure (dict, list or scalar).
        prefix: Path accumulated so far; empty at the document root.

    Returns:
        List of ``(path, value)`` tuples in traversal order. Both elements are
        always ``str``, including when a top-level mapping key is not a string
        (for example ``1`` or ``None``). Empty containers and a non-container
        ``data`` yield no pairs.
    """
    paths = []
    if isinstance(data, dict):
        for key, value in data.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            if isinstance(value, (dict, list)):
                paths.extend(extract_paths(value, new_prefix))
            else:
                # str() matters at the root, where new_prefix is the raw key
                paths.append((str(new_prefix), str(value)))
    elif isinstance(data, list):
        for index, item in enumerate(data):
            new_prefix = f"{prefix}[{index}]"
            if isinstance(item, (dict, list)):
                paths.extend(extract_paths(item, new_prefix))
            else:
                paths.append((str(new_prefix), str(item)))
    return paths

    

# Set the path to the workflows directory
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'

# List the directory entries; the first one returned is used as the sample
yaml_files = os.listdir(extracted_path)
if not yaml_files:
    # Only report the problem. Calling exit() here would ask the notebook
    # kernel to shut down instead of merely ending this cell.
    print("No YAML files found in the directory.")
else:
    first_file = yaml_files[0]
    file_path = os.path.join(extracted_path, first_file)

    # Parse the YAML file.
    # NOTE: this rebinds the module-level `yaml` name to a default YAML()
    # parser, replacing the safe-mode parser configured in the earlier cell.
    yaml = YAML()
    with open(file_path, 'r', encoding='utf-8') as file:
        workflow_data = yaml.load(file)

    # Extract paths and values
    paths_and_values = extract_paths(workflow_data)

    # Print the results
    print(f"Paths and Values for {first_file}:")
    for path, value in paths_and_values:
        print(f"{path}: {value}")


Paths and Values for 3d193364f960e446c7ead86ea75d537054a5dd25a5cc5467cbac1c066ed27cc1:
name: darts PR merge workflow
on.push.branches[0]: master
jobs.lint.runs-on: ubuntu-latest
jobs.lint.steps[0].name: 1. Clone repository
jobs.lint.steps[0].uses: actions/checkout@v2
jobs.lint.steps[1].name: 2. Set up Python 3.10
jobs.lint.steps[1].uses: actions/setup-python@v1
jobs.lint.steps[1].with.python-version: 3.10
jobs.lint.steps[2].name: 3. Cache gradle distribution
jobs.lint.steps[2].uses: actions/cache@v2
jobs.lint.steps[2].with.path: ~/.gradle/wrapper/dists
jobs.lint.steps[2].with.key: tests-${{ runner.os }}-gradle-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties') }}
jobs.lint.steps[3].name: 3.1 Cache gradle packages
jobs.lint.steps[3].uses: actions/cache@v2
jobs.lint.steps[3].with.path: ~/.gradle/caches
jobs.lint.steps[3].with.key: tests-${{ runner.os }}-gradle-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties', 'build.gradle') }}
jobs.lint.steps[4].name: 4. Lint
jobs.lint.st