In [1]:
import os
import pandas as pd
from ruamel.yaml import YAML
from pathlib import Path
import time
import pyarrow.parquet as pq
import pyarrow as pa

# Initialize YAML parser with safe mode and duplicate key handling
yaml = YAML(typ='safe', pure=True)
yaml.allow_duplicate_keys = False

# Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    paths = []
    if isinstance(data, dict):
        for key, value in data.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            if isinstance(value, (dict, list)):
                paths.extend(extract_paths(value, new_prefix))
            else:
                paths.append((str(new_prefix), str(value)))
    elif isinstance(data, list):
        for index, item in enumerate(data):
            new_prefix = f"{prefix}[{index}]"
            if isinstance(item, (dict, list)):
                paths.extend(extract_paths(item, new_prefix))
            else:
                paths.append((str(new_prefix), str(item)))
    return paths



# Set the path to the workflows directory and output files
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'
output_file = 'workflows_data.parquet'
log_file = 'error_log.txt'

# Initialize an empty DataFrame for storing results incrementally
columns = ['workflow_id', 'path', 'value']
batch_size = 500  # Process files in batches of this size

# Clear previous log file if it exists
if os.path.exists(log_file):
    os.remove(log_file)

# Start processing files
yaml_files = [f for f in os.listdir(extracted_path)]
total_files = len(yaml_files)
processed_files = 0

start_time = time.time()

# Process files in batches
for i in range(0, total_files, batch_size):
    batch_files = yaml_files[i:i + batch_size]
    batch_data = []

    for file in batch_files:
        file_path = os.path.join(extracted_path, file)
        workflow_id = file.split('.')[0]  # Assuming the file name is the workflow ID

        try:
            with open(file_path, 'r') as f:
                workflow_data = yaml.load(f)
                paths_and_values = extract_paths(workflow_data)

                # Add each path-value pair to the batch data
                for path, value in paths_and_values:
                    batch_data.append({'workflow_id': workflow_id, 'path': path, 'value': value})
        except Exception as e:
            # Log errors with details about problematic files
            with open(log_file, 'a') as log:
                log.write(f"Error processing file {file}: {str(e)}\n")
            continue

    # Convert batch data to DataFrame and append to parquet file incrementally
    if batch_data:
        df_batch = pd.DataFrame(batch_data, columns=columns)
        
        # Ensure all values in both 'path' and 'value' columns are strings
        df_batch['path'] = df_batch['path'].astype(str)
        df_batch['value'] = df_batch['value'].astype(str)
        
        # Fill NaN values with empty strings
        df_batch = df_batch.fillna('')
        
        table = pa.Table.from_pandas(df_batch)

    
        if not os.path.exists(output_file):
            pq.write_table(table, output_file)
    else:
        # Append to existing Parquet file
        with pq.ParquetWriter(output_file, table.schema, use_dictionary=True) as writer:
            writer.write_table(table)

    processed_files += len(batch_files)
    print(f"Processed {processed_files}/{total_files} files...")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Processing completed!")
print(f"Total number of workflows processed: {processed_files}")
print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Errors logged in: {log_file}")


Processed 500/2313606 files...
Processed 1000/2313606 files...
Processed 1500/2313606 files...
Processed 2000/2313606 files...
Processed 2500/2313606 files...
Processed 3000/2313606 files...
Processed 3500/2313606 files...
Processed 4000/2313606 files...
Processed 4500/2313606 files...
Processed 5000/2313606 files...
Processed 5500/2313606 files...
Processed 6000/2313606 files...
Processed 6500/2313606 files...
Processed 7000/2313606 files...
Processed 7500/2313606 files...
Processed 8000/2313606 files...
Processed 8500/2313606 files...
Processed 9000/2313606 files...
Processed 9500/2313606 files...
Processed 10000/2313606 files...
Processed 10500/2313606 files...
Processed 11000/2313606 files...
Processed 11500/2313606 files...
Processed 12000/2313606 files...
Processed 12500/2313606 files...
Processed 13000/2313606 files...
Processed 13500/2313606 files...
Processed 14000/2313606 files...
Processed 14500/2313606 files...
Processed 15000/2313606 files...
Processed 15500/2313606 files.

found duplicate anchor '&'
first occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/03ff01de6c918a19dfcc18d8914d54ef5ccc84f9e56a8d79a4e100a053227f5c", line 79, column 26
second occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/03ff01de6c918a19dfcc18d8914d54ef5ccc84f9e56a8d79a4e100a053227f5c", line 85, column 26
  item_value = self.compose_node(node, item_key)
found duplicate anchor '&'
first occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/03ff01de6c918a19dfcc18d8914d54ef5ccc84f9e56a8d79a4e100a053227f5c", line 85, column 26
second occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/03ff01de6c918a19dfcc18d8914d54ef5ccc84f9e56a8d79a4e100a053227f5c", line 91, column 26
  item_value = self.compose_node(node, item_key)


Processed 607500/2313606 files...
Processed 608000/2313606 files...
Processed 608500/2313606 files...
Processed 609000/2313606 files...
Processed 609500/2313606 files...
Processed 610000/2313606 files...
Processed 610500/2313606 files...
Processed 611000/2313606 files...
Processed 611500/2313606 files...
Processed 612000/2313606 files...
Processed 612500/2313606 files...
Processed 613000/2313606 files...
Processed 613500/2313606 files...
Processed 614000/2313606 files...
Processed 614500/2313606 files...
Processed 615000/2313606 files...
Processed 615500/2313606 files...
Processed 616000/2313606 files...
Processed 616500/2313606 files...
Processed 617000/2313606 files...
Processed 617500/2313606 files...
Processed 618000/2313606 files...
Processed 618500/2313606 files...
Processed 619000/2313606 files...
Processed 619500/2313606 files...
Processed 620000/2313606 files...
Processed 620500/2313606 files...
Processed 621000/2313606 files...
Processed 621500/2313606 files...
Processed 6220

found duplicate anchor 'goreleaser_version'
first occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/a845a7ea2931c0fbe40f75f9d277213b5ba4846f96b05047dbb73b49f238aa66", line 37, column 20
second occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/a845a7ea2931c0fbe40f75f9d277213b5ba4846f96b05047dbb73b49f238aa66", line 73, column 20
  item_value = self.compose_node(node, item_key)


Processed 1924000/2313606 files...
Processed 1924500/2313606 files...
Processed 1925000/2313606 files...
Processed 1925500/2313606 files...
Processed 1926000/2313606 files...
Processed 1926500/2313606 files...
Processed 1927000/2313606 files...
Processed 1927500/2313606 files...
Processed 1928000/2313606 files...
Processed 1928500/2313606 files...
Processed 1929000/2313606 files...
Processed 1929500/2313606 files...
Processed 1930000/2313606 files...
Processed 1930500/2313606 files...
Processed 1931000/2313606 files...
Processed 1931500/2313606 files...
Processed 1932000/2313606 files...
Processed 1932500/2313606 files...
Processed 1933000/2313606 files...
Processed 1933500/2313606 files...
Processed 1934000/2313606 files...
Processed 1934500/2313606 files...
Processed 1935000/2313606 files...
Processed 1935500/2313606 files...
Processed 1936000/2313606 files...
Processed 1936500/2313606 files...
Processed 1937000/2313606 files...
Processed 1937500/2313606 files...
Processed 1938000/23

found duplicate anchor 'goreleaser_version'
first occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/567b0b489187b00c89d15c505fbd76e5f765c9680c8858f184c4ec212bda189c", line 36, column 20
second occurrence   in "/Users/aref/Desktop/PhD/Datasets/workflows/workflows/567b0b489187b00c89d15c505fbd76e5f765c9680c8858f184c4ec212bda189c", line 72, column 20
  item_value = self.compose_node(node, item_key)


Processed 2291000/2313606 files...
Processed 2291500/2313606 files...
Processed 2292000/2313606 files...
Processed 2292500/2313606 files...
Processed 2293000/2313606 files...
Processed 2293500/2313606 files...
Processed 2294000/2313606 files...
Processed 2294500/2313606 files...
Processed 2295000/2313606 files...
Processed 2295500/2313606 files...
Processed 2296000/2313606 files...
Processed 2296500/2313606 files...
Processed 2297000/2313606 files...
Processed 2297500/2313606 files...
Processed 2298000/2313606 files...
Processed 2298500/2313606 files...
Processed 2299000/2313606 files...
Processed 2299500/2313606 files...
Processed 2300000/2313606 files...
Processed 2300500/2313606 files...
Processed 2301000/2313606 files...
Processed 2301500/2313606 files...
Processed 2302000/2313606 files...
Processed 2302500/2313606 files...
Processed 2303000/2313606 files...
Processed 2303500/2313606 files...
Processed 2304000/2313606 files...
Processed 2304500/2313606 files...
Processed 2305000/23

In [5]:
pwd

'/Users/aref/Desktop/PhD/Codes/github-workflows-analysis'

In [2]:
"""
Experimenting: getting number of distinct workflows ...

"""

import pandas as pd

# Load the Parquet file
df = pd.read_parquet('workflows_data.parquet')

# Group by 'workflow_id' and count unique workflows
unique_workflows = df['workflow_id'].nunique()

# Get total number of rows and columns
total_rows = len(df)
total_columns = len(df.columns)

# Display the information
print(f"Total number of unique workflows: {unique_workflows}")
print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")

# If you want to see the first few rows of the DataFrame
print(df.head())


Total number of unique workflows: 491
Total number of rows: 40011
Total number of columns: 3
                                         workflow_id                     path  \
0  3d193364f960e446c7ead86ea75d537054a5dd25a5cc54...                     name   
1  3d193364f960e446c7ead86ea75d537054a5dd25a5cc54...      on.push.branches[0]   
2  3d193364f960e446c7ead86ea75d537054a5dd25a5cc54...        jobs.lint.runs-on   
3  3d193364f960e446c7ead86ea75d537054a5dd25a5cc54...  jobs.lint.steps[0].name   
4  3d193364f960e446c7ead86ea75d537054a5dd25a5cc54...  jobs.lint.steps[0].uses   

                     value  
0  darts PR merge workflow  
1                   master  
2            ubuntu-latest  
3      1. Clone repository  
4      actions/checkout@v2  


In [2]:
import os
from ruamel.yaml import YAML
from pathlib import Path


    # Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    paths = []
    if isinstance(data, dict):
        for key, value in data.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            if isinstance(value, (dict, list)):
                paths.extend(extract_paths(value, new_prefix))
            else:
                paths.append((new_prefix, str(value)))
    elif isinstance(data, list):
        for index, item in enumerate(data):
            new_prefix = f"{prefix}[{index}]"
            if isinstance(item, (dict, list)):
                paths.extend(extract_paths(item, new_prefix))
            else:
                paths.append((new_prefix, str(item)))
    return paths

    

# Set the path to the workflows directory
extracted_path = '/Users/aref/Desktop/PhD/Datasets/workflows/workflows/'

# Get the first YAML file in the directory
yaml_files = [f for f in os.listdir(extracted_path)]
if not yaml_files:
    print("No YAML files found in the directory.")
    exit()

first_file = yaml_files[0]
file_path = os.path.join(extracted_path, first_file)

# Parse the YAML file
yaml = YAML()
with open(file_path, 'r') as file:
    workflow_data = yaml.load(file)

# Extract paths and values
paths_and_values = extract_paths(workflow_data)

# Print the results
print(f"Paths and Values for {first_file}:")
for path, value in paths_and_values:
    print(f"{path}: {value}")


Paths and Values for 3d193364f960e446c7ead86ea75d537054a5dd25a5cc5467cbac1c066ed27cc1:
name: darts PR merge workflow
on.push.branches[0]: master
jobs.lint.runs-on: ubuntu-latest
jobs.lint.steps[0].name: 1. Clone repository
jobs.lint.steps[0].uses: actions/checkout@v2
jobs.lint.steps[1].name: 2. Set up Python 3.10
jobs.lint.steps[1].uses: actions/setup-python@v1
jobs.lint.steps[1].with.python-version: 3.10
jobs.lint.steps[2].name: 3. Cache gradle distribution
jobs.lint.steps[2].uses: actions/cache@v2
jobs.lint.steps[2].with.path: ~/.gradle/wrapper/dists
jobs.lint.steps[2].with.key: tests-${{ runner.os }}-gradle-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties') }}
jobs.lint.steps[3].name: 3.1 Cache gradle packages
jobs.lint.steps[3].uses: actions/cache@v2
jobs.lint.steps[3].with.path: ~/.gradle/caches
jobs.lint.steps[3].with.key: tests-${{ runner.os }}-gradle-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties', 'build.gradle') }}
jobs.lint.steps[4].name: 4. Lint
jobs.lint.st