In [None]:
import pandas as pd

# Location of the gzip-compressed workflow metadata CSV (adjust if necessary)
metadata_path = './data-raw/Workflows-Metadata/workflows.csv.gz'

# Load the complete metadata table, then preview its first five rows
df_workflows = pd.read_csv(metadata_path, compression='gzip')
df_workflows.head(n=5)


In [None]:
# Row count of the metadata table next to the number of distinct `uid` values
print(f"Total number of workflows: {len(df_workflows)} ")
print(f"Unique number of workflows: {df_workflows['uid'].nunique()}")

In [None]:
import pandas as pd

import os

# Adjust the paths if necessary
metadata_path = './data-raw/Workflows-Metadata/workflows.csv.gz'
output_valid_hashes = './data/valid_workflow_hashes.parquet'

# Load the metadata dataset
df_workflows = pd.read_csv(metadata_path, compression='gzip')

# Keep only the rows flagged as valid. The explicit comparison is kept on
# purpose: it evaluates to False for missing values, whereas using the column
# itself as a mask would fail if it contained any.
df_valid_workflows = df_workflows[df_workflows['valid_workflow'] == True]  # noqa: E712

# Extract the unique file hashes of valid workflows
valid_workflow_hashes = df_valid_workflows['file_hash'].unique()

df_valid_hashes = pd.DataFrame(valid_workflow_hashes, columns=['file_hash'])

# Make sure the target directory exists so the export also works on a fresh
# checkout (skipped when the output path has no directory component).
output_dir = os.path.dirname(output_valid_hashes)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_valid_hashes.to_parquet(output_valid_hashes, index=False)

# At this point, we have an array of file_hashes that correspond to valid workflows.

print(f"Number of valid workflows: {len(valid_workflow_hashes)} out of: {len(df_workflows)} workflows")
print("Example of valid workflow hashes:", valid_workflow_hashes[:5])


 We can then use `valid_workflow_hashes` in the next steps to:
 1. Load only those workflows from Workflow Files.
 2. Extract (path, value) pairs from them.
 3. Store the results.

In [None]:
import os
import pandas as pd
from ruamel.yaml import YAML
import time
import pyarrow.parquet as pq
import pyarrow as pa

# ------------------------------------------------------------------------------
# 0. CONFIGURATION
# ------------------------------------------------------------------------------
# Directory holding the workflow files
extracted_path = './data-raw/Workflow-Files/workflows'

# Parquet file with the valid workflow hashes (single column: 'file_hash')
valid_hashes_parquet = './data/valid_workflow_hashes.parquet'

# Destination of the extracted table and name of the error log
output_file = './data/workflows_data.parquet'
log_file = 'error_log.txt'

# Number of workflow files handled per batch
batch_size = 500

# Column layout of the final output table
columns = ['workflow_hash', 'path', 'value']

# ------------------------------------------------------------------------------
# 1. LOAD VALID WORKFLOW HASHES
# ------------------------------------------------------------------------------
df_valid_hashes = pd.read_parquet(valid_hashes_parquet)

# A set gives constant-time membership tests when filtering file names later
valid_hashes = {file_hash for file_hash in df_valid_hashes['file_hash']}
print(f"{len(valid_hashes)} valid workflow files")

# ------------------------------------------------------------------------------
# 2. YAML PARSER INITIALIZATION
# ------------------------------------------------------------------------------
# Safe, pure-Python loader; duplicate mapping keys are explicitly kept disallowed
yaml = YAML(typ='safe', pure=True)
yaml.allow_duplicate_keys = False

# ------------------------------------------------------------------------------
# 3. FUNCTION TO RECURSIVELY EXTRACT (PATH, VALUE) PAIRS (Based on DFS algorithm)
# ------------------------------------------------------------------------------
def extract_paths(data, prefix=''):
    """Flatten nested dicts/lists into ``(path, value)`` string pairs.

    The structure is walked depth-first in iteration order. Dictionary keys are
    joined with ``.`` and list positions are written as ``[index]``. Only
    non-container leaves produce a pair, so empty dicts and lists contribute
    nothing, and a ``data`` that is neither a dict nor a list yields ``[]``.

    Args:
        data: Parsed document (dict, list, or anything else).
        prefix: Path of ``data`` itself; empty for the document root.

    Returns:
        list[tuple[str, str]]: One ``(path, value)`` pair per leaf, both
        converted with ``str()``.
    """
    def _walk(node, trail):
        # Pair every direct child with the path that leads to it.
        if isinstance(node, dict):
            children = [
                (f"{trail}.{key}" if trail else key, child)
                for key, child in node.items()
            ]
        elif isinstance(node, list):
            children = [
                (f"{trail}[{position}]", child)
                for position, child in enumerate(node)
            ]
        else:
            return

        for child_trail, child in children:
            if isinstance(child, (dict, list)):
                # Containers are descended into, never emitted themselves.
                yield from _walk(child, child_trail)
            else:
                yield str(child_trail), str(child)

    return list(_walk(data, prefix))

# ------------------------------------------------------------------------------
# 4. FILTER AND PROCESS ONLY VALID WORKFLOW FILES
# ------------------------------------------------------------------------------
# Since filenames are exactly the workflow_hash (no extensions), we can simply filter by membership in valid_hashes.
all_files = os.listdir(extracted_path)
yaml_files = [f for f in all_files if f in valid_hashes]

total_files = len(yaml_files)
processed_files = 0

if total_files == 0:
    print("No Valid Workflow has been found!")

# Start every run with a fresh error log
if os.path.exists(log_file):
    os.remove(log_file)

# One DataFrame per batch is collected here and concatenated a single time
# after the loop. Re-concatenating a growing frame on every batch copies all
# previously gathered rows again and leaves repeated index labels behind.
batch_frames = []

start_time = time.time()

# Process in batches
for i in range(0, total_files, batch_size):
    batch_files = yaml_files[i : i + batch_size]
    batch_data = []  # (workflow_hash, path, value) tuples of this batch

    for file in batch_files:
        file_path = os.path.join(extracted_path, file)

        # In this case, 'file' is the workflow hash
        workflow_hash = file

        try:
            # Explicit encoding so decoding does not depend on the platform's
            # default locale (workflow files are presumably UTF-8 - confirm).
            # `yaml` is the parser configured earlier in this cell.
            with open(file_path, 'r', encoding='utf-8') as f:
                workflow_data = yaml.load(f)

            paths_and_values = extract_paths(workflow_data)

            # Add each path-value pair to the batch data
            batch_data.extend(
                (workflow_hash, path, value) for path, value in paths_and_values
            )

        except Exception as e:
            # Best effort: record the problematic file and keep going
            error_message = f"Error processing file {file}: {str(e)}\n"
            with open(log_file, 'a', encoding='utf-8') as log:
                log.write(error_message)
            print(error_message)
            continue

    if batch_data:
        df_batch = pd.DataFrame(batch_data, columns=columns)

        # Ensure the 'path' and 'value' columns are strings. After this cast no
        # missing values can remain, so no additional fillna step is needed.
        df_batch['path'] = df_batch['path'].astype(str)
        df_batch['value'] = df_batch['value'].astype(str)

        batch_frames.append(df_batch)

    processed_files += len(batch_files)
    print(f"Processed {processed_files}/{total_files} valid workflow files...")


# ------------------------------------------------------------------------------
# 5. CONVERT all_data TO PARQUET FILE AFTER PROCESSING
# ------------------------------------------------------------------------------

# Single concatenation with a fresh 0..n-1 index; fall back to an empty frame
# with the expected columns when nothing was extracted.
if batch_frames:
    all_data = pd.concat(batch_frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=columns)

# The index carries no information, so it is not written to the file
all_data_table = pa.Table.from_pandas(all_data, preserve_index=False)
final_output_file = output_file

# Write the entire all_data DataFrame to a new Parquet file
pq.write_table(all_data_table, final_output_file)

# Check number of unique workflows in the new Parquet file
df_final_check = pd.read_parquet(final_output_file)
unique_workflows_final_check = df_final_check['workflow_hash'].nunique()
print(f"Total number of unique workflows in final Parquet: {unique_workflows_final_check}")

end_time = time.time()
elapsed_time = end_time - start_time

print("Processing completed!")
print(f"Total valid workflow files processed: {processed_files}")
print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Errors logged in: {log_file}")


In [None]:
pwd

In [None]:
# Distinct workflow hashes in the in-memory `all_data` frame (missing values excluded)
all_data['workflow_hash'].nunique(dropna=True)

In [None]:
"""
Experimenting: getting number of distinct workflows ...

"""

import pandas as pd

# Load the extracted path/value table
df = pd.read_parquet('./data/workflows_data.parquet')

# Count the distinct workflow hashes present in the table
unique_workflows = df['workflow_hash'].nunique()

# Table dimensions: (number of rows, number of columns)
total_rows, total_columns = df.shape

# Report the three figures
print(f"Total number of unique workflows: {unique_workflows}")
print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")



In [None]:
# Preview the first 20 rows of the path/value table held in `df`
df.head(n=20)

In [None]:
import pandas as pd

# Load the extracted path/value table
df = pd.read_parquet('./data/workflows_data.parquet')

# Flag every row whose workflow hash also appears on at least one other row.
# NOTE(review): the extraction step writes one row per (path, value) pair, so
# any workflow with two or more pairs is flagged here. Presumably a check on
# ('workflow_hash', 'path') combinations was intended - confirm.
duplicate_hashes = df['workflow_hash'].duplicated(keep=False)
duplicates = df.loc[duplicate_hashes]

print(f"Number of duplicate rows: {len(duplicates)}")
# Number of flagged rows per workflow hash
print(duplicates['workflow_hash'].value_counts())


In [None]:
import os
from ruamel.yaml import YAML
from pathlib import Path


    # Function to extract paths and values from YAML data
def extract_paths(data, prefix=''):
    """Flatten nested YAML data into a list of ``(path, value)`` string pairs.

    Dictionaries and lists are traversed depth-first. Dictionary keys are
    joined with ``.`` and list positions are written as ``[index]``. Only
    non-container leaves are reported, so empty dicts and lists produce no
    entry, and a ``data`` that is neither a dict nor a list yields ``[]``.

    Args:
        data: Parsed YAML content (dict, list, or anything else).
        prefix: Path of ``data`` itself; empty for the document root.

    Returns:
        list[tuple[str, str]]: One pair per leaf. Both elements are always
        strings, including the path of a top-level key that is not a string
        (for example an integer key).
    """
    paths = []

    # Pair every direct child with the path that leads to it.
    if isinstance(data, dict):
        children = [
            (f"{prefix}.{key}" if prefix else key, value)
            for key, value in data.items()
        ]
    elif isinstance(data, list):
        children = [
            (f"{prefix}[{index}]", item) for index, item in enumerate(data)
        ]
    else:
        children = []

    for child_path, child in children:
        if isinstance(child, (dict, list)):
            paths.extend(extract_paths(child, child_path))
        else:
            # str() on the path covers non-string keys at the document root
            paths.append((str(child_path), str(child)))
    return paths

    

# Directory with sample workflows. Set the TEST_WORKFLOWS_DIR environment
# variable to point elsewhere instead of editing this machine-specific default.
extracted_path = os.environ.get(
    'TEST_WORKFLOWS_DIR',
    '/Users/aref/Desktop/PhD/Codes/github-workflows-analysis/test_workflows/',
)

# Candidate files: regular, non-hidden directory entries in a deterministic
# order (os.listdir returns entries in arbitrary order and includes dotfiles).
yaml_files = sorted(
    f for f in os.listdir(extracted_path)
    if not f.startswith('.') and os.path.isfile(os.path.join(extracted_path, f))
)

if not yaml_files:
    # Nothing to inspect; report it without terminating the kernel
    print("No YAML files found in the directory.")
else:
    # Take the first candidate, which also works for a single-file directory
    first_file = yaml_files[0]
    file_path = os.path.join(extracted_path, first_file)

    # Parse the YAML file
    yaml = YAML()
    with open(file_path, 'r', encoding='utf-8') as file:
        workflow_data = yaml.load(file)

    # Extract paths and values
    paths_and_values = extract_paths(workflow_data)

    # Print the results
    print(f"Paths and Values for {first_file}:")
    for path, value in paths_and_values:
        print(f"{path}: {value}")
