In [1]:
import os
from collections import defaultdict
import pandas as pd

def count_degrees(file_path):
    """
    Process a single Matrix Market edge list file and return a dictionary of exact degree counts.
    """
    degree_dict = defaultdict(int)
    matrix_size_skipped = False  # Flag to skip the matrix size line

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('%') or not line:
                continue  # Skip comments and empty lines

            # Skip the first non-comment line with matrix size information
            if not matrix_size_skipped:
                matrix_size_skipped = True
                continue

            # Update the degrees for both nodes
            u, v = map(int, line.split()[:2])
            degree_dict[u] += 1
            degree_dict[v] += 1

    # Count how many nodes have each specific degree
    degree_counts = defaultdict(int)
    for degree in degree_dict.values():
        degree_counts[degree] += 1

    # Ensure degrees with zero nodes are counted (e.g., 0-degree nodes)
    max_degree = max(degree_counts.keys()) if degree_counts else 0
    for i in range(max_degree + 1):
        if i not in degree_counts:
            degree_counts[i] = 0

    return degree_counts

def process_main_directory_to_csv(main_directory, output_csv):
    """
    Process each subdirectory within the main directory and save exact degree distributions to a CSV.
    Each subdirectory represents a domain.
    """
    results = []

    # Loop through each subdirectory (domain)
    for domain_name in os.listdir(main_directory):
        domain_path = os.path.join(main_directory, domain_name)
        if not os.path.isdir(domain_path):
            continue  # Skip non-directory files

        # Process each file in the subdirectory
        for filename in os.listdir(domain_path):
            if filename.endswith(".mtx"):
                file_path = os.path.join(domain_path, filename)
                print(f"Processing {filename} in domain {domain_name}...")

                degree_counts = count_degrees(file_path)

                for degree, count in sorted(degree_counts.items()):
                    results.append({
                        'filename': filename,
                        'domain': domain_name,
                        'degree': degree,
                        'count': count
                    })

    # Write results to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"Exact degree distribution data saved to {output_csv}")

# Run the function on the main directory containing all domain subdirectories
main_directory = '/lustre/orion/gen150/world-shared/abby-summer24/nawsdatasets/degrees'  # Replace with the path to your main directory
output_csv = 'exact_degree_distributions.csv'
process_main_directory_to_csv(main_directory, output_csv)


Processing webbase-1M.mtx in domain Web...
Processing uk-2002.mtx in domain Web...


KeyboardInterrupt: 