In [None]:
import os
from collections import defaultdict
import csv

def count_degree_bins(file_path):
    """
    Process a single Matrix Market edge list file and return degree counts for specified bins.

    Lines starting with '%' (banner/comments) and blank lines are ignored. The
    first remaining line is treated as the size line ("rows cols [nnz]"); every
    later line with at least two fields is read as an edge "u v [...]" and adds
    one to the degree of both endpoints (so a self-loop adds two).

    Nodes that never appear on an edge line are counted in the '0' bin using the
    dimensions declared on the size line. This assumes node ids are 1-based and
    that rows and columns share one id space, as for a graph adjacency matrix.

    Args:
        file_path: Path to the .mtx file to read.

    Returns:
        A dict mapping the bin labels '0', '1', '2-10', '11-100', '101-1000',
        '1001-10000' and '10001+' to the number of nodes in each bin, or None
        if the file contains no parseable edge line.
    """
    degree_dict = defaultdict(int)
    size_line_seen = False
    declared_nodes = 0  # Node count taken from the size line; 0 if it cannot be read

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            # Skip blank lines and comment/metadata lines (starting with '%').
            # Blank lines must be dropped here so that one placed before the
            # size line is not mistaken for the size line itself.
            if not line or line.startswith('%'):
                continue

            # The first data-bearing line declares the matrix dimensions.
            if not size_line_seen:
                size_line_seen = True
                size_fields = line.split()
                try:
                    declared_nodes = max(int(size_fields[0]), int(size_fields[1]))
                except (IndexError, ValueError):
                    # Unreadable size line: isolated nodes cannot be counted,
                    # so the '0' bin stays at zero for this file.
                    declared_nodes = 0
                continue

            parts = line.split()
            if len(parts) < 2:
                continue

            try:
                # Process the actual data lines (node1, node2)
                u = int(parts[0])
                v = int(parts[1])
            except ValueError as e:
                print(f"Error parsing line {line_number} in {file_path}: {e}")
                continue

            # Update the degrees for both nodes
            degree_dict[u] += 1
            degree_dict[v] += 1

    if not degree_dict:
        print(f"No valid edge data found in {file_path}.")
        return None

    # Initialize bins
    degree_bins = {
        '0': 0,
        '1': 0,
        '2-10': 0,
        '11-100': 0,
        '101-1000': 0,
        '1001-10000': 0,
        '10001+': 0
    }

    # Isolated nodes are the declared ids (1..declared_nodes) that never showed
    # up on an edge line. Ids outside the declared range are not subtracted.
    seen_declared = sum(1 for node in degree_dict if 1 <= node <= declared_nodes)
    degree_bins['0'] = declared_nodes - seen_declared

    # Classify degree counts into bins. Every entry in degree_dict has a
    # degree of at least 1, so only the upper bound of each bin is tested.
    for degree in degree_dict.values():
        if degree == 1:
            degree_bins['1'] += 1
        elif degree <= 10:
            degree_bins['2-10'] += 1
        elif degree <= 100:
            degree_bins['11-100'] += 1
        elif degree <= 1000:
            degree_bins['101-1000'] += 1
        elif degree <= 10000:
            degree_bins['1001-10000'] += 1
        else:
            degree_bins['10001+'] += 1

    return degree_bins

def process_main_directory(main_directory, output_csv):
    """
    Collect degree-bin statistics for every .mtx file one level below main_directory.

    Each immediate subdirectory of main_directory is treated as a domain. Every
    file in it whose name ends with ".mtx" is passed to count_degree_bins; files
    for which that returns a falsy result are reported and left out. All rows
    are written to output_csv (overwriting it) with a header row.
    """
    columns = ['filename', 'domain', '0', '1', '2-10', '11-100', '101-1000', '1001-10000', '10001+']
    rows = []

    for domain_name in os.listdir(main_directory):
        domain_path = os.path.join(main_directory, domain_name)
        # Only subdirectories count as domains; plain files are ignored.
        if os.path.isdir(domain_path):
            for filename in os.listdir(domain_path):
                if not filename.endswith(".mtx"):
                    continue

                print(f"Processing {filename} in domain {domain_name}...")
                bins = count_degree_bins(os.path.join(domain_path, filename))

                if not bins:
                    print(f"Skipping file {filename} due to lack of valid data.")
                    continue

                row = {'filename': filename, 'domain': domain_name}
                row.update(bins)
                rows.append(row)

    # Emit one CSV row per successfully processed file.
    with open(output_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)

    print(f"Degree bin statistics for all domains saved to {output_csv}")

# Driver: runs as soon as this cell/module is executed.
# main_directory must contain one subdirectory per domain, each holding .mtx files.
main_directory = '/lustre/orion/gen150/world-shared/abby-summer24/nawsdatasets/degrees'  # Site-specific path; replace with the path to your main directory
# Output CSV path; relative, so it is created in the current working directory.
output_csv = 'all_degree_bin_statistics.csv'
process_main_directory(main_directory, output_csv)


Processing webbase-2001.mtx in domain Web...
