# In [None]:
import os
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt

def count_degrees(file_path):
    """
    Compute degree statistics for one Matrix Market coordinate (edge list) file.

    Lines starting with '%' are treated as comments and blank lines are ignored.
    The first remaining line is taken to be the size line and is skipped; every
    later line is read as an edge ``u v [weight]`` and adds 1 to the degree of
    both endpoints (so a self-loop adds 2 to its node). Lines that cannot be
    parsed are reported with ``print`` and skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A tuple ``(max_degree, min_degree, avg_degree, max_degree_node,
        avg_degree_node)``, or ``None`` when the file yields no parsable edge.
        ``max_degree_node`` is the first node seen that has the maximum degree.
        ``avg_degree_node`` is the first node seen whose degree equals the
        rounded average; when no node has exactly that degree, the first node
        whose degree is closest to the rounded average is returned instead.
    """
    degree_dict = defaultdict(int)
    size_line_seen = False  # Becomes True once the matrix size line is consumed

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            # Ignore blank lines and comment/metadata lines. Blank lines must
            # not be mistaken for the size line, otherwise the real size line
            # would later be counted as an edge.
            if not line or line.startswith('%'):
                continue

            parts = line.split()

            # The first non-comment, non-blank line is the size line: skip it.
            if not size_line_seen:
                size_line_seen = True
                if len(parts) != 3:
                    print(f"Unexpected format in size line at {file_path}:{line_number}. Expected 3 elements.")
                continue

            if len(parts) < 2:
                print(f"Skipping invalid line in {file_path}:{line_number}: {line}")
                continue

            # Parse both endpoints before touching the counts so that a bad
            # second field does not leave a half-counted edge behind.
            try:
                u = int(parts[0])
                v = int(parts[1])
            except ValueError as e:
                print(f"Error parsing line {line_number} in {file_path}: {e}")
                continue

            degree_dict[u] += 1
            degree_dict[v] += 1

    if not degree_dict:
        print(f"No valid edge data found in {file_path}.")
        return None

    # Calculate max, min, and average degrees
    degrees = degree_dict.values()
    max_degree = max(degrees)
    min_degree = min(degrees)
    avg_degree = sum(degrees) / len(degrees)

    # Find one node with the maximum degree (the first one in insertion order)
    max_degree_node = next(node for node, degree in degree_dict.items() if degree == max_degree)

    # Find one node representing the average degree. min() returns the first
    # minimal element, so an exact match with the rounded average wins when one
    # exists; otherwise the nearest degree is used instead of raising
    # StopIteration (e.g. a star graph has no node of average degree).
    avg_degree_rounded = round(avg_degree)
    avg_degree_node = min(
        degree_dict,
        key=lambda node: abs(degree_dict[node] - avg_degree_rounded),
    )

    return max_degree, min_degree, avg_degree, max_degree_node, avg_degree_node


def process_single_directory(directory_path):
    """
    Collect degree statistics for every ``.mtx`` file directly inside a directory.

    Each matching file is handed to ``count_degrees``. Files for which it
    returns nothing are reported and left out of the result.

    Returns a dict mapping file name to a dict with the keys ``max_degree``,
    ``min_degree``, ``avg_degree``, ``max_degree_node`` and ``avg_degree_node``.
    """
    stats_by_file = {}

    for entry in os.listdir(directory_path):
        # Only Matrix Market files are of interest
        if not entry.endswith(".mtx"):
            continue

        print(f"Processing {entry}...")
        info = count_degrees(os.path.join(directory_path, entry))

        if not info:
            print(f"Skipping file {entry} due to lack of valid data.")
            continue

        highest, lowest, mean, highest_node, mean_node = info
        stats_by_file[entry] = dict(
            max_degree=highest,
            min_degree=lowest,
            avg_degree=mean,
            max_degree_node=highest_node,  # one node having the max degree
            avg_degree_node=mean_node,     # one node having the average degree
        )

    return stats_by_file


def write_to_csv(results, output_file):
    """
    Dump per-file degree statistics to ``output_file`` as CSV.

    ``results`` maps a file name to a dict of statistics. One header row is
    written, followed by one row per entry in iteration order.
    """
    import csv

    stat_columns = (
        'max_degree',
        'min_degree',
        'avg_degree',
        'max_degree_node',
        'avg_degree_node',
    )
    header = ['filename', *stat_columns]

    with open(output_file, 'w', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=header)
        out.writeheader()

        for name, stats in results.items():
            # Copy only the known columns; any extra keys in stats are ignored
            row = {'filename': name}
            for column in stat_columns:
                row[column] = stats[column]
            out.writerow(row)


def process_directory(directory):
    """
    Compute degree statistics for a directory and save them as a CSV and a bar chart.

    The statistics come from ``process_single_directory``. When it returns
    nothing, a message is printed and no files are written. Otherwise the
    results are written to ``degree_statistics_<name>.csv`` and a log-scale bar
    chart of min/max/avg degree per file is saved to
    ``degree_statistics_<name>.png``. Both are relative paths, so they land in
    the current working directory. ``<name>`` is the last component of
    ``directory``. The chart is also shown with ``plt.show()``.

    Args:
        directory: Path of the directory to process. A trailing path separator
            is tolerated.
    """
    print(f"\nProcessing directory: {directory}")

    # os.path.basename returns '' for a path ending in a separator, which would
    # produce 'degree_statistics_.csv'; normpath strips the trailing separator.
    dir_name = os.path.basename(os.path.normpath(directory))

    # Output CSV and PNG file names based on the directory name
    output_csv = f'degree_statistics_{dir_name}.csv'
    output_png = f'degree_statistics_{dir_name}.png'

    # Process all .mtx files in the directory
    degree_stats = process_single_directory(directory)

    if not degree_stats:
        print("No valid degree statistics to write.")
        return

    # Write the results to a CSV file
    write_to_csv(degree_stats, output_csv)

    # Build a frame with one row per file (the dict of dicts is keyed by file
    # name, so it is transposed after construction)
    df = pd.DataFrame(degree_stats).T
    df.index.name = 'filename'

    # Keep only the plotted columns, in the order matching the colors below
    df = df[['min_degree', 'max_degree', 'avg_degree']]

    fig, ax = plt.subplots(figsize=(12, 8))
    df.plot(kind='bar', ax=ax, color=['skyblue', 'salmon', 'limegreen'])

    # Adding labels and title
    ax.set_ylabel('Degree')
    ax.set_title(f'Min, Max, and Avg Degree for Each File in {dir_name}')
    ax.set_xlabel('File')

    # Rotate the x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Set y-axis to a logarithmic scale for better representation
    ax.set_yscale('log')

    # Adding a legend
    plt.legend(title='Degree Type')

    # Save before showing the plot
    plt.tight_layout()
    plt.savefig(output_png)  # Save the plot to a PNG file
    plt.show()

    print(f"Results written to {output_csv} and plot saved to {output_png}")



# Directory holding the .mtx files to analyze. This is a hard-coded absolute
# path; change it to point at the data set you want to process.
directory = '/lustre/orion/gen150/world-shared/abby-summer24/hipmcldatasets/mtxfiles'

# Runs as soon as this cell/script executes: processes the directory above and
# generates the CSV and PNG files named after it.
process_directory(directory)
