In [47]:
import os
import pandas as pd  # Import pandas for table representation

# Only the MPI run directory is scanned in this cell
subfolders = ['parallel_mpi/logs']

# Raw log text, grouped by the folder it was found in
logs = {}

# One flat record per log file whose name could be parsed
extracted_data = []

# Walk every folder recursively and pick up the captured program output
for folder in subfolders:
    logs[folder] = []
    for root, _, files in os.walk(folder):
        for log_name in files:
            # Anything that is not a "...output.log" file is ignored
            if not log_name.endswith('output.log'):
                continue

            file_path = os.path.join(root, log_name)
            with open(file_path, 'r') as handle:
                log_content = handle.read()  # whole file as a single string
            logs[folder].append(log_content)

            # Names look like matmul_parallel_mpi_<size>_<cores>_output.log,
            # so the size and core count sit at positions 3 and 4.
            name_parts = log_name.split('_')
            if len(name_parts) < 5:
                continue  # name does not follow the scheme; keep only the raw log

            matrix_size, core_number = name_parts[3:5]
            extracted_data.append({
                'Folder': folder,
                'Log File': log_name,
                'Matrix Size': matrix_size,
                'Core Number': core_number,
                'Log': log_content,  # raw log text goes in the last column
            })

# Tabulate the records
extracted_data_df_mpi = pd.DataFrame(extracted_data)

# Display the DataFrame
extracted_data_df_mpi

Unnamed: 0,Folder,Log File,Matrix Size,Core Number,Log
0,parallel_mpi/logs,matmul_parallel_mpi_1024_16_output.log,1024,16,MPI 1024x1024 matrix multiplication took 0.338...
1,parallel_mpi/logs,matmul_parallel_mpi_1024_1_output.log,1024,1,MPI 1024x1024 matrix multiplication took 3.302...
2,parallel_mpi/logs,matmul_parallel_mpi_1024_2_output.log,1024,2,MPI 1024x1024 matrix multiplication took 1.749...
3,parallel_mpi/logs,matmul_parallel_mpi_1024_32_output.log,1024,32,MPI 1024x1024 matrix multiplication took 0.555...
4,parallel_mpi/logs,matmul_parallel_mpi_1024_4_output.log,1024,4,MPI 1024x1024 matrix multiplication took 0.872...
5,parallel_mpi/logs,matmul_parallel_mpi_1024_8_output.log,1024,8,MPI 1024x1024 matrix multiplication took 0.470...
6,parallel_mpi/logs,matmul_parallel_mpi_16384_16_output.log,16384,16,\n============================================...
7,parallel_mpi/logs,matmul_parallel_mpi_16384_1_output.log,16384,1,
8,parallel_mpi/logs,matmul_parallel_mpi_16384_2_output.log,16384,2,
9,parallel_mpi/logs,matmul_parallel_mpi_16384_32_output.log,16384,32,\n============================================...


In [None]:
import os
import pandas as pd  # Import pandas for table representation

# Only the OpenMP run directory is scanned in this cell
subfolders = ['parallel_openmpi/logs']

# Raw log text, grouped by the folder it was found in
logs = {}

# One flat record per log file whose name could be parsed
extracted_data = []


def parse_log_name(file_name):
    """Return ``(matrix_size, core_number)`` parsed from a log file name.

    Log names follow ``<prefix>_<matrix size>_<cores>_output.log``, e.g.
    ``matmul_openmpi_1024_16_output.log``. The two fields are read from the
    right-hand end of the name, so the result does not depend on how many
    underscore-separated tokens the prefix has.

    Parameters
    ----------
    file_name : str
        Base name of the log file.

    Returns
    -------
    tuple of (str, str) or None
        The matrix size and core number as strings, or ``None`` when the
        name has fewer than five underscore-separated tokens.
    """
    tokens = file_name.split('_')
    if len(tokens) < 5:
        return None
    # tokens[-1] is "output.log"; the two fields before it are size and cores
    return tokens[-3], tokens[-2]


# Iterate through each subfolder and read log files
for folder in subfolders:
    logs[folder] = []
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith('output.log'):  # only captured program output
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r') as f:
                log_content = f.read()  # whole file as a single string
            logs[folder].append(log_content)

            parsed = parse_log_name(file)
            if parsed is None:
                continue  # name does not follow the scheme; keep only the raw log

            matrix_size, core_number = parsed
            extracted_data.append({
                'Folder': folder,
                'Log File': file,
                'Matrix Size': matrix_size,
                'Core Number': core_number,
                'Log': log_content  # raw log text goes in the last column
            })

# Convert the extracted data into a DataFrame
extracted_data_df_openmpi = pd.DataFrame(extracted_data)

# Display the DataFrame
extracted_data_df_openmpi

Unnamed: 0,Folder,Log File,Matrix Size,Core Number,Log
0,parallel_openmpi/logs,matmul_openmpi_1024_16_output.log,16,output.log,OpenMP outer-product matmul: N = 1024 (allocat...
1,parallel_openmpi/logs,matmul_openmpi_1024_2_output.log,2,output.log,OpenMP outer-product matmul: N = 1024 (allocat...
2,parallel_openmpi/logs,matmul_openmpi_1024_32_output.log,32,output.log,OpenMP outer-product matmul: N = 1024 (allocat...
3,parallel_openmpi/logs,matmul_openmpi_1024_4_output.log,4,output.log,OpenMP outer-product matmul: N = 1024 (allocat...
4,parallel_openmpi/logs,matmul_openmpi_1024_8_output.log,8,output.log,OpenMP outer-product matmul: N = 1024 (allocat...
5,parallel_openmpi/logs,matmul_openmpi_2048_16_output.log,16,output.log,
6,parallel_openmpi/logs,matmul_openmpi_2048_1_output.log,1,output.log,OpenMP outer-product matmul: N = 2048 (allocat...
7,parallel_openmpi/logs,matmul_openmpi_2048_2_output.log,2,output.log,
8,parallel_openmpi/logs,matmul_openmpi_2048_32_output.log,32,output.log,
9,parallel_openmpi/logs,matmul_openmpi_2048_8_output.log,8,output.log,OpenMP outer-product matmul: N = 2048 (allocat...


In [46]:
import re

# Pull the wall-clock time out of each MPI log ("... took <t> seconds ...").
# The frame is derived from extracted_data_df_mpi, the name the MPI loading cell
# actually assigns, so this cell also works after Restart & Run All.
# .assign returns a new frame: the source frame stays untouched and the cell is
# safe to re-run. Logs without a timing line (crashed or empty runs) get NaN.
extracted_data_df = extracted_data_df_mpi.assign(
    Seconds=extracted_data_df_mpi['Log']
    .str.extract(r'took ([\d.]+) seconds', expand=False)
    .astype(float)
)

# Display the updated DataFrame
extracted_data_df

Unnamed: 0,Folder,Log File,Matrix Size,Core Number,Log,Seconds
0,parallel_mpi/logs,matmul_parallel_mpi_1024_16_output.log,1024,16,MPI 1024x1024 matrix multiplication took 0.338...,0.338842
1,parallel_mpi/logs,matmul_parallel_mpi_1024_1_output.log,1024,1,MPI 1024x1024 matrix multiplication took 3.302...,3.30227
2,parallel_mpi/logs,matmul_parallel_mpi_1024_2_output.log,1024,2,MPI 1024x1024 matrix multiplication took 1.749...,1.74987
3,parallel_mpi/logs,matmul_parallel_mpi_1024_32_output.log,1024,32,MPI 1024x1024 matrix multiplication took 0.555...,0.555122
4,parallel_mpi/logs,matmul_parallel_mpi_1024_4_output.log,1024,4,MPI 1024x1024 matrix multiplication took 0.872...,0.872999
5,parallel_mpi/logs,matmul_parallel_mpi_1024_8_output.log,1024,8,MPI 1024x1024 matrix multiplication took 0.470...,0.47028
6,parallel_mpi/logs,matmul_parallel_mpi_16384_16_output.log,16384,16,\n============================================...,
7,parallel_mpi/logs,matmul_parallel_mpi_16384_1_output.log,16384,1,,
8,parallel_mpi/logs,matmul_parallel_mpi_16384_2_output.log,16384,2,,
9,parallel_mpi/logs,matmul_parallel_mpi_16384_32_output.log,16384,32,\n============================================...,


In [None]:
import re

# Extract seconds from the Log column ("... took <t> seconds ...").
# NOTE(review): `structured_data_df` is not assigned in any cell visible in this
# notebook, and the saved error refers to yet another name
# (`structured_extracted_df`). Likely candidates are `extracted_data_df_openmpi`
# or `structured_logs_df` — confirm which frame is intended and rename; until
# then this cell raises NameError on a fresh kernel.
# The regex is applied once per row via .str.extract; rows without a timing
# line (or with a missing log) become NaN instead of raising.
structured_data_df['Seconds'] = (
    structured_data_df['Log']
    .str.extract(r'took ([\d.]+) seconds', expand=False)
    .astype(float)
)

# Display the updated DataFrame
structured_data_df

NameError: name 'structured_extracted_df' is not defined

In [28]:
# Show the raw log text gathered per folder by the most recently run loading cell.
# NOTE(review): the saved output below holds a list of lines per file, whereas the
# loading cells visible in this notebook store one string per file (f.read()) —
# the output looks stale; re-run before relying on it.
logs

{'parallel_mpi/logs': [['MPI 1024x1024 matrix multiplication took 0.338842 seconds (max across ranks)\n'],
  ['MPI 1024x1024 matrix multiplication took 3.302270 seconds (max across ranks)\n'],
  ['MPI 1024x1024 matrix multiplication took 1.749870 seconds (max across ranks)\n'],
  ['MPI 1024x1024 matrix multiplication took 0.555122 seconds (max across ranks)\n'],
  ['MPI 1024x1024 matrix multiplication took 0.872999 seconds (max across ranks)\n'],
  ['MPI 1024x1024 matrix multiplication took 0.470280 seconds (max across ranks)\n'],
  ['\n',
   '=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES\n',
   '=   PID 24248 RUNNING AT hpc-c11-node08.unitn.it\n',
   '=   EXIT CODE: 139\n',
   '=   CLEANING UP REMAINING PROCESSES\n',
   '=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES\n',
   'YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)\n',
   'This typically refers to a problem with your application.\n',
   'Please see the FAQ page for debugging suggestion

In [30]:
# Show the per-file records that the structured-log cell iterates over.
# NOTE(review): `output_logs` is not assigned in any cell visible here; presumably
# it came from a since-removed cell — confirm and restore its definition.
output_logs

[{'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 0.338842 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 3.302270 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 1.749870 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 0.555122 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 0.872999 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 1024x1024 matrix multiplication took 0.470280 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 2048x2048 matrix multiplication took 6.729713 seconds (max across ranks)'},
 {'Folder': 'parallel_mpi/logs',
  'Log Content': 'MPI 2048x2048 matrix multiplicat

In [33]:
# Build one structured record per MPI timing line found in output_logs.
# NOTE(review): output_logs is not assigned in any cell visible here — confirm
# where it is built.
structured_logs = []
for log in output_logs:  # Iterate over output_logs instead of logs
    log_content = log['Log Content']
    folder = log['Folder']
    tokens = log_content.split()

    # Core count comes from the file name. MPI log names follow
    # matmul_parallel_mpi_<size>_<cores>_output.log, so after splitting on '_'
    # index 3 is the matrix size and index 4 is the core count.
    # NOTE(review): the saved output_logs entries only carry 'Folder' and
    # 'Log Content', so 'File Name' is absent and the column stays empty until
    # that key is added where output_logs is built.
    file_name = log.get('File Name', None)  # .get() avoids a KeyError when absent
    core_count = None
    if file_name:
        file_name_parts = file_name.split('_')
        if len(file_name_parts) > 4:
            core_count = file_name_parts[4]

    # Keep only lines of the form "MPI <N>x<N> matrix multiplication took ..."
    if len(tokens) >= 5 and tokens[0] == 'MPI':
        log_type = tokens[0]     # "MPI"
        matrix_size = tokens[1]  # e.g. "1024x1024"
        structured_logs.append({
            'Folder': folder,
            'Type': log_type,
            'Matrix Size': matrix_size,
            'Number of Cores': core_count,  # from the file name, None if unknown
            'Log': log_content
        })

# Create a DataFrame from the structured logs
structured_logs_df = pd.DataFrame(structured_logs)
structured_logs_df

Unnamed: 0,Folder,Type,Matrix Size,Number of Cores,Log
0,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 0.338...
1,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 3.302...
2,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 1.749...
3,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 0.555...
4,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 0.872...
5,parallel_mpi/logs,MPI,1024x1024,,MPI 1024x1024 matrix multiplication took 0.470...
6,parallel_mpi/logs,MPI,2048x2048,,MPI 2048x2048 matrix multiplication took 6.729...
7,parallel_mpi/logs,MPI,2048x2048,,MPI 2048x2048 matrix multiplication took 41.40...
8,parallel_mpi/logs,MPI,2048x2048,,MPI 2048x2048 matrix multiplication took 26.47...
9,parallel_mpi/logs,MPI,2048x2048,,MPI 2048x2048 matrix multiplication took 3.515...
