In [1]:
import pandas as pd


In [6]:
def read_markdown_table(md_file_path):
    """Parse the pipe-delimited table of a markdown file into a DataFrame.

    Every line containing ``|`` is treated as a table row. The first such
    line supplies the column names, the second one is assumed to be the
    ``|---|---|`` delimiter row and is discarded, and all remaining lines
    become data rows.

    Parameters
    ----------
    md_file_path : str or path-like
        Path of the markdown file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per table data row. Cells that are empty or equal to ``-``
        are ``pd.NA``. Columns whose name contains ``"Time"`` are converted
        to float using the first number found in each cell; cells without a
        number become NaN.

    Raises
    ------
    ValueError
        If the file contains no line with a ``|`` character.
    """
    table_data = []

    # Explicit encoding so the result does not depend on the platform default.
    with open(md_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Only lines containing a pipe are considered part of the table
            if '|' not in line:
                continue

            text = line.strip()
            # Remove only the optional outer pipes. Interior empty cells must
            # be kept, otherwise the remaining cells shift into wrong columns.
            if text.startswith('|'):
                text = text[1:]
            if text.endswith('|'):
                text = text[:-1]

            table_data.append([cell.strip() for cell in text.split('|')])

    if not table_data:
        raise ValueError(f"No markdown table found in {md_file_path!r}")

    # The first row is the header, the second the delimiter row
    headers = table_data[0]
    data_rows = table_data[2:]

    df = pd.DataFrame(data_rows, columns=headers)

    # "-" and empty cells both mean "no value"
    df = df.replace(['-', ''], pd.NA)

    # Convert all columns containing "Time" to float. The pattern accepts
    # decimals ("0.25"), plain integers ("3") and leading-dot values (".5").
    for col in df.columns:
        if 'Time' in col:
            df[col] = df[col].str.extract(r'(\d+(?:\.\d+)?|\.\d+)').astype(float)

    return df

# Example usage: parse the table found in README.md. The path is relative,
# so it is resolved against the current working directory.
md_file_path = 'README.md'
df = read_markdown_table(md_file_path)


In [7]:
# Reshape wide -> long: the identifier columns stay fixed, every remaining
# column becomes one (Measurement, Time) row.
# NOTE: this overwrites `df`, so re-running this cell on its own will not
# reproduce the result; re-run the parsing cell first.
id_columns = ['Cluster', 'Dataset', "Benchmark", "Device Name", "# Devices", "Num Workers"]
df = df.melt(
    id_vars=id_columns,
    var_name='Measurement',
    value_name='Time',
)
print(df)

   Cluster   Dataset         Benchmark Device Name # Devices Num Workers  \
0       TN  ImageNet         Inference         A40         1           4   
1       TN  ImageNet             Train         A40         1           4   
2       TN  ImageNet  Smooth Inference         A40         1           4   
3       TN     Dummy         Inference         A40         1           4   
4       TN     Dummy             Train         A40         1           4   
..     ...       ...               ...         ...       ...         ...   
91   JADE2  ImageNet             Train        V100         1           4   
92   JADE2  ImageNet  Smooth Inference        V100         1           4   
93   JADE2     Dummy         Inference        V100         1           4   
94   JADE2     Dummy             Train        V100         1           4   
95   JADE2     Dummy  Smooth Inference        V100         1           4   

          Measurement    Time  
0   Data Loading Time  0.0403  
1   Data Loading Time  

In [8]:
# Average the timings per (Cluster, Benchmark, Device Name, Measurement),
# then spread the result wide: one row per Benchmark / Device Name, one
# column per (Measurement, Cluster) pair.
group_keys = ['Cluster', "Benchmark", "Device Name", "Measurement"]
row_keys = ["Benchmark", "Device Name"]
column_keys = ["Measurement", "Cluster"]

mean_times = (
    df.groupby(group_keys)["Time"]
    .mean()
    .reset_index()
    .pivot_table(index=row_keys, columns=column_keys, values="Time")
    .reset_index()
)
mean_times


Measurement,Benchmark,Device Name,Backward Time,Backward Time,Data Loading Time,Data Loading Time,Data Processing Time,Data Processing Time,Forward Time,Forward Time
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,JADE2,TN,JADE2,TN,JADE2,TN,JADE2,TN
0,Inference,A40,,,,0.05305,,,,0.112225
1,Inference,V100,,,2.6632,,,,0.60375,
2,Smooth Inference,A40,,,,0.185475,,0.0016,,0.005975
3,Smooth Inference,V100,,,0.1657,,0.0255,,0.1558,
4,Train,A40,,0.20355,,0.0339,,,,0.1273
5,Train,V100,0.517025,,2.245625,,,,0.706675,
