In [18]:
import pandas as pd


In [19]:
def read_markdown_table(md_file_path):
    # Read the markdown file
    with open(md_file_path, 'r') as file:
        lines = file.readlines()
    
    # Initialize an empty list to hold rows of the table
    table_data = []
    
    # Process each line
    for line in lines:
        # Check if the line is part of the table
        if '|' in line:
            # Split the line by '|' to get individual cell values
            row = line.strip().split('|')
            # Filter out empty strings and strip whitespace from each cell
            row = [cell.strip() for cell in row if cell.strip()]
            # Add the row to the table data list
            table_data.append(row)
    
    # The first row is the header
    headers = table_data[0]
    # The rest are the data rows
    data_rows = table_data[2:]
    
    # Create a pandas DataFrame
    df = pd.DataFrame(data_rows, columns=headers)

    # replace "-" with NaN
    df = df.replace('-', pd.NA)
    
    # convert all columns containing "Time" to float
    for col in df.columns:
        if 'Time' in col:
            df[col] = df[col].str.extract(r'(\d+\.\d+)').astype(float)
            
    return df

# Example usage: parse the table in README.md (path is relative to the
# working directory) and bind the resulting DataFrame to `df`
md_file_path = 'README.md'
df = read_markdown_table(md_file_path)


In [20]:
# Reshape to long format: the identifier columns below are kept as-is, and
# every other column is unpivoted into one (Measurement, Time) pair per row.
id_columns = ['Cluster', 'Dataset', "Benchmark", "Device Name", "# Devices", "Num Workers"]
df = df.melt(
    id_vars=id_columns,
    var_name='Measurement',
    value_name='Time',
)
print(df)

    Cluster   Dataset         Benchmark Device Name # Devices Num Workers  \
0        TN  ImageNet         Inference        V100         1           4   
1        TN  ImageNet             Train        V100         1           4   
2        TN  ImageNet  Smooth Inference        V100         1           4   
3        TN     Dummy         Inference        V100         1           4   
4        TN     Dummy             Train        V100         1           4   
..      ...       ...               ...         ...       ...         ...   
115   JADE2  ImageNet             Train        V100         1           4   
116   JADE2  ImageNet  Smooth Inference        V100         1           4   
117   JADE2     Dummy         Inference        V100         1           4   
118   JADE2     Dummy             Train        V100         1           4   
119   JADE2     Dummy  Smooth Inference        V100         1           4   

           Measurement    Time  
0    Data Loading Time  0.1022  
1    Data

In [21]:
# Average "Time" per cluster / benchmark / device / measurement, then pivot to
# wide format: one row per (Benchmark, Device Name), one column per
# (Measurement, Cluster) pair.
mean_times = (
    df.groupby(['Cluster', "Benchmark", "Device Name", "Measurement"])["Time"]
    .mean()
    .reset_index()
    .pivot_table(
        index=["Benchmark", "Device Name"],
        columns=["Measurement", "Cluster"],
        values="Time",
    )
    .reset_index()
)
mean_times

Measurement,Benchmark,Device Name,Backward Time,Backward Time,Data Loading Time,Data Loading Time,Data Processing Time,Data Processing Time,Forward Time,Forward Time
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,JADE2,TN,JADE2,TN,JADE2,TN,JADE2,TN
0,Inference,A40,,,,0.05305,,,,0.112225
1,Inference,V100,,,2.6632,0.06695,,,0.60375,0.1122
2,Smooth Inference,A40,,,,0.185475,,0.0016,,0.005975
3,Smooth Inference,V100,,,0.1657,0.10115,0.0255,0.0009,0.1558,0.00805
4,Train,A40,,0.20355,,0.0339,,,,0.1273
5,Train,V100,0.517025,0.23775,2.245625,0.0197,,,0.706675,0.12295


In [22]:
# JADE2Exp is the time we would expect on JADE2, derived from the time measured
# on TN under the assumption that a JADE2 V100 delivers only about 2/3 of a
# normal V100.
jade_v100_factor = 0.67

# Expected time = TN time / factor (dividing, so the expected time is larger
# than the TN time), for each of the compute-side measurements.
for measure in ["Backward Time", "Data Processing Time", "Forward Time"]:
    mean_times[measure, "JADE2Exp"] = mean_times[measure, "TN"] / jade_v100_factor

# Reorder columns so that all clusters of one measurement sit next to each
# other: the two leading identifier columns first, then one measurement group
# after another in the order listed below.
first_level_names = mean_times.columns.get_level_values(0)
new_column_order = mean_times.columns[:2]
for group_name in ["Data Loading Time", "Data Processing Time", "Forward Time", "Backward Time"]:
    new_column_order = new_column_order.append(mean_times.columns[first_level_names == group_name])
mean_times = mean_times[new_column_order]
mean_times

Measurement,Benchmark,Device Name,Data Loading Time,Data Loading Time,Data Processing Time,Data Processing Time,Data Processing Time,Forward Time,Forward Time,Forward Time,Backward Time,Backward Time,Backward Time
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,JADE2,TN,JADE2,TN,JADE2Exp,JADE2,TN,JADE2Exp,JADE2,TN,JADE2Exp
0,Inference,A40,,0.05305,,,,,0.112225,0.1675,,,
1,Inference,V100,2.6632,0.06695,,,,0.60375,0.1122,0.167463,,,
2,Smooth Inference,A40,,0.185475,,0.0016,0.002388,,0.005975,0.008918,,,
3,Smooth Inference,V100,0.1657,0.10115,0.0255,0.0009,0.001343,0.1558,0.00805,0.012015,,,
4,Train,A40,,0.0339,,,,,0.1273,0.19,,0.20355,0.303806
5,Train,V100,2.245625,0.0197,,,,0.706675,0.12295,0.183507,0.517025,0.23775,0.354851


In [23]:
# Restrict the comparison to the V100 rows. The explicit .copy() makes the
# result an independent frame, so adding columns below no longer raises
# SettingWithCopyWarning.
mean_times = mean_times.loc[mean_times["Device Name"] == "V100"].copy()
# for each benchmark compare JADE2 vs JADE2Exp (ratio of measured to expected time)
for measure in ["Backward Time", "Data Processing Time", "Forward Time"]:
    mean_times[measure, "JADE2vsJADE2Exp"] = mean_times[measure, "JADE2"] / mean_times[measure, "JADE2Exp"]

# keep only the two leading identifier columns and the three ratio columns,
# which were appended last by the loop above
mean_times = mean_times.iloc[:, [0, 1, -3, -2, -1]]

mean_times

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mean_times[measure, "JADE2vsJADE2Exp"] = mean_times[measure, "JADE2"] / mean_times[measure, "JADE2Exp"]


Measurement,Benchmark,Device Name,Backward Time,Data Processing Time,Forward Time
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,JADE2vsJADE2Exp,JADE2vsJADE2Exp,JADE2vsJADE2Exp
1,Inference,V100,,,3.605281
3,Smooth Inference,V100,,18.983333,12.967205
5,Train,V100,1.457021,,3.850933
