In [8]:
import sys
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

datasets = ["road_asia"] #, "orkut", "road_usa", "livejournal", "dota_league", "graph500_23", "graph500_26", "graph500_28", "graph500_30"]
RESULTS_DIR = "/Users/Puneet89/scratch/flexograph-eval/results"
DATASET_DIR = "/Users/Puneet89/scratch/flexograph-eval/datasets"

#we now need to construct a dataframe with the following columns:
#dataset, benchmark, system, preprocessing_time,exec_time

df_results = pd.DataFrame(columns=['dataset', 'benchmark', 'system', 'preprocessing_time', 'exec_time'])

## GAPBS
All the datasets are basically just dumps of the output from running the commands and we must first parse the output.

In [9]:
SYSTEM="gapbs"
benchmarks = ["pr", "bfs", "cc", "sssp"]
for dataset in datasets:
  for benchmark in benchmarks:
    #check if the file exists
    if os.path.exists(f"{RESULTS_DIR}/{SYSTEM}/{dataset}_{benchmark}.txt"):
      #open the file and read the contents
      with open(f"{RESULTS_DIR}/{SYSTEM}/{dataset}_{benchmark}.txt" , 'r') as f:
        regex = r"^(Read|Build|Trial)\sTime:\s+(\d+\.\d+)"
        input = f.read()
        # Search for all occurrences of the regex pattern
        matches = re.finditer(regex, input, re.MULTILINE)

        # Print the matches
        read_times = [] 
        build_times = []
        trial_times = []
        for match in matches:
            #we want to print the sum of times if the first element of the tuple is 'Read' or 'Build'
            if match.group(1) == 'Read':
                read_times.append(float(match.group(2)))
            elif match.group(1) == 'Build':
                build_times.append(float(match.group(2)))
            else:
                trial_times.append(float(match.group(2)))
        preprocessing_time = round(sum(read_times)/len(read_times) + sum(build_times)/len(build_times),5)
        exec_time = round(sum(trial_times)/len(trial_times),5)
        new_tuple = (f"{dataset}", f"{benchmark}", f"{SYSTEM}", preprocessing_time,exec_time)
        # Convert the tuple into a DataFrame with the same column names
        new_row = pd.DataFrame([new_tuple], columns=df_results.columns)
        # Use pd.concat to append the new row
        df_results = pd.concat([df_results, new_row], ignore_index=True)

print(df_results)



     dataset benchmark system  preprocessing_time  exec_time
0  road_asia        pr  gapbs             1.38748    0.16379
1  road_asia       bfs  gapbs             1.37387    0.01518
2  road_asia        cc  gapbs             1.46481    0.01272
3  road_asia      sssp  gapbs             1.47599    0.05406


  df_results = pd.concat([df_results, new_row], ignore_index=True)


## Gemini

In [3]:
#we first need to find the preprocessing time by reading the file datasets/{dataset}/{dataset}_gemini_convert.log

SYSTEM="gemini"

benchmarks = ["pagerank", "bfs", "cc", "sssp"]
for dataset in datasets:
  preprocessing_time = 0
  with open(f"{DATASET_DIR}/{dataset}/{dataset}_gemini_convert.log" , 'r') as f:
    regex = r"^time:\s+(\d+\.\d+)"
    input = f.read()
    # Search for all occurrences of the regex pattern
    matches = re.finditer(regex, input, re.MULTILINE)
    for match in matches:
      preprocessing_time = float(match.group(1))
      print(preprocessing_time)
  for benchmark in benchmarks:
    df = pd.read_csv(f"{RESULTS_DIR}/{SYSTEM}/{dataset}_{benchmark}.csv", sep=",",names=["dataset_name", "benchmark_name", "cores", "threads", "time(s)"],header=0)
    print(df.head())
    exec_time = round(df["time(s)"].mean(),5)
    new_tuple = (f"{dataset}", f"{benchmark}", f"{SYSTEM}", preprocessing_time,exec_time)
    new_row = pd.DataFrame([new_tuple], columns=df_results.columns)
    df_results = pd.concat([df_results, new_row], ignore_index=True)


13.848687
  dataset_name benchmark_name  cores  threads   time(s)
0    road_asia       pagerank    192        2  0.768005
1    road_asia       pagerank    192        2  0.530857
2    road_asia       pagerank    192        2  0.540689
3    road_asia       pagerank    192        2  0.533989
4    road_asia       pagerank    192        2  0.544544
  dataset_name benchmark_name  cores  threads   time(s)
0    road_asia            bfs    192        2  0.004874
1    road_asia            bfs    192        2  0.002845
2    road_asia            bfs    192        2  0.002986
3    road_asia            bfs    192        2  0.002948
4    road_asia            bfs    192        2  0.006239
  dataset_name benchmark_name  cores  threads   time(s)
0    road_asia             cc    192        2  0.346421
1    road_asia             cc    192        2  0.076548
2    road_asia             cc    192        2  0.074080
3    road_asia             cc    192        2  0.073958
4    road_asia             cc    192  

In [54]:
print(df_results.head())

     dataset benchmark  system  preprocessing_time  exec_time
0  road_asia  pagerank  gemini           13.848687    0.58191
1  road_asia       bfs  gemini           13.848687    0.00490
2  road_asia        cc  gemini           13.848687    0.12075
3  road_asia      sssp  gemini           13.848687    0.00388


## Ligra

In [5]:
#we first need to find the preprocessing time by reading the file datasets/{dataset}/{dataset}_gemini_convert.log

SYSTEM="ligra"

benchmarks = ["PageRank", "BFS", "Components", "Triangle"]
def change_benchmark_name(benchmark):
    if(benchmark == "PageRank"):
      benchmark = "pr"
    elif(benchmark == "BFS"):
      benchmark = "bfs"
    elif(benchmark == "Components"):
      benchmark = "cc"
    elif(benchmark == "Triangle"):
      benchmark = "tc"
    return benchmark

for dataset in datasets:
  for benchmark in benchmarks:
    preprocessing_time = 0
    exec_times=[]
    with open(f"{RESULTS_DIR}/{SYSTEM}/{dataset}_{benchmark}.txt" , 'r') as f:
      regex_preprocessing = r"^Time[\s+\w+]*:\s+(\d+.\d+)\s+seconds"
      regex_exectime=r"^Running time :\s+(\d+.\d+)"
      input = f.read()
      # Search for all occurrences of the regex pattern
      matches = re.finditer(regex_preprocessing, input, re.MULTILINE)
      for match in matches:
        preprocessing_time = float(match.group(1))
      matches = re.finditer(regex_exectime, input, re.MULTILINE)
      for match in matches:
        exec_times.append(float(match.group(1)))
      exec_time = round(sum(exec_times)/len(exec_times),5)
      benchmark = change_benchmark_name(benchmark)
      new_tuple = (f"{dataset}", f"{benchmark}", f"{SYSTEM}", preprocessing_time,exec_time)
      new_row = pd.DataFrame([new_tuple], columns=df_results.columns)
      df_results = pd.concat([df_results, new_row], ignore_index=True)
  
print(df_results.head())

     dataset benchmark  system  preprocessing_time  exec_time
0  road_asia        pr   gapbs            1.387480    0.16379
1  road_asia       bfs   gapbs            1.373870    0.01518
2  road_asia        cc   gapbs            1.464810    0.01272
3  road_asia      sssp   gapbs            1.475990    0.05406
4  road_asia  pagerank  gemini           13.848687    0.58852


## Galois 
I think we should only plot the results for the algorithms they say work best in the README


In [None]:
SYSTEM="galois"
