# Raft Results Analysis 

In [8]:
%matplotlib notebook 

import os
import re
import pytz
import json 
import glob 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

from datetime import datetime

# Select seaborn's "notebook" scaling context and "whitegrid" style for the
# figures drawn in this notebook.
sns.set_context('notebook')
sns.set_style('whitegrid')

## Data Loading 

In [20]:
# Default glob pattern handed to load_json_data. NOTE: glob only lets "**"
# span several directory levels when called with recursive=True; otherwise it
# behaves like a single "*".
DATA = "../data/**/*.json"
# Captures the numeric run id from a "metrics-<runid>.json" file name. The dot
# is escaped so that only a literal ".json" extension matches.
RUNID = re.compile(r'metrics-(\d+)\.json')
# strptime layout used by parse_datetime.
DTFMT = "%Y-%m-%dT%H:%M:%S.%f"


def parse_hostname(host):
    """
    Splits a dash-separated hostname into its numeric PID and its region.

    The last dash-separated token is converted to an int and returned as the
    PID; the tokens between the first and the last are joined with spaces to
    form the region. The result is the tuple (pid, region).
    """
    tokens = host.split("-")
    return int(tokens[-1]), " ".join(tokens[1:-1])


def parse_datetime(dt):
    """
    Parses a timestamp string into a timezone-aware UTC datetime.

    The text must start with year-month-dayThour:minute:second, optionally
    followed by "." and any number of fractional-second digits. datetime only
    stores microseconds, so digits past the sixth are dropped. Anything after
    the fractional digits (for example a trailing "Z") is ignored and the
    value is treated as UTC.

    Raises ValueError if the text does not start with such a timestamp.
    """
    # Match the fields explicitly rather than slicing a fixed number of
    # trailing characters, so inputs with fewer (or no) fractional digits
    # still parse to the right instant.
    # NOTE(review): the durations in the data look Go-formatted, so these are
    # presumably RFC 3339 strings with up to nine fractional digits - confirm.
    match = re.match(r'(\d+-\d+-\d+T\d+:\d+:\d+)(?:\.(\d+))?', dt)
    if match is None:
        raise ValueError("unrecognized timestamp: {!r}".format(dt))

    seconds, fraction = match.groups()
    # %f accepts one to six digits; keep the leading six, or "0" when absent.
    micros = (fraction or "0")[:6]
    return pytz.utc.localize(
        datetime.strptime("{}.{}".format(seconds, micros), DTFMT)
    )

def get_deploy(regions):
    """
    Names the deployment that a collection of regions belongs to.

    The checks are ordered: if 'tokyo' is present the result is 'global',
    otherwise if 'london' is present it is 'atlantic', and anything else
    yields 'virginia'.
    """
    for marker, deploy in (('tokyo', 'global'), ('london', 'atlantic')):
        if marker in regions:
            return deploy
    return 'virginia'
    

def load_json_data(data=DATA, metric="server"):
    """
    Yields one result row per usable JSON line in the files matching ``data``.

    Parameters
    ----------
    data : str
        Glob pattern of the files to read; each file holds one JSON object
        per line. The pattern is expanded without ``recursive=True``, so a
        ``**`` component matches a single directory level.
    metric : str or None
        Only rows whose "metric" field equals this value are kept; pass None
        to keep rows of every metric.

    Yields
    ------
    dict
        The decoded row with these fields added or rewritten: "runid" (int
        taken from the file name via RUNID), "quorum" (replaced by the number
        of entries in the original quorum list), "deploy" (from get_deploy),
        "started" / "finished" (from parse_datetime) and "pid" / "location"
        (from the row's "replica" hostname).

    Rows whose throughput is zero or negative are dropped, and blank lines
    are skipped.
    """
    for path in glob.glob(data):
        runid = int(RUNID.search(path).group(1))

        with open(path, 'r') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # letting json.loads raise on an empty document.
                if not line.strip():
                    continue

                row = json.loads(line)
                row["runid"] = runid

                # Basic filters
                if metric is not None and row["metric"] != metric:
                    continue

                if row['throughput'] <= 0:
                    continue

                # Distinct regions taking part in this row's quorum; must be
                # collected before "quorum" is overwritten with its size.
                regions = {
                    parse_hostname(replica)[1] for replica in row['quorum']
                }

                row['quorum'] = len(row['quorum'])
                row['deploy'] = get_deploy(regions)
                row["started"] = parse_datetime(row["started"])
                row["finished"] = parse_datetime(row["finished"])
                row["pid"], row["location"] = parse_hostname(row["replica"])

                yield row
                

# Collect every row yielded by load_json_data() (default pattern and metric)
# into a single DataFrame.
data = pd.DataFrame(load_json_data())

In [22]:
# Preview the first rows after ordering by run id.
data.sort_values(by="runid").head()

Unnamed: 0,clients,commits,deploy,drops,duration,finished,location,metric,pid,quorum,replica,runid,started,throughput,version
19,1,245,atlantic,0,38.426135418s,2018-08-06 00:58:48.395710+00:00,virginia,server,80,3,alia-virginia-80,1,2018-08-06 00:58:09.969582+00:00,6.375869,0.3.3
100,2,678,atlantic,0,38.966912238s,2018-08-06 01:00:00.394025+00:00,london,server,17,3,alia-london-17,2,2018-08-06 00:59:21.427113+00:00,17.399377,0.3.3
8,3,714,atlantic,0,39.712523061s,2018-08-06 01:01:12.307293+00:00,oregon,server,100,3,alia-oregon-100,3,2018-08-06 01:00:32.594770+00:00,17.979215,0.3.3
36,4,820,atlantic,0,38.771185776s,2018-08-06 01:02:29.903340+00:00,virginia,server,80,3,alia-virginia-80,4,2018-08-06 01:01:51.132155+00:00,21.149727,0.3.3
47,5,945,atlantic,0,39.105724498s,2018-08-06 01:03:43.226109+00:00,virginia,server,80,3,alia-virginia-80,5,2018-08-06 01:03:04.120384+00:00,24.16526,0.3.3


## Throughput by Clients 

In [29]:
def plot_throughput(quorum=3, data=data, path=None):
    """
    Draws a grouped bar chart of throughput against concurrent client count.

    Only the rows of ``data`` whose 'quorum' column equals ``quorum`` are
    plotted, with one bar group per 'clients' value and one hue per 'deploy'
    value. When ``path`` is given, the current figure is also saved to it.
    Returns the axes object handed back by seaborn's barplot.
    """
    _, ax = plt.subplots(figsize=(9,6))
    subset = data[data['quorum'] == quorum]
    axes = sns.barplot(
        data=subset, x="clients", y="throughput", hue="deploy", ax=ax
    )

    axes.set_title("Quorum Size {}".format(quorum))
    axes.set_xlabel("concurrent clients")
    axes.set_ylabel("throughput (commits/sec)")

    if path is None:
        return axes

    plt.savefig(path)
    return axes

# Quorum size 3; the trailing semicolon keeps the returned Axes repr out of
# the cell output.
plot_throughput(3, path="throughput-3.pdf");

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10b0e75c0>

In [30]:
# Quorum size 5; the trailing semicolon keeps the returned Axes repr out of
# the cell output.
plot_throughput(5, path="throughput-5.pdf");

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10b776dd8>