In [1]:
import pandas as pd
import numpy as np
from path import Path  # pip install --user path.py
import re
from IPython.display import display
from pprint import pprint
import netCDF4
from IPython.core.debugger import Pdb
from collections import namedtuple
from random import randint, choice

## Work with topology

In [2]:
def build_extended_node_pairs(node_pairs):
    """Return the node-pair table with every pair present in both directions.

    `node_pairs` must have "node1" and "node2" columns. A mirrored copy
    (node1 and node2 swapped) is appended, rows that repeat an existing
    (node1, node2) combination are dropped, and the result is indexed by
    (node1, node2). The remaining columns are carried over unchanged.
    """
    swap_endpoints = {"node1": "node2", "node2": "node1"}
    mirrored = node_pairs.rename(columns=swap_endpoints)

    both_directions = pd.concat(
        [node_pairs, mirrored],
        ignore_index=True,
        verify_integrity=True,
    )

    # Keep the first occurrence of each (node1, node2) so the index is unique.
    unique_pairs = both_directions.drop_duplicates(subset=["node1", "node2"])
    return unique_pairs.set_index(["node1", "node2"], verify_integrity=True)

In [3]:
# Load the node-pair table and discard the "shortest_path" column.
# NOTE(review): unpickling can execute arbitrary code; only load a file you trust.
node_pairs = (
    pd.read_pickle("paths_with_classes.pkl")
    .drop("shortest_path", axis=1)
)

In [4]:
# Both directions of every pair, indexed by (node1, node2).
extended_node_pairs = build_extended_node_pairs(node_pairs=node_pairs)

## This program only works with benchmark results that have the following properties

In [5]:
# Message-length grid the benchmark results are expected to use.
MSG_LEN_START = 0
MSG_LEN_STEP = 100
MSG_LEN_END = 10000  # exclusive upper bound: the last message length is 9900

# Every message length on the grid, and the index of the last one.
LENGTHS = range(MSG_LEN_START, MSG_LEN_END, MSG_LEN_STEP)
STEPS_COUNT = (MSG_LEN_END - MSG_LEN_START) // MSG_LEN_STEP - 1

# The step must divide the [start, end) interval exactly.
assert MSG_LEN_END == MSG_LEN_START + MSG_LEN_STEP * (STEPS_COUNT + 1)

## Code for loading benchmark results

In [6]:
# Container for one benchmark run: `hostnames` is filled with the host names
# and `medians` with a mapping of message length to a table (see import_data).
TestResults = namedtuple("TestResults", "hostnames medians")

In [7]:
def matrix_to_table(matrix):
    """Flatten a square ping matrix into a long table.

    Each cell of `matrix` becomes one row with columns "node1" (the cell's
    row label), "node2" (the cell's column label) and "ping" (the cell value).
    """
    # stack() yields a Series indexed by (row label, column label).
    stacked_cells = matrix.stack()
    long_table = stacked_cells.reset_index()
    long_table.columns = ["node1", "node2", "ping"]
    return long_table

In [8]:
def read_benchmark_hostnames(path_to_file):
    """Lazily yield the short host name found at the start of each line.

    `path_to_file` must provide a `lines()` method returning the file's lines.
    A short host name is "n" followed by five digits and must be followed by
    a dot. The lines are read immediately; the names are extracted as the
    returned generator is consumed. A line that does not match makes the
    generator raise AttributeError.
    """
    hostname_pattern = re.compile(r"^(n\d{5})\.")
    raw_lines = path_to_file.lines()
    return (hostname_pattern.match(raw_line).group(1) for raw_line in raw_lines)

In [9]:
def import_data(directory):
    """Load one benchmark run from `directory` into a TestResults tuple.

    Host names come from "network_hosts.txt"; the measurements come from
    "network_median.nc". The dataset's process count, test type and
    message-length grid are checked against the host list and the MSG_LEN_*
    constants; any mismatch raises AssertionError.

    Returns TestResults where `hostnames` is a tuple of host names and
    `medians` maps each message length in LENGTHS to a table with columns
    node1, node2, ping.
    """
    hosts_file = directory.joinpath("network_hosts.txt")
    hostnames = tuple(read_benchmark_hostnames(hosts_file))

    with netCDF4.Dataset(directory.joinpath("network_median.nc"), "r") as dataset:
        step_len = dataset["step_length"][0]
        start_len = dataset["begin_mes_length"][0]
        end_len = dataset["end_mes_length"][0]

        # Refuse data that was not produced on the expected grid.
        assert len(hostnames) == dataset["proc_num"][0]
        assert dataset["test_type"][0] == 1  # one-to-one
        assert start_len == MSG_LEN_START
        assert end_len == MSG_LEN_END
        assert step_len == MSG_LEN_STEP

        # One hostnames x hostnames matrix of ping values per message length,
        # converted to a long table right away (the dataset must still be open).
        # NOTE(review): the original comment said columns are node1 and rows are
        # node2, but matrix_to_table names the row label node1 and the column
        # label node2 - confirm which orientation the data actually uses.
        tables = {}
        for length, index in zip(LENGTHS, range(STEPS_COUNT + 1)):
            matrix = pd.DataFrame(
                dataset["data"][index], index=hostnames, columns=hostnames
            )
            tables[length] = matrix_to_table(matrix)

    return TestResults(hostnames=hostnames, medians=tables)

In [10]:
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
test_results = import_data(
    Path("/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2017-02-12__118_nodes/")
)

## Work with benchmark results

In [41]:
class Predictor():
    """Predicts ping for a packet with specific message_size between 2 nodes,
    measured in seconds.

    The prediction for (msg_len, node1, node2) is the mean, over all node
    pairs sharing the pair's class_, of the median pings measured at that
    message length.
    """

    def __init__(self, extended_node_pairs, test_results):
        """Build the (msg_len, class_) -> mean ping lookup table.

        extended_node_pairs: DataFrame indexed by (node1, node2) with a
            "class_" column.
        test_results: object whose `medians` attribute maps each message
            length to a table with columns node1, node2, ping.
        """
        # Use the lengths actually present in the results instead of a
        # module-level constant, so the class works on its own.
        means_by_length = {}
        for msg_len, medians in test_results.medians.items():
            # Attach the class of every measured pair; many rows share a class_.
            with_classes = medians.join(
                extended_node_pairs, on=["node1", "node2"], how="left"
            )
            # Aggregate only "ping": averaging every column fails on the string
            # columns node1/node2 in pandas >= 2.0 and could otherwise leak
            # unrelated numeric columns into the lookup table.
            means_by_length[msg_len] = with_classes.groupby("class_")[["ping"]].mean()

        # reverse lookup table (by message length and class)
        self._data = pd.concat(
            means_by_length,
            names=["msg_len", "class_"]
        ).rename(columns={"ping": "mean_of_medians_ping"})

        self._extended_node_pairs = extended_node_pairs

    def _get_class(self, node1, node2):
        """Return the class_ of the (node1, node2) pair; KeyError if unknown."""
        # Explicit (row key, column) form avoids the ambiguous `.loc[a, b]`
        # shorthand on a MultiIndex.
        return self._extended_node_pairs.loc[(node1, node2), "class_"]

    def predict_many(self, df):
        """df must have columns: msg_len, node1, node2.
        Returns table with rows in the same order,
        all other columns dropped and column of ping predictions appended.

        The result has columns node1, node2, predicted_ping. Rows whose pair
        or (msg_len, class_) combination is unknown get NaN.
        """
        with_classes = df.join(
            self._extended_node_pairs, on=["node1", "node2"], how="left"
        )
        with_predictions = with_classes.join(
            self._data, on=["msg_len", "class_"], how="left"
        )
        return with_predictions \
            .rename(columns={"mean_of_medians_ping": "predicted_ping"}) \
            [["node1", "node2", "predicted_ping"]]

    def predict(self, msg_len, node1, node2):
        """Return the predicted ping for one (msg_len, node1, node2).

        Raises KeyError when the pair or the message length is unknown.
        The original author measured this at about 1ms per call.
        """
        class_ = self._get_class(node1, node2)
        return self._data.loc[(msg_len, class_), "mean_of_medians_ping"]

In [42]:
# Build the lookup table once; predictions below reuse it.
predictor = Predictor(
    extended_node_pairs=extended_node_pairs,
    test_results=test_results,
)

In [68]:
# Regression check on the loaded benchmark data. The expected value is a mean
# of floats, so compare with a relative tolerance rather than `==`: its last
# bits can change with the summation order of the installed pandas/numpy.
# atol is set to 0 because numpy's default (1e-8) is large next to ~5e-6.
assert np.isclose(
    predictor.predict(4000, "n48003", "n48009"),
    5.0074094301694399e-06,
    rtol=1e-9,
    atol=0.0,
)

In [None]:
%%timeit -r1
# Time one prediction for a random message length and a random known pair.
# TestResults has no `msg_lengths` field, so draw the length from LENGTHS.
predictor.predict(
    choice(LENGTHS),
    *node_pairs.iloc[randint(0, len(node_pairs) - 1)][["node1", "node2"]]
)