In [36]:
import pandas as pd
import numpy as np
from path import Path  # pip install --user path.py
import re
from IPython.display import display
from pprint import pprint
import netCDF4
from IPython.core.debugger import Pdb
from collections import namedtuple
from random import randint, choice

In [2]:
# load seaborn and other stuff for visualization
import seaborn  # pip install --user seaborn
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
def debug_here():
    """Start an interactive IPython ``Pdb`` session from inside this helper."""
    debugger = Pdb()
    debugger.set_trace()

In [4]:
# Load the two input tables from pickles in the current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that were produced by a trusted source.
classes = pd.read_pickle("classes.pkl")
# The "shortest_path" column is dropped right after loading; the rest of the
# notebook reads the "node1", "node2" and "class_" columns of this table.
node_pairs = pd.read_pickle("paths_with_classes.pkl").drop("shortest_path", axis=1)

In [5]:
# Sub-directories of the test-results folder; each one is later passed to
# import_data().
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another machine.
# NOTE(review): the notebook later picks element [0] of this list; confirm that
# .dirs() returns entries in a stable order, otherwise that choice may not be
# reproducible.
TEST_RESULT_DIRECTORIES = Path("/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results") \
    .dirs()

In [6]:
def read_benchmark_hostnames(path_to_file):
    """Yield the short host name found at the start of each line of a hosts file.

    Every non-blank line must start with ``n`` followed by five digits and a
    dot (for example ``n48001.``); the part before the dot is yielded.

    Parameters
    ----------
    path_to_file : object with a ``lines()`` method
        The file is read eagerly via ``path_to_file.lines()`` when this
        function is called; parsing happens lazily as the result is iterated.

    Returns
    -------
    generator of str
        One host name per non-blank line, in file order.

    Raises
    ------
    ValueError
        While iterating, if a non-blank line does not match the expected
        pattern. (Previously this surfaced as an opaque ``AttributeError``
        on ``None``.)
    """
    def parse(lines):
        for line_number, line in enumerate(lines, start=1):
            # Blank lines (e.g. a trailing empty line) carry no host name.
            if not line.strip():
                continue
            match = re.match(r"^(n\d{5})\.", line)
            if match is None:
                raise ValueError(
                    "line {0} of the hosts file does not start with a host name "
                    "of the form 'nDDDDD.': {1!r}".format(line_number, line)
                )
            yield match.group(1)

    return parse(path_to_file.lines())

In [7]:
# Result bundle returned by import_data():
#   hostnames   -- tuple of host names parsed from network_hosts.txt
#   medians     -- pd.Panel whose top-level key is the message length
#   msg_lengths -- list of the message lengths
TestResults = namedtuple("TestResults", ["hostnames", "medians", "msg_lengths"])

In [8]:
def import_data(directory):
    """Load one benchmark run stored in ``directory``.

    Host names come from ``network_hosts.txt`` and the measurements from
    ``network_median.nc``. Several header values of the netCDF file are
    checked with assertions before the data is read.

    Returns a ``TestResults`` with the host-name tuple, a ``pd.Panel`` keyed by
    message length (each item is a hosts x hosts DataFrame) and the list of
    message lengths.

    NOTE(review): ``pd.Panel`` was deprecated and later removed from pandas;
    this function needs a pandas version that still provides it.
    """
    hosts_file = directory.joinpath("network_hosts.txt")
    hostnames = tuple(read_benchmark_hostnames(hosts_file))

    with netCDF4.Dataset(directory.joinpath("network_median.nc"), "r") as nc_file:
        stride = nc_file["step_length"][0]
        first_len = nc_file["begin_mes_length"][0]
        stop_len = nc_file["end_mes_length"][0]

        # Sanity checks on the file header.
        assert len(hostnames) == nc_file["proc_num"][0]
        assert nc_file["test_type"][0] == 1
        assert first_len == 0
        assert stop_len == 10000  # last message length should be 9900
        assert stride == 100
        # The length range must be an exact multiple of the step.
        last_step = (stop_len - first_len) // stride - 1
        assert first_len + (last_step + 1) * stride == stop_len

        lengths = range(first_len, stop_len, stride)

        # One hosts x hosts frame per message length, taken from the i-th
        # slice of the "data" variable.
        frames = {}
        for position, length in enumerate(lengths):
            frames[length] = pd.DataFrame(
                nc_file["data"][position], index=hostnames, columns=hostnames
            )
        medians = pd.Panel(frames)  # top-level key is the message length
        return TestResults(
            hostnames=hostnames, medians=medians, msg_lengths=list(lengths)
        )

In [9]:
# Load the first results directory; `test_results` is what the Predictor
# further down is built from.
test_results = import_data(TEST_RESULT_DIRECTORIES[0])

## Make sure that all classes were covered by our test

In [10]:
def check_all_classes_covered(hostnames):
    """Assert that the tested hosts exercise every class of node pairs.

    Selects the rows of the module-level ``node_pairs`` table whose two
    endpoints are both in ``hostnames`` and asserts that the number of distinct
    ``class_`` values among them equals ``len(classes)``.
    """
    first_is_tested = node_pairs["node1"].isin(hostnames)
    second_is_tested = node_pairs["node2"].isin(hostnames)
    covered_classes = node_pairs[first_is_tested & second_is_tested]["class_"].unique()
    assert len(covered_classes) == len(classes)

In [11]:
# Raises AssertionError if the loaded run does not cover every class.
check_all_classes_covered(test_results.hostnames)

## More helper functions

In [12]:
class ClassGetter():
    """Callable that looks up the ``class_`` value of a pair of nodes."""

    def __init__(self, node_pairs):
        # Index the table by (node1, node2) so a pair can be found by label.
        self._reverse_lookup_table = node_pairs.set_index(["node1", "node2"])

    def __call__(self, node1, node2):
        """Return the class of the pair; the argument order does not matter."""
        # The lookup always uses the smaller label first.
        # presumably the table stores each pair with node1 < node2 -- TODO confirm
        low, high = sorted((node1, node2))
        rows_for_low = self._reverse_lookup_table.loc[low]
        pair_row = rows_for_low.loc[high]
        return pair_row.loc["class_"]

In [13]:
# Module-level lookup of a pair's class; used by Predictor.predict.
get_class = ClassGetter(node_pairs)

In [14]:
def uniques_in_matrix(matrix):
    """Return the set of distinct values stored in a DataFrame.

    The values are read in one pass from the frame's underlying array rather
    than by looking up every cell through its row and column label. Besides
    being much faster, this also works when row or column labels are
    duplicated, where a per-label lookup returns a non-hashable pandas object
    instead of a scalar.

    Parameters
    ----------
    matrix : pandas.DataFrame

    Returns
    -------
    frozenset
        The distinct cell values; empty for an empty frame.
    """
    return frozenset(matrix.values.ravel().tolist())

## Test uniques_in_matrix

In [15]:
def test_uniques_in_matrix():
    """Check uniques_in_matrix on a 3x3 frame with distinct and with shared labels."""
    values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ["i1", "i2", "i3"]

    # Row and column labels differ.
    distinct_labels = pd.DataFrame(values, index=row_labels, columns=["c1", "c2", "c3"])
    from_distinct = uniques_in_matrix(distinct_labels)
    assert frozenset({1, 2, 3, 4, 5, 6, 7, 8, 9}) == from_distinct

    # Row and column labels are the same; the result must not change.
    shared_labels = pd.DataFrame(values, index=row_labels, columns=["i1", "i2", "i3"])
    from_shared = uniques_in_matrix(shared_labels)
    assert from_shared == from_distinct

In [16]:
# Run the self-check; it raises AssertionError on failure and prints nothing on success.
test_uniques_in_matrix()

## Count unique values in medians matrices

In [17]:
def count_unique_medians(medians):
    """Print the smallest and the largest number of distinct values per matrix.

    ``medians`` is walked positionally (``medians.iloc[i]``); for the matrix
    with the fewest and the one with the most distinct values, the count is
    printed together with the corresponding entry of ``medians.keys()``
    (the message length). On ties the first matrix wins.
    """
    counts_per_matrix = []
    for position in range(len(medians)):
        counts_per_matrix.append(len(uniques_in_matrix(medians.iloc[position])))

    fewest = min(counts_per_matrix)
    most = max(counts_per_matrix)
    # list.index returns the first occurrence, so ties resolve to the earliest matrix.
    position_of_fewest = counts_per_matrix.index(fewest)
    position_of_most = counts_per_matrix.index(most)

    message_lengths = medians.keys()
    print(
        "Minimum number of unique values in matrix is {0}. Message length = {1}."
            .format(fewest, message_lengths[position_of_fewest])
    )
    print(
        "Maximum number of unique values in matrix is {0}. Message length = {1}."
            .format(most, message_lengths[position_of_most])
    )

In [18]:
def print_all_unique_counts():
    """Report min/max distinct-value counts for every results directory.

    For each entry of ``TEST_RESULT_DIRECTORIES`` the run is loaded, the
    directory's base name is printed, and ``count_unique_medians`` prints the
    statistics for that run.
    """
    for results_dir in TEST_RESULT_DIRECTORIES:
        run = import_data(results_dir)
        print(results_dir.basename())
        count_unique_medians(run.medians)

# print_all_unique_counts()  # this takes a lot of time

## Prediction Algorithm

In [19]:
def matrix_to_table(matrix):
    """Convert a DataFrame into a long table with one row per cell.

    The frame is stacked so that every (row label, column label, value)
    triple becomes a row; the three resulting columns are named
    ``node1``, ``node2`` and ``ping``.
    """
    stacked = matrix.stack()
    long_table = stacked.reset_index()
    long_table.columns = ["node1", "node2", "ping"]
    return long_table

In [91]:
class Predictor():
    """Predicts the ping (in seconds) of a packet of a given message size
    sent between two nodes."""

    def __init__(self, test_results):
        """Build the (message length, class) -> mean ping lookup table.

        Uses the module-level ``node_pairs`` table to attach a class to every
        measured pair of ``test_results.medians``.
        """
        # Class lookup that contains every pair in both directions, indexed
        # by (node1, node2), so it can be joined onto the ping tables.
        swapped_pairs = node_pairs.rename(columns={"node1": "node2", "node2": "node1"})
        both_directions = pd.concat(
            [node_pairs, swapped_pairs],
            ignore_index=True, verify_integrity=True
        )
        class_by_pair = both_directions \
            .drop_duplicates(subset=["node1", "node2"]) \
            .set_index(["node1", "node2"], verify_integrity=True)

        # For every message length: turn the matrix into a long table, attach
        # the class of each pair, and average the pings within each class.
        mean_by_class = {}
        length_and_matrix = zip(test_results.msg_lengths, test_results.medians.iteritems())
        for msg_length, (_, matrix) in length_and_matrix:
            pings = matrix_to_table(matrix).set_index(["node1", "node2"], verify_integrity=True)
            # Many rows share the same class_ after this join.
            pings_with_class = pings.join(class_by_pair, how="left").reset_index(drop=True)
            mean_by_class[msg_length] = pings_with_class.groupby("class_").mean()

        # Reverse lookup table, indexed by (message length, class).
        combined = pd.concat(mean_by_class, names=["msg_len", "class_"])
        self._data = combined.rename(columns={"ping": "mean_of_medians_ping"})

    def predict(self, msg_len, node1, node2):
        """Return the stored mean ping for ``msg_len`` and the class of the pair.

        A single call takes about 1ms.
        """
        pair_class = get_class(node1, node2)
        row = self._data.loc[msg_len, pair_class]
        return row[0]

In [92]:
# Build the predictor from the run loaded into `test_results`.
predictor = Predictor(test_results)

### Let's try predicting stuff

In [93]:
# Regression check against a previously recorded prediction. The value is the
# result of floating-point averaging, so it is compared with a tight relative
# tolerance instead of exact equality, which can break on a different
# pandas/numpy build or summation order.
assert np.isclose(
    predictor.predict(4000, "n48003", "n48009"),
    5.3229932539018043e-06,
    rtol=1e-9,
    atol=0.0,
)

## Measure prediction speed

In [51]:
# Show the two node names stored in row 5 of node_pairs.
print(*node_pairs.iloc[5][["node1", "node2"]])

n48001 n48006


In [58]:
%%timeit -r1
# The timed statement includes picking a random message length and a random
# row of node_pairs, not only the predict() call itself.
predictor.predict(
    choice(test_results.msg_lengths),
    *node_pairs.iloc[randint(0, len(node_pairs) - 1)][["node1", "node2"]]
)

1000 loops, best of 1: 1.05 ms per loop


## Testing stuff

In [98]:
# Small hand-made set of (msg_len, class_) keys with no data columns,
# sorted by index; the helper "text" column is discarded.
size_class_pairs = (
    pd.DataFrame(
        data=[[500, 4, "foo"], [400, 3, "bar"], [6300, 0, "bazz"], [400, 6, "hi"]],
        columns=["msg_len", "class_", "text"],
    )
    .drop("text", axis=1)
    .set_index(["msg_len", "class_"], verify_integrity=True)
    .sort_index()
)

In [99]:
# Display the key table; sort_index() returns a sorted copy and leaves
# size_class_pairs itself untouched.
size_class_pairs.sort_index()

msg_len,class_
400,3
400,6
500,4
6300,0


In [100]:
# Right join: keep only the (msg_len, class_) index pairs listed in size_class_pairs.
# NOTE(review): the remark that used to sit on this line said "not implemented",
# yet the recorded output below shows the join returning rows -- confirm whether
# that remark is stale or refers to something else.
predictor._data.join(size_class_pairs, how="right")

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_of_medians_ping
msg_len,class_,Unnamed: 2_level_1
400,3,5e-06
400,6,5e-06
500,4,5e-06
6300,0,0.0


## Validation

### Validate on all other test results