In [1]:
import pandas as pd
import numpy as np
from path import Path  # pip install --user path.py
import re
from IPython.display import display
from pprint import pprint
import netCDF4
from IPython.core.debugger import Pdb
from collections import namedtuple
from random import randint, choice

In [2]:
# load seaborn and other stuff for visualization
#import seaborn  # pip install --user seaborn
#from matplotlib import pyplot as plt
#%matplotlib inline

In [3]:
def debug_here():
    """Stop in the IPython debugger (Pdb) at the point of the call."""
    debugger = Pdb()
    debugger.set_trace()

In [4]:
# Inputs prepared outside this notebook: the table of classes and the table of node pairs.
# "shortest_path" is dropped right away; below, node_pairs is used via node1/node2/class_.
# NOTE(review): read_pickle can execute arbitrary code stored in the file — only load
# pickles that you produced yourself.
classes = pd.read_pickle("classes.pkl")
node_pairs = pd.read_pickle("paths_with_classes.pkl").drop("shortest_path", axis=1)

In [5]:
# Sub-directories of the results folder; each one is passed to import_data below.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
TEST_RESULT_DIRECTORIES = Path("/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results") \
    .dirs()

In [6]:
def read_benchmark_hostnames(path_to_file):
    """Lazily yield the short hostname found at the start of each line of a hosts file.

    A hostname is the letter "n" followed by five digits and must be followed
    by a dot (e.g. "n48003." -> "n48003").

    Args:
        path_to_file: path object providing ``.lines()``; the file is read
            immediately, parsing happens while the result is iterated.

    Returns:
        An iterator of hostname strings, one per non-blank line, in file order.

    Raises:
        ValueError: (during iteration) if a non-blank line does not start
            with a hostname of the expected form.
    """
    lines = path_to_file.lines()

    def parse_hostname(line_number, line):
        # Fail with the offending line instead of an opaque
        # "'NoneType' object has no attribute 'groups'".
        match = re.match(r"^(n\d{5})\.", line)
        if match is None:
            raise ValueError(
                "{0}: line {1} does not start with a hostname like 'n12345.': {2!r}"
                    .format(path_to_file, line_number, line)
            )
        return match.group(1)

    return (
        parse_hostname(line_number, line)
        for (line_number, line) in enumerate(lines, start=1)
        if line.strip()  # tolerate blank lines, e.g. a trailing empty line
    )

In [7]:
# Value returned by import_data: hostnames (tuple of node names), medians (one
# node-by-node matrix per message length) and msg_lengths (list of those lengths).
TestResults = namedtuple("TestResults", ["hostnames", "medians", "msg_lengths"])

In [10]:
# Message-length sweep that every test result must match (asserted in import_data).
START_MSG_LEN = 0
END_MSG_LEN = 10000  # exclusive: lengths are range(START, END, STEP), so the last one is 9900
MSG_LEN_STEP = 100
# Zero-based index of the last message length (99 here); there are STEPS + 1 lengths in total.
STEPS = (END_MSG_LEN - START_MSG_LEN) // MSG_LEN_STEP - 1

In [11]:
def import_data(directory):
    """Load one benchmark run from `directory` and return it as TestResults.

    Reads the host list from "network_hosts.txt" and the measurements from
    "network_median.nc" inside `directory` (a path object providing joinpath).

    Returns:
        TestResults with
        - hostnames: tuple of host names, used as both row and column labels,
        - medians: pd.Panel holding one hostnames x hostnames DataFrame per
          message length,
        - msg_lengths: list of the message lengths, range(start, end, step).

    Raises:
        AssertionError: if the file's host count, test type or message-length
            sweep differs from what this notebook expects
            (START_MSG_LEN, END_MSG_LEN, MSG_LEN_STEP).

    NOTE(review): pd.Panel was deprecated and later removed from pandas
    (0.25+), so this function needs an older pandas — confirm the environment.
    """
    hostnames = tuple(read_benchmark_hostnames(directory.joinpath("network_hosts.txt")))
    with netCDF4.Dataset(directory.joinpath("network_median.nc"), "r")  as dataset:
        # Sweep parameters as stored in the file (first element of each variable).
        step_len = dataset["step_length"][0]
        start_len = dataset["begin_mes_length"][0]
        end_len = dataset["end_mes_length"][0]

        # Sanity checks: the file must describe the same hosts and sweep we expect.
        assert len(hostnames) == dataset["proc_num"][0]
        assert dataset["test_type"][0] == 1  # NOTE(review): meaning of test_type 1 is not visible here
        assert start_len == START_MSG_LEN
        assert end_len == END_MSG_LEN  # last message length should be 9900
        assert step_len == MSG_LEN_STEP
        # `steps` is only used to check that the sweep divides evenly into whole steps.
        steps = (end_len - start_len) // step_len - 1
        assert start_len + (steps + 1) * step_len == end_len

        # end_len is exclusive, so the lengths run start_len, ..., end_len - step_len.
        lengths = range(start_len, end_len, step_len)

        # dataset["data"][index] is taken as the matrix measured for the index-th length.
        data = {
            length: pd.DataFrame(dataset["data"][index], index=hostnames, columns=hostnames)
            for (index, length) in enumerate(lengths)
        }
        panel = pd.Panel(data)  # outermost (items) axis is the message length
    return TestResults(hostnames=hostnames, medians=panel, msg_lengths=list(lengths))

In [12]:
# Reference run used to build the predictor below.
# NOTE(review): which directory comes first depends on the order returned by .dirs().
test_results = import_data(TEST_RESULT_DIRECTORIES[0])

## Make sure that all classes were covered by our test

In [13]:
def check_all_classes_covered(hostnames):
    """Assert that the tested hosts exercise every class of node pairs.

    Only pairs whose two endpoints both appear in `hostnames` count as
    tested; the number of distinct classes among them must equal
    len(classes), otherwise an AssertionError is raised.
    """
    both_ends_tested = node_pairs["node1"].isin(hostnames) & node_pairs["node2"].isin(hostnames)
    covered_classes = node_pairs.loc[both_ends_tested, "class_"].unique()
    assert len(covered_classes) == len(classes)

In [14]:
# Fail early if the reference run does not cover every class of node pairs.
check_all_classes_covered(test_results.hostnames)

## Build extended_node_pairs with index

In [15]:
def build_extended_node_pairs(node_pairs):
    """Return `node_pairs` extended with every pair in reversed direction.

    The result is indexed by (node1, node2), so a pair can be looked up in
    either order. When the same ordered pair occurs more than once, the
    first occurrence is kept.
    """
    swapped = node_pairs.rename(columns={"node1": "node2", "node2": "node1"})
    both_directions = pd.concat(
        [node_pairs, swapped],
        ignore_index=True, verify_integrity=True
    )
    unique_pairs = both_directions.drop_duplicates(subset=["node1", "node2"])
    return unique_pairs.set_index(["node1", "node2"], verify_integrity=True)

In [16]:
# Lookup table indexed by (node1, node2) in both directions; used by get_class and Predictor.
extended_node_pairs = build_extended_node_pairs(node_pairs)

## More helper functions

In [17]:
def get_class(node1, node2):
    """Return the class_ of the ordered node pair (node1, node2).

    Looks the pair up in the module-level `extended_node_pairs` table, which
    is indexed by (node1, node2).

    Raises:
        KeyError: if the pair is not present in the table.
    """
    # One full-key lookup instead of three chained .loc calls: no intermediate
    # sub-frame / row Series is built, and class_ keeps its own dtype instead
    # of being upcast to the common dtype of a whole row.
    return extended_node_pairs.loc[(node1, node2), "class_"]

In [18]:
def uniques_in_matrix(matrix):
    """Return the frozenset of distinct values contained in a DataFrame.

    Args:
        matrix: pandas DataFrame; row and column labels are irrelevant.

    Returns:
        frozenset of the cell values (empty for an empty frame).
    """
    # Walk the underlying value array once instead of doing a label lookup
    # (matrix[col].loc[row]) for every single cell.
    return frozenset(matrix.values.ravel())

## Test uniques_in_matrix

In [19]:
def test_uniques_in_matrix():
    uniques1 = uniques_in_matrix(
        pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["c1", "c2", "c3"], index=["i1", "i2", "i3"])
    )
    assert frozenset({1, 2, 3, 4, 5, 6, 7, 8, 9}) == uniques1
    
    uniques2 = uniques_in_matrix(
        pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["i1", "i2", "i3"], index=["i1", "i2", "i3"])
    )
    assert uniques2 == uniques1

In [20]:
# Run the self-check right away so that a broken helper fails in this cell.
test_uniques_in_matrix()

## Count unique values in medians matrices

In [21]:
def count_unique_medians(medians):
    """Print the smallest and the largest number of distinct values per matrix.

    `medians` is a container of matrices supporting len(), positional
    `.iloc[i]` and `.keys()` (the message lengths). For each extreme, the
    first matrix reaching it is reported together with its message length.
    """
    counts = [
        len(uniques_in_matrix(medians.iloc[position]))
        for position in range(len(medians))
    ]

    smallest = min(counts)
    largest = max(counts)
    # list.index returns the first occurrence, i.e. the earliest message length on ties.
    position_of_smallest = counts.index(smallest)
    position_of_largest = counts.index(largest)

    print(
        "Minimum number of unique values in matrix is {0}. Message length = {1}."
            .format(smallest, medians.keys()[position_of_smallest])
    )
    print(
        "Maximum number of unique values in matrix is {0}. Message length = {1}."
            .format(largest, medians.keys()[position_of_largest])
    )

In [22]:
def print_all_unique_counts():
    """For every result directory, print its name and its unique-value extremes."""
    for result_dir in TEST_RESULT_DIRECTORIES:
        # Load first: a directory that fails to import prints nothing.
        results = import_data(result_dir)
        print(result_dir.basename())
        count_unique_medians(results.medians)

# print_all_unique_counts()  # this takes a lot of time

## Prediction Algorithm

In [23]:
def matrix_to_table(matrix):
    """Convert a node-by-node matrix into a long table: node1, node2, ping.

    Each cell becomes one row; node1 is the row label and node2 the column
    label of that cell.
    """
    stacked = matrix.stack()
    long_table = stacked.reset_index()
    long_table.columns = ["node1", "node2", "ping"]
    return long_table

In [24]:
class Predictor():
    """Predicts ping for a packet with specific message_size between 2 nodes,
    measured in seconds.

    The prediction for a pair of nodes is the mean of the measured median
    pings over all tested pairs of the same class, for the same message
    length. Node classes come from the module-level `extended_node_pairs`.
    """
    def __init__(self, test_results):
        """Build the (msg_len, class_) -> mean ping lookup table.

        Args:
            test_results: TestResults; `medians` is iterated with
                .iteritems() (one node-by-node matrix per message length)
                and `msg_lengths` must list the lengths in the same order.
        """
        # prepare pings matrices for join: long format, indexed by (node1, node2)
        pings = (matrix_to_table(df).set_index(["node1", "node2"], verify_integrity=True)
                 for (_, df) in test_results.medians.iteritems())

        # build tables with 2 columns each: class_, ping. There will be many rows with same class_
        pings_classes = (
            df.join(extended_node_pairs, how="left").reset_index(drop=True)
            for df in pings
        )

        # reverse lookup table (by message length and class)
        self._data = pd.concat(
            {
                msg_length: df.groupby("class_").mean()
                for (msg_length, df) in zip(test_results.msg_lengths, pings_classes)
            },
            names=["msg_len", "class_"]
        ).rename(columns={"ping": "mean_of_medians_ping"})

    def predict(self, msg_len, node1, node2):
        """Return the predicted ping for one message length and one node pair.

        This function takes about 1ms (author's measurement).

        Raises:
            KeyError: if the pair or the (msg_len, class) combination is unknown.
        """
        # Explicit (row key, column) lookup: unlike `.loc[msg_len, cls][0]` it
        # cannot be mistaken for a row/column pair and does not depend on
        # positional access into a label-indexed Series.
        return self._data.loc[(msg_len, get_class(node1, node2)), "mean_of_medians_ping"]

    def predict_many(self, df):
        """Vectorised prediction for many (msg_len, node1, node2) rows.

        df must have columns: msg_len, node1, node2.

        Returns:
            DataFrame with columns node1, node2, predicted_ping, with the same
            rows and order as `df`; predicted_ping is NaN where no prediction
            exists (both joins are left joins).
        """
        # Use this instance's table (self._data), not a global `predictor`.
        return df \
            .join(extended_node_pairs, on=["node1", "node2"], how="left") \
            .join(self._data, on=["msg_len", "class_"], how="left") \
            .rename(columns={"mean_of_medians_ping": "predicted_ping"}) \
            [["node1", "node2", "predicted_ping"]]

In [25]:
# Predictor built from the reference run loaded above.
predictor = Predictor(test_results)

### Let's try predicting stuff

In [29]:
# Example: predicted ping (seconds) for a message length of 3900 between two nodes.
predictor.predict(3900, "n48003", "n48009")

5.0067901611328125e-06

In [None]:
# Class of the node pair used in the example prediction.
# get_class is a module-level helper, not an attribute of Predictor.
get_class("n48003", "n48009")

In [26]:
# Regression check against a value recorded from an earlier run. The prediction is a
# mean of floats, so compare with a relative tolerance instead of exact equality.
# atol=0.0 matters: numpy's default absolute tolerance (1e-8) is not negligible
# next to a ping of about 5e-6 seconds.
# NOTE(review): if this still fails, the recorded expectation is stale — the actual
# value is shown in the assertion message.
expected_ping = 5.3229932539018043e-06
actual_ping = predictor.predict(4000, "n48003", "n48009")
assert np.isclose(actual_ping, expected_ping, rtol=1e-6, atol=0.0), actual_ping

AssertionError: 

## Measure prediction speed

In [None]:
# Show one arbitrary node pair (row 5); unpacking the row yields node1 and node2.
print(*node_pairs.iloc[5][["node1", "node2"]])

In [None]:
%%timeit -r1
# Time one prediction for a random message length and a random row of node_pairs.
# random.randint includes both ends, hence len(node_pairs) - 1.
predictor.predict(
    choice(test_results.msg_lengths),
    *node_pairs.iloc[randint(0, len(node_pairs) - 1)][["node1", "node2"]]
)

## Implementing predict_many

In [None]:
def get_random_samples(count):
    """Return `count` random prediction queries.

    Args:
        count: number of rows; rows are drawn from node_pairs without
            replacement, so it must not exceed len(node_pairs).

    Returns:
        DataFrame with columns node1, node2 (a random row of node_pairs) and
        msg_len (a random tested message length:
        START_MSG_LEN + i * MSG_LEN_STEP for i in 0..STEPS inclusive).
    """
    # np.random.randint excludes its upper bound (unlike random.randint), so
    # STEPS + 1 is needed for the last message length (index STEPS) to be drawn.
    length_indices = np.random.randint(0, STEPS + 1, size=count)
    return node_pairs.sample(n=count)[["node1", "node2"]] \
        .assign(msg_len=START_MSG_LEN + length_indices * MSG_LEN_STEP)

In [None]:
# 100000 random queries, used as the input for timing predict_many below.
df = get_random_samples(100000)

In [None]:
%%timeit
# Time the vectorised prediction over the whole batch of queries.
predictor.predict_many(df)

## Validation

### Validate on all other test results