In [1]:
import pandas as pd
import numpy as np
from path import Path  # pip install --user path.py
import re
from IPython.display import display
from pprint import pprint
import netCDF4
from IPython.core.debugger import Pdb
from collections import namedtuple

In [2]:
# load seaborn and other stuff for visualization
import seaborn  # pip install --user seaborn
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
def debug_here():
    """Create an IPython ``Pdb`` debugger and call ``set_trace`` on it."""
    debugger = Pdb()
    debugger.set_trace()

In [4]:
# Load the pickled inputs; both file names are relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
classes = pd.read_pickle("classes.pkl")
# The "shortest_path" column is dropped right after loading; the cells below
# use the "node1", "node2" and "class_" columns of this table.
node_pairs = pd.read_pickle("paths_with_classes.pkl").drop("shortest_path", axis=1)

In [5]:
# Sub-directories of the results folder; each one is passed to import_data.
# The listing is sorted because directory listings come back in arbitrary
# order, and the notebook relies on positional access (TEST_RESULT_DIRECTORIES[0]).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
TEST_RESULT_DIRECTORIES = sorted(
    Path("/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results")
    .dirs()
)

In [6]:
def read_benchmark_hostnames(path_to_file):
    """Return a generator of short host names, one per non-blank line of the file.

    A short host name is the leading ``n`` followed by five digits that
    precedes the first dot of the line (e.g. ``n12345`` for ``n12345.domain``).

    path_to_file -- object with a ``lines()`` method returning the file's lines.

    Blank lines are skipped.  Iterating the result raises ValueError, naming
    the offending line, when a non-blank line does not start with such a name.
    """
    hostname_pattern = re.compile(r"^(n\d{5})\.")
    # Read up front so that I/O errors surface at call time, not on first iteration.
    lines = path_to_file.lines()

    def hostnames():
        for line_number, line in enumerate(lines, start=1):
            if not line.strip():
                continue
            match = hostname_pattern.match(line)
            if match is None:
                raise ValueError(
                    "{0}: line {1} is not a benchmark host name: {2!r}"
                        .format(path_to_file, line_number, line)
                )
            yield match.group(1)

    return hostnames()

In [7]:
# Bundle returned by import_data:
#   hostnames   - tuple of host names read from network_hosts.txt
#   medians     - pd.Panel with one host-by-host DataFrame per message length
#   msg_lengths - list of the message lengths covered by the benchmark
TestResults = namedtuple("TestResults", ["hostnames", "medians", "msg_lengths"])

In [8]:
def import_data(directory):
    """Load one benchmark result directory into a TestResults tuple.

    Host names come from "network_hosts.txt" and the measurement matrices
    from "network_median.nc", both located inside `directory` (an object
    providing `joinpath`).

    Returns TestResults where `hostnames` is a tuple, `medians` is a pd.Panel
    holding one DataFrame per message length (rows and columns labelled by
    host name, keyed "message_len_<length>") and `msg_lengths` is the list of
    those lengths.

    Raises AssertionError when the file's metadata does not match the
    hard-coded expectations checked below.
    """
    hostnames = tuple(read_benchmark_hostnames(directory.joinpath("network_hosts.txt")))
    with netCDF4.Dataset(directory.joinpath("network_median.nc"), "r")  as dataset:
        # Each metadata value is taken as the first element of its variable.
        step_len = dataset["step_length"][0]
        start_len = dataset["begin_mes_length"][0]
        end_len = dataset["end_mes_length"][0]
        
        # Only files produced with exactly these settings are accepted.
        # NOTE(review): the meaning of test_type == 1 is not visible here — confirm.
        assert len(hostnames) == dataset["proc_num"][0]
        assert dataset["test_type"][0] == 1
        assert start_len == 0
        assert end_len == 10000  # last message length should be 9900
        assert step_len == 100
        # `steps` is only used by the consistency check on the next line.
        steps = (end_len - start_len) // step_len - 1
        assert start_len + (steps + 1) * step_len == end_len
        
        # end_len is exclusive: range() stops one step before it.
        lengths = range(start_len, end_len, step_len)
        
        # The index-th slice of "data" is paired with the index-th message length.
        data = {
            "message_len_{0}".format(length): \
                pd.DataFrame(dataset["data"][index], index=hostnames, columns=hostnames)
            for (index, length) in enumerate(lengths)
        }
        # NOTE(review): pd.Panel may order items built from a plain dict by key
        # (lexicographically) rather than by insertion, in which case positional
        # access into the panel would not line up with msg_lengths — confirm.
        panel = pd.Panel(data)
    return TestResults(hostnames=hostnames, medians=panel, msg_lengths=list(lengths))

In [9]:
# Results of the first listed directory; the cells below work on this run.
test_results = import_data(TEST_RESULT_DIRECTORIES[0])

## Make sure that all classes were covered by our test

In [10]:
def check_all_classes_covered(hostnames):
    """Assert that the tested hosts exercise every class of node pairs.

    Keeps the rows of the global `node_pairs` table whose two nodes are both
    in `hostnames` and requires the number of distinct "class_" values among
    them to equal the number of entries in the global `classes`.
    """
    first_node_tested = node_pairs["node1"].isin(hostnames)
    second_node_tested = node_pairs["node2"].isin(hostnames)
    covered_classes = node_pairs[first_node_tested & second_node_tested]["class_"].unique()
    assert len(covered_classes) == len(classes)

In [11]:
# Fails with AssertionError if the loaded run does not cover every class.
check_all_classes_covered(test_results.hostnames)

## More helper functions

In [12]:
class ClassGetter():
    """Callable returning the class of an unordered pair of nodes.

    The table given to the constructor must have "node1", "node2" and
    "class_" columns.  Lookups order the two names with min()/max(), so every
    pair is expected to be stored with the smaller name in "node1".
    """

    def __init__(self, node_pairs):
        # Index by (node1, node2) so a pair is resolved with a single lookup.
        self.reverse_lookup_table = node_pairs.set_index(["node1", "node2"])

    def __call__(self, node1, node2):
        """Return the "class_" value of the pair, whatever the argument order.

        Raises KeyError when the table has no row for the pair.
        """
        pair = (min(node1, node2), max(node1, node2))
        # One (row, column) lookup instead of three chained ones: no
        # intermediate sub-frame or row Series is built per call, and the value
        # keeps the dtype of the "class_" column rather than being upcast to
        # the common dtype of the whole row.
        return self.reverse_lookup_table.loc[pair, "class_"]

In [13]:
# Shared lookup instance; get_values_from_class calls it under this name.
get_class = ClassGetter(node_pairs)

In [14]:
def get_values_from_class(matrix, class_):
    """Lazily yield the matrix entries whose (row, column) pair is in `class_`.

    `matrix` must have identical row and column labels (checked with an
    assertion when the function is called).  The class of each pair is
    obtained from the global `get_class`.
    """
    row_labels = list(matrix.index)
    assert (matrix.columns == matrix.index).all()

    def matching_values():
        for row_label in row_labels:
            for column_label in matrix.columns:
                # Diagonal entries are skipped before get_class is consulted.
                # NOTE(review): presumably because the lookup table has no
                # (node, node) rows — the author marked this a "dirty hack".
                if row_label == column_label:
                    continue
                if get_class(row_label, column_label) == class_:
                    yield matrix.loc[row_label].loc[column_label]

    return matching_values()

In [15]:
def uniques_in_matrix(matrix):
    """Return the distinct cell values of a DataFrame as a frozenset.

    The values are read from the frame's underlying array in one pass instead
    of through one label lookup per cell.  That keeps large matrices fast and
    also works when row or column labels are repeated.  An empty frame gives
    an empty frozenset.
    """
    return frozenset(matrix.values.ravel())

## Test uniques_in_matrix

In [22]:
def test_uniques_in_matrix():
    """Check uniques_in_matrix on a 3x3 frame, first with distinct row and
    column labels and then with the same labels on both axes."""
    cells = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ["i1", "i2", "i3"]

    distinct_axes = pd.DataFrame(cells, columns=["c1", "c2", "c3"], index=row_labels)
    uniques1 = uniques_in_matrix(distinct_axes)
    assert frozenset({1, 2, 3, 4, 5, 6, 7, 8, 9}) == uniques1

    shared_axes = pd.DataFrame(cells, columns=row_labels, index=row_labels)
    uniques2 = uniques_in_matrix(shared_axes)
    assert uniques2 == uniques1

In [23]:
# Run the check now; an AssertionError means uniques_in_matrix is broken.
test_uniques_in_matrix()

## Count unique values in medians matrices

In [42]:
def count_unique_medians(medians):
    """Print the smallest and the largest number of distinct values found
    among the matrices of `medians`, each with the key of the matrix where it
    occurs (the earliest such matrix on ties).

    `medians` must support len(), positional `.iloc[i]` access returning a
    matrix, and `.keys()` returning the matching labels.
    """
    counts = []
    for position in range(len(medians)):
        counts.append(len(uniques_in_matrix(medians.iloc[position])))

    smallest = min(counts)
    largest = max(counts)
    # list.index gives the first position, so ties resolve to the earliest matrix.
    smallest_at = counts.index(smallest)
    largest_at = counts.index(largest)

    print(
        "Minimum number of unique values in matrix is {0}. Message length = {1}."
            .format(smallest, medians.keys()[smallest_at])
    )
    print(
        "Maximum number of unique values in matrix is {0}. Message length = {1}."
            .format(largest, medians.keys()[largest_at])
    )

In [None]:
# Report the unique-median counts for every result directory.
# NOTE(review): the output saved with this cell shows only a directory name
# and no counts — the run may not have completed; confirm before relying on it.
for directory in TEST_RESULT_DIRECTORIES:
    medians = import_data(directory).medians
    print(directory.basename())
    count_unique_medians(medians)

2016-11-04-lom2_100_nodes


In [43]:
# Same report, restricted to the run already loaded into test_results.
count_unique_medians(test_results.medians)

Minimum number of unique values in matrix is 5. Message length = message_len_600.
Maximum number of unique values in matrix is 38. Message length = message_len_6200.


In [44]:
# Show which directory test_results was loaded from.
TEST_RESULT_DIRECTORIES[0]

Path('/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2016-11-04-lom2_100_nodes')

## Prediction Algorithm

In [None]:
def make_predictor(test_results):
    """Build the per-message-length, per-class data used for prediction.

    Not implemented yet.  The draft iterated over `test_results.msg_lengths`
    and over `classes.index`, but the value to store for each class was never
    decided.  The function therefore raises when called, instead of making
    the whole cell fail to parse.

    Raises NotImplementedError on every call until that value is defined.
    """
    # TODO: presumably the result should map every message length to
    # {class_: <aggregate of that class's measurements>} — confirm which
    # aggregate is wanted before implementing.
    raise NotImplementedError(
        "make_predictor: the per-class value for each message length is not defined yet"
    )

In [None]:
def predict(test_results, message_length, node1, node2):
    