In [1]:
import re
from openpyxl import load_workbook # pip install --user openpyxl
from itertools import chain
import pandas as pd
from IPython.display import display
from bidict import frozenbidict # pip install --user bidict
from functools import reduce

In [2]:
# path to "cable journal" excel file
# it's a MS Excel spreadsheet with a list of node connections
# in Lomonosov 2 cluster
# I am not allowed to share it.
SPREADSHEET_FILENAME = r'wire_journal_48_53.xlsx'

In [3]:
# Regex that splits a switch name from the spreadsheet into its three numbers.
# Names have the form "КГК.<rack>.<second_number>.<last_number>",
# e.g. "КГК.63.2.4" (see the assert below).
# The pattern is compiled in VERBOSE mode, so the whitespace and the
# `#` comments inside the pattern string are ignored by the regex engine.
# It is used with .match() throughout this notebook, which anchors it at the
# start of the string only: text after the last number is not rejected.
switch_regex = re.compile(
    r"""
    КГК\.       # literally match what is written here
    (?P<rack>\d+)\.        # rack number is one or more digits, followed by dot
    (?P<second_number>\d+)\.            # then goes another non-negative integer followed by dot
    (?P<last_number>\d+)            # and another integer of the same form
    """,
    re.VERBOSE)

# Sanity check: the groups come back as strings, in rack / second / last order.
assert switch_regex.match("КГК.63.2.4").groups() == ("63", "2", "4")

## Parse data from the spreadsheet using openpyxl

In [4]:
def get_column_name(column):
    """Return the spreadsheet letter ('A', 'B', ..., 'AA', ...) of a column.

    parameters:
        column -- non-empty tuple/list of cells belonging to one column;
            only the first cell is inspected
    returns:
        the column letter as a string

    Older openpyxl releases store the letter in ``cell.column``; since
    openpyxl 2.6 that attribute holds the 1-based integer index instead.
    Both forms are accepted here, so comparing the result against letters
    such as 'C' or 'K' works regardless of the installed version.
    """
    name = column[0].column
    if isinstance(name, int):
        # Convert a 1-based index to bijective base-26 letters:
        # 1 -> 'A', 26 -> 'Z', 27 -> 'AA', ...
        letters = ""
        while name > 0:
            name, remainder = divmod(name - 1, 26)
            letters = chr(ord("A") + remainder) + letters
        return letters
    return name

In [5]:
def extract_columns(worksheet, column_names):
    """
    Pick the requested columns out of a worksheet.

    parameters:
        worksheet -- worksheet whose `.columns` yields the columns
        column_names -- list of column letters, for example
            ['A', 'C', 'E']
    returns:
        list of the matching columns (each one a tuple of cells),
        in the order in which the worksheet yields them

    An AssertionError is raised when the number of columns found
    differs from the number of names requested."""
    selected = []
    for candidate in worksheet.columns:
        if get_column_name(candidate) in column_names:
            selected.append(candidate)
    # every requested name must have produced exactly one column
    assert len(selected) == len(column_names)
    return selected

In [6]:
def columns_to_tuples(columns):
    """Transpose columns of cells into rows of plain values.

    parameters:
        columns -- tuple/list of columns, each a tuple of cells
    returns:
        list of rows; every row is a list holding the `.value` of
        one cell from each column. The result is as long as the
        shortest column (zip semantics)."""
    rows = []
    for cells_in_row in zip(*columns):
        rows.append([cell.value for cell in cells_in_row])
    return rows

In [7]:
def parse_switch_pairs(workbook):
    """Collect the switch-to-switch connections listed in a workbook.

    Columns 'C' and 'K' of every worksheet are read row by row;
    a row [A, B] means that switches A and B are connected.

    Returns one flat list with the rows of all worksheets, each row
    being a two-element list of cell values."""
    pairs = []
    for worksheet in workbook:
        endpoint_columns = extract_columns(worksheet, ['C', 'K'])
        pairs.extend(columns_to_tuples(endpoint_columns))
    return pairs

In [8]:
# Lomonosov-2 racks are grouped into pairs.
# Switches in the same rack, or in the two racks of one pair, are connected
# with copper wires; switches in racks of different pairs are connected
# with optical cable.
# NOTE: a rack that is missing from this table is never classified as
# "copper" by determine_material_between_switches, not even against itself.
RACK_PAIRS = ((48, 49), (50, 51), (52, 53))

In [9]:
def get_rack(switch_name):
    """Return the rack number encoded in a switch name, as an int.

    The name must match `switch_regex`; the `rack` group is converted
    with int()."""
    parsed = switch_regex.match(switch_name)
    rack_text = parsed.group('rack')
    return int(rack_text)

assert get_rack('КГК.48.0.1') == 48

In [10]:
def determine_material_between_switches(rack1, rack2):
    """Tell which cable material links two switches, given their racks.

    See the comment next to RACK_PAIRS for the wiring rule.

    Returns the string 'copper' when both rack numbers belong to one
    and the same entry of RACK_PAIRS (this includes rack1 == rack2),
    and 'optic' in every other case."""
    for rack_pair in RACK_PAIRS:
        if rack1 in rack_pair and rack2 in rack_pair:
            # both switches sit inside this pair of racks
            return 'copper'
    return 'optic'

In [11]:
def get_cable_material_from_row(row):
    """Row-wise adapter for determine_material_between_switches.

    `row` must support label lookup through `.loc` and carry the labels
    "switch1_rack" and "switch2_rack"; the material string for that pair
    of racks is returned."""
    return determine_material_between_switches(
        row.loc["switch1_rack"],
        row.loc["switch2_rack"],
    )

In [12]:
def get_second_number(switch_name):
    """Return the middle number of a switch name as a string of digits
    (unlike get_rack, the value is not converted to int)."""
    parsed = switch_regex.match(switch_name)
    return parsed.group('second_number')

In [13]:
def get_third_number(switch_name):
    """Return the last number of a switch name as a string of digits
    (unlike get_rack, the value is not converted to int)."""
    parsed = switch_regex.match(switch_name)
    return parsed.group('last_number')

In [14]:
# Open the cable-journal workbook named by SPREADSHEET_FILENAME.
# (The recorded output of this cell shows that a warning is emitted on load.)
workbook = load_workbook(SPREADSHEET_FILENAME)

  warn(msg)


In [15]:
# Flat list of [switch, switch] rows taken from columns C and K of every worksheet.
switch_pairs = parse_switch_pairs(workbook)

In [16]:
# One row per distinct switch name found in the pairs, sorted by name,
# plus the three numbers parsed out of the name
# (rack_number as int, the other two as strings).
switches = pd.DataFrame({
    "name": sorted(set(chain.from_iterable(switch_pairs)))
}).assign(
    rack_number=lambda df: df["name"].apply(get_rack),
    second_number=lambda df: df["name"].apply(get_second_number),
    third_number=lambda df: df["name"].apply(get_third_number),
)

In [17]:
# Inspect the switch table: one row per unique switch name.
switches

Unnamed: 0,name,rack_number,second_number,third_number
0,КГК.48.0.1,48,0,1
1,КГК.48.0.2,48,0,2
2,КГК.48.0.3,48,0,3
3,КГК.48.0.4,48,0,4
4,КГК.48.1.1,48,1,1
5,КГК.48.1.2,48,1,2
6,КГК.48.1.3,48,1,3
7,КГК.48.1.4,48,1,4
8,КГК.48.2.1,48,2,1
9,КГК.48.2.2,48,2,2


In [18]:
# Table of switch-to-switch links with the cable material of every link.
switch_to_switch_connections = (
    # start from the raw list of pairs as a two-column frame
    pd.DataFrame.from_records(switch_pairs, columns=["switch1", "switch2"])

    # look up the rack of the first endpoint
    .merge(switches, left_on="switch1", right_on="name")
    .rename(columns={"rack_number": "switch1_rack"})
    .loc[:, ["switch1", "switch2", "switch1_rack"]]

    # look up the rack of the second endpoint
    .merge(switches, left_on="switch2", right_on="name")
    .rename(columns={"rack_number": "switch2_rack"})
    .loc[:, ["switch1", "switch2", "switch1_rack", "switch2_rack"]]

    # classify every link from its two racks, then discard the helper columns
    .assign(cable_type=lambda df: df.apply(get_cable_material_from_row, axis=1))
    .drop(columns=["switch1_rack", "switch2_rack"])
)

In [19]:
# Inspect the switch-to-switch links and their cable type.
switch_to_switch_connections

Unnamed: 0,switch1,switch2,cable_type
0,КГК.48.0.1,КГК.48.0.3,copper
1,КГК.48.0.1,КГК.48.3.3,copper
2,КГК.48.0.3,КГК.48.3.3,copper
3,КГК.48.1.1,КГК.48.3.3,copper
4,КГК.48.1.3,КГК.48.3.3,copper
5,КГК.48.2.1,КГК.48.3.3,copper
6,КГК.48.2.3,КГК.48.3.3,copper
7,КГК.48.3.1,КГК.48.3.3,copper
8,КГК.48.0.1,КГК.48.1.1,copper
9,КГК.48.0.3,КГК.48.1.1,copper


Now let's build a table of connections between the switches and the
computational nodes that are attached directly to them.

In [20]:
def get_matching_computational_nodes(switch):
    """List the computational nodes attached to a switch.

    params:
        switch -- string, name of the switch (must match `switch_regex`)
    returns:
        list of 8 node names of the form 'n<rack><second_number><NN>'.
        The switch whose last number is k owns node numbers
        (k - 1) * 8 + 1 through k * 8, zero-padded to two digits."""
    parsed = switch_regex.match(switch)
    rack = parsed.group('rack')
    second_number = parsed.group('second_number')
    # node numbering starts right after the nodes of the preceding switches
    numbers_before = (int(parsed.group('last_number')) - 1) * 8

    node_names = []
    for offset in range(1, 9):
        node_names.append(
            'n{0}{1}{2:02d}'.format(rack, second_number, numbers_before + offset))
    return node_names

assert get_matching_computational_nodes("КГК.48.2.3") == [
    'n48217', 'n48218', 'n48219', 'n48220',
    'n48221', 'n48222', 'n48223', 'n48224',
]

In [21]:
# One small frame per switch (its nodes in one column, the switch name
# repeated in the other), stacked into a single table with a fresh 0..n-1 index.
comp_node_to_switch_connects = pd.concat(
    [
        pd.DataFrame({
            "computational_node": get_matching_computational_nodes(switch_name),
            "switch": switch_name,
        })
        for switch_name in switches["name"]
    ],
    ignore_index=True,
)
# sanity check: this journal is expected to yield exactly 1536 node links
assert len(comp_node_to_switch_connects) == 1536

In [22]:
# Inspect the node-to-switch links.
comp_node_to_switch_connects

Unnamed: 0,computational_node,switch
0,n48001,КГК.48.0.1
1,n48002,КГК.48.0.1
2,n48003,КГК.48.0.1
3,n48004,КГК.48.0.1
4,n48005,КГК.48.0.1
5,n48006,КГК.48.0.1
6,n48007,КГК.48.0.1
7,n48008,КГК.48.0.1
8,n48009,КГК.48.0.2
9,n48010,КГК.48.0.2


Now let's build a table with all graph edges, that is, a single table
holding both the switch-switch and the switch-comp_node connections.

In [23]:
# All graph edges in one table with columns node1 / node2 / connection_type.
# NOTE: the concat keeps the row labels of both inputs (no ignore_index),
# so index values repeat between the two parts.
edges = (
    pd.concat([
        # switch <-> switch links keep their cable type as the connection type
        switch_to_switch_connections.rename(columns={
            "switch1": "node1",
            "switch2": "node2",
            "cable_type": "connection_type",
        }),
        # node <-> switch links are all labelled "backplane"
        comp_node_to_switch_connects
            .assign(connection_type="backplane")
            .rename(columns={
                "computational_node": "node1",
                "switch": "node2",
            }),
    ])
    .assign(connection_type=lambda frame: frame["connection_type"].astype("category"))
)

In [24]:
# Inspect the combined edge table.
edges

Unnamed: 0,node1,node2,connection_type
0,КГК.48.0.1,КГК.48.0.3,copper
1,КГК.48.0.1,КГК.48.3.3,copper
2,КГК.48.0.3,КГК.48.3.3,copper
3,КГК.48.1.1,КГК.48.3.3,copper
4,КГК.48.1.3,КГК.48.3.3,copper
5,КГК.48.2.1,КГК.48.3.3,copper
6,КГК.48.2.3,КГК.48.3.3,copper
7,КГК.48.3.1,КГК.48.3.3,copper
8,КГК.48.0.1,КГК.48.1.1,copper
9,КГК.48.0.3,КГК.48.1.1,copper


Now let's make a table with all nodes and all their properties

In [25]:
# All graph nodes in one table with columns name / type_.
# NOTE: the concat keeps the row labels of both inputs (no ignore_index),
# so index values repeat between the two parts.
nodes = (
    pd.concat([
        # every switch ...
        switches[["name"]].assign(type_="switch"),
        # ... followed by every computational node
        comp_node_to_switch_connects
            .rename(columns={"computational_node": "name"})
            [["name"]]
            .assign(type_="computational"),
    ])
    .assign(type_=lambda frame: frame["type_"].astype("category"))
)

In [26]:
# Inspect the node table.
nodes

Unnamed: 0,name,type_
0,КГК.48.0.1,switch
1,КГК.48.0.2,switch
2,КГК.48.0.3,switch
3,КГК.48.0.4,switch
4,КГК.48.1.1,switch
5,КГК.48.1.2,switch
6,КГК.48.1.3,switch
7,КГК.48.1.4,switch
8,КГК.48.2.1,switch
9,КГК.48.2.2,switch


Now I want to make copies of these 2 tables with all node and edge properties
replaced with numbers. Categorical values such as (computational, switch) or
(copper, backplane, optic) should be translated to numbers like 0, 1, 2.

Then I will calculate the shortest path for every pair of nodes (assuming that
every edge has the same weight) and write down a sequence of numbers that
encodes the properties of all nodes and edges along that shortest
path.

E.g. `(comp_node) --backplane-- (switch) --optic-- (switch) --backplane-- (comp_node)`
might get translated to **1,2,0,1,0,2,1**.

In [27]:
def map_categorial_sequence_to_numbers(sequence):
    """Encode the values of a sequence as integers.

    Every distinct value gets one integer code; codes are handed out
    in the sorted order of the distinct values (pd.factorize with
    sort=True).

    Returns a tuple (codes, mapping):
      codes   -- the integer codes produced by pd.factorize, aligned
                 position by position with `sequence`
      mapping -- frozenbidict translating each distinct value to its code
    """
    codes, distinct_values = pd.factorize(sequence, sort=True)
    code_to_value = frozenbidict(enumerate(distinct_values))
    # callers want value -> code, which is the inverse view
    return codes, code_to_value.inv

In [28]:
def map_categorial_df_to_numbers(df, columns):
    """Encode selected columns of a DataFrame as integers (not in place).

    Arguments:
    df -- pandas.DataFrame (or a subclass of it) whose columns are encoded;
      the frame itself is never modified, a copy is returned
    columns -- sequence of column names to encode

    Returns a tuple (new_data_frame, mappings)
    new_data_frame is a copy of `df` in which the values of every column
      listed in `columns` are replaced with the integers they were mapped to
    mappings -- list of frozenbidicts (value -> integer), one per column,
      ordered the same way as the argument `columns`.

    Raises AssertionError when `df` is not a DataFrame."""
    # isinstance (rather than an exact type comparison) also lets
    # DataFrame subclasses through
    assert isinstance(df, pd.DataFrame)
    mappings = []
    new_df = df.copy()
    for column in columns:
        # always encode from the untouched input column
        new_df[column], mapping = map_categorial_sequence_to_numbers(df[column])
        mappings.append(mapping)
    return new_df, mappings

In [29]:
def test_map_categorial_df_to_numbers():
    """Check map_categorial_df_to_numbers on a small hand-made frame.

    The mappings and the encoded frame are displayed for visual
    inspection and, in addition, verified with assertions so that a
    regression actually fails instead of passing silently."""
    df = pd.DataFrame.from_records(
        [
            ("john", "male", "high"),
            ("lisa", "female", "low"),
            ("jack", "male", "low"),
            ("anna", "female", "medium"),
            ("boris", "male", "medium")
        ],
        columns=["name", "sex", "height"]
    )
    new_df, mappings = map_categorial_df_to_numbers(df, ["height", "sex"])
    display(mappings)
    display(new_df)

    # one mapping per requested column, in the requested order;
    # codes follow the sorted order of the distinct values
    assert [dict(mapping) for mapping in mappings] == [
        {"high": 0, "low": 1, "medium": 2},
        {"female": 0, "male": 1},
    ]
    assert new_df["height"].tolist() == [0, 1, 1, 2, 2]
    assert new_df["sex"].tolist() == [1, 0, 1, 0, 1]
    # columns that were not requested stay as they were ...
    assert new_df["name"].tolist() == ["john", "lisa", "jack", "anna", "boris"]
    # ... and so does the input frame
    assert df["height"].tolist() == ["high", "low", "low", "medium", "medium"]
    assert df["sex"].tolist() == ["male", "female", "male", "female", "male"]

# Uncomment to run the check (it also displays the intermediate results):
#test_map_categorial_df_to_numbers()

In [30]:
# Encode the categorical `type_` column as integers; `nodes` itself is left
# untouched because the helper works on a copy.
nodes_with_numerical_properties, nodes_mappings = map_categorial_df_to_numbers(nodes, ["type_"])

In [31]:
# Show the value -> integer mapping for `type_`, then the encoded node table.
display(nodes_mappings)
display(nodes_with_numerical_properties)

[frozenbidict({'computational': 0, 'switch': 1})]

Unnamed: 0,name,type_
0,КГК.48.0.1,1
1,КГК.48.0.2,1
2,КГК.48.0.3,1
3,КГК.48.0.4,1
4,КГК.48.1.1,1
5,КГК.48.1.2,1
6,КГК.48.1.3,1
7,КГК.48.1.4,1
8,КГК.48.2.1,1
9,КГК.48.2.2,1
