In [1]:
import re
from openpyxl import load_workbook # pip install --user openpyxl
from itertools import chain
import pandas as pd
from IPython.display import display

In [2]:
# Path to the "cable journal": an MS Excel spreadsheet that lists the
# node connections of the Lomonosov 2 cluster.
# The file itself cannot be shared, so it has to be supplied locally
# (the path is relative to the notebook's working directory).
SPREADSHEET_FILENAME = r'wire_journal_48_53.xlsx'

In [3]:
# Regex for parsing the rack number and the two further numbers out of
# spreadsheet cells that hold switch names such as "КГК.63.2.4".
# Named groups: `rack`, `second_number`, `last_number` (all captured as
# digit strings).
# NOTE: the notebook applies it with `.match()`, which anchors only at the
# start of the string, and the pattern has no end anchor, so text following
# the third number is not rejected.
switch_regex = re.compile(
    r"""
    КГК\.       # literally match what is written here
    (?P<rack>\d+)\.        # rack number is one or more digits, followed by dot
    (?P<second_number>\d+)\.            # then goes another non-negative integer followed by dot
    (?P<last_number>\d+)            # and another integer of the same form
    """,
    re.VERBOSE)

# Sanity check: the three numbers come back as strings, in order.
assert switch_regex.match("КГК.63.2.4").groups() == ("63", "2", "4")

## Parse data from the spreadsheet using openpyxl

In [4]:
def get_column_name(column):
    """Return the letter name (for example 'C') of a worksheet column.

    parameters:
        column -- non-empty tuple/list of cells that belong to one column
    returns:
        the column letter(s) of the first cell, as a string

    Older openpyxl releases store the letter in ``cell.column``; newer
    ones store the 1-based integer index there and expose the letter as
    ``cell.column_letter``. Both layouts are handled, so comparisons
    against names such as 'C' keep working with either version.
    """
    first_cell = column[0]

    # Newer openpyxl: the letter is available directly.
    letter = getattr(first_cell, "column_letter", None)
    if isinstance(letter, str):
        return letter

    # Older openpyxl: `.column` already is the letter.
    index_or_letter = first_cell.column
    if isinstance(index_or_letter, str):
        return index_or_letter

    # Only an integer index is available: convert 1 -> 'A', 26 -> 'Z',
    # 27 -> 'AA', ... (bijective base-26 numbering).
    remaining = int(index_or_letter)
    letters = ""
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters

In [5]:
def extract_columns(worksheet, column_names):
    """Pick the named columns out of a worksheet.

    parameters:
        worksheet -- worksheet whose ``columns`` are scanned
        column_names -- list of column names as strings, for example
            ['A', 'C', 'E']
    returns:
        list of the matching columns, in worksheet order; every column
        is represented as a tuple of cells

    An AssertionError is raised when the number of columns found differs
    from the number of names requested.
    """
    selected = []
    for column in worksheet.columns:
        if get_column_name(column) in column_names:
            selected.append(column)
    assert len(selected) == len(column_names)
    return selected

In [6]:
def columns_to_tuples(columns):
    """Transpose columns of cells into rows of plain values.

    parameters:
        columns -- tuple/list of columns, each one a tuple of cells
    returns:
        list of lists; the i-th list holds the ``.value`` of the i-th
        cell of every column (rows stop at the shortest column)
    """
    rows = []
    for row_cells in zip(*columns):
        rows.append([cell.value for cell in row_cells])
    return rows

In [7]:
def parse_switch_pairs(workbook):
    """Collect the connected-switch pairs from every worksheet.

    For each worksheet of the openpyxl workbook, columns 'C' and 'K' are
    read side by side; a pair (A, B) means that switches A and B are
    connected.

    Returns one flat list covering all worksheets, with one two-element
    list of cell values per spreadsheet row.
    """
    pairs = []
    for worksheet in workbook:
        endpoint_columns = extract_columns(worksheet, ['C', 'K'])
        pairs.extend(columns_to_tuples(endpoint_columns))
    return pairs

In [8]:
# Lomonosov 2's racks are grouped into pairs.
# Switches in the same rack, or in the two racks of one pair, are
# connected with copper wires.
# Switches in different pairs of racks are connected with optic cable.
# Each inner tuple below lists the rack numbers of one such pair.
RACK_PAIRS = ((48, 49), (50, 51), (52, 53))

In [58]:
def get_rack(switch_name):
    """Return the rack number encoded in a switch name, as an int.

    The name is parsed with ``switch_regex``; the ``rack`` group is the
    first number after the "КГК." prefix.
    """
    parsed = switch_regex.match(switch_name)
    rack_digits = parsed.group('rack')
    return int(rack_digits)

assert get_rack('КГК.48.0.1') == 48

In [59]:
def determine_material_between_switches(rack1, rack2, rack_pairs=None):
    """Determine the cable material between two switches from their racks.

    Rule (see the comment next to RACK_PAIRS): switches in the same rack,
    or in the two racks of one rack pair, are connected with copper;
    switches in different rack pairs are connected with optic cable.

    parameters:
        rack1, rack2 -- rack numbers of the two switches
        rack_pairs -- optional iterable of rack groups (each a container
            of rack numbers); defaults to the module-level RACK_PAIRS
    returns:
        'copper' or 'optic'
    """
    if rack_pairs is None:
        rack_pairs = RACK_PAIRS

    # Same rack always means copper, even for a rack that is not listed
    # in any pair (previously such a rack fell through to 'optic').
    if rack1 == rack2:
        return 'copper'

    for rack_pair in rack_pairs:
        if rack1 in rack_pair and rack2 in rack_pair:
            # both racks belong to the same pair of racks
            return 'copper'
    return 'optic'

In [61]:
def get_cable_material_from_row(row):
    """Return the cable material for one connection row.

    parameters:
        row -- row with the labels "switch1_rack" and "switch2_rack"
            (accessed through ``row.loc``)
    returns:
        the result of determine_material_between_switches for the two
        rack numbers
    """
    return determine_material_between_switches(
        row.loc["switch1_rack"],
        row.loc["switch2_rack"],
    )

In [114]:
def get_second_number(switch_name):
    """Return the second number of a switch name as a string of digits
    (the ``second_number`` group of ``switch_regex``)."""
    parsed = switch_regex.match(switch_name)
    return parsed.group('second_number')

In [115]:
def get_third_number(switch_name):
    """Return the third (last) number of a switch name as a string of
    digits (the ``last_number`` group of ``switch_regex``)."""
    parsed = switch_regex.match(switch_name)
    return parsed.group('last_number')

In [62]:
# Load the cable journal workbook from the path configured in SPREADSHEET_FILENAME.
workbook = load_workbook(SPREADSHEET_FILENAME)

  warn(msg)


In [89]:
# One [column C value, column K value] entry per spreadsheet row, across all worksheets.
switch_pairs = parse_switch_pairs(workbook)

In [116]:
# Table of all distinct switches, with the numbers parsed from their names.
#
# This cell used to read an undefined variable `x`, so it failed with a
# NameError on a fresh kernel. The names are now taken from `switch_pairs`,
# the only pair list this notebook builds.
# Cells that do not parse as switch names are skipped -- presumably header
# or blank spreadsheet cells; TODO confirm against the journal.
switch_names = sorted({
    cell_value
    for pair in switch_pairs
    for cell_value in pair
    if isinstance(cell_value, str) and switch_regex.match(cell_value)
})
switches = (
    pd.DataFrame({"name": switch_names})
    .assign(rack_number=lambda df: df["name"].apply(get_rack))
    .assign(second_number=lambda df: df["name"].apply(get_second_number))
)

In [132]:
# Table of all distinct switches: name, rack number, and the second and
# third numbers of the name.
#
# This cell used to read an undefined variable `x`, so it failed with a
# NameError on a fresh kernel. The names are now taken from `switch_pairs`,
# the only pair list this notebook builds.
# Cells that do not parse as switch names are skipped -- presumably header
# or blank spreadsheet cells; TODO confirm against the journal.
switch_names = sorted({
    cell_value
    for pair in switch_pairs
    for cell_value in pair
    if isinstance(cell_value, str) and switch_regex.match(cell_value)
})
switches = pd.DataFrame({"name": switch_names})
switches["rack_number"] = switches["name"].apply(get_rack)
switches["second_number"] = switches["name"].apply(get_second_number)
switches["third_number"] = switches["name"].apply(get_third_number)

In [133]:
# Inspect the switch table built above.
switches

Unnamed: 0,name,rack_number,second_number,third_number
0,КГК.48.0.1,48,0,1
1,КГК.48.0.2,48,0,2
2,КГК.48.0.3,48,0,3
3,КГК.48.0.4,48,0,4
4,КГК.48.1.1,48,1,1
5,КГК.48.1.2,48,1,2
6,КГК.48.1.3,48,1,3
7,КГК.48.1.4,48,1,4
8,КГК.48.2.1,48,2,1
9,КГК.48.2.2,48,2,2


In [141]:
# Switch-to-switch connection table: one row per cable, with its material.
#
# This cell used to read `switch_to_switch_connections` before any visible
# cell defined it, so it only worked with leftover kernel state. It is now
# seeded from `switch_pairs`, which makes it re-runnable and idempotent.
# The inner merges keep only rows whose both ends appear in `switches`
# (so any non-switch cells in the pairs are dropped).
switch_to_switch_connections = (
    pd.DataFrame(switch_pairs, columns=["switch1", "switch2"])

    # add rack numbers for switch1 column
    .merge(
        switches,
        left_on=["switch1"], right_on=["name"])
    .rename(columns={"rack_number": "switch1_rack"})
    [["switch1", "switch2", "switch1_rack"]]

    # add rack numbers for switch2 column
    .merge(
        switches,
        left_on=["switch2"], right_on=["name"])
    .rename(columns={"rack_number": "switch2_rack"})
    [["switch1", "switch2", "switch1_rack", "switch2_rack"]]

    # add cable type (row-wise, from the two rack numbers)
    .assign(cable_type=lambda df: df.apply(get_cable_material_from_row, axis=1))

    # the rack columns were only needed to derive the cable type
    .drop(["switch1_rack", "switch2_rack"], axis=1)
)

In [142]:
# Inspect the connection table (switch1, switch2, cable_type).
switch_to_switch_connections

Unnamed: 0,switch1,switch2,cable_type
0,КГК.48.0.1,КГК.48.0.3,copper
1,КГК.48.0.1,КГК.48.3.3,copper
2,КГК.48.0.3,КГК.48.3.3,copper
3,КГК.48.1.1,КГК.48.3.3,copper
4,КГК.48.1.3,КГК.48.3.3,copper
5,КГК.48.2.1,КГК.48.3.3,copper
6,КГК.48.2.3,КГК.48.3.3,copper
7,КГК.48.3.1,КГК.48.3.3,copper
8,КГК.48.0.1,КГК.48.1.1,copper
9,КГК.48.0.3,КГК.48.1.1,copper


In [143]:
# AFTER THIS EVERYTHING IS BADLY WRITTEN, MUST REFACTOR

In [None]:
# `df` was never defined in this notebook (it only existed as leftover
# kernel state), so this cell failed on a fresh kernel. It is rebuilt here
# from `switch_to_switch_connections` with the column names the following
# cells expect: switch1_name, switch2_name, switch1_rack, switch2_rack.
df = (
    switch_to_switch_connections[["switch1", "switch2"]]
    .rename(columns={"switch1": "switch1_name", "switch2": "switch2_name"})
    .assign(
        switch1_rack=lambda frame: frame["switch1_name"].apply(get_rack),
        switch2_rack=lambda frame: frame["switch2_name"].apply(get_rack),
    )
)
# cable material is derived row by row from the two rack numbers
df["cable_type"] = df.apply(get_cable_material_from_row, axis=1)
df.tail()

In [None]:
# Distinct switch names taken from both endpoint columns of `df`,
# as a Series named "switch_name" sorted by value.
# NOTE: this rebinds `switches` to a Series.
all_switch_names = frozenset(df["switch1_name"]).union(df["switch2_name"])
switches = (
    pd.Series(list(all_switch_names))
    .rename("switch_name")
    .sort_values()
)

In [None]:
# Number of distinct switches, followed by a preview of the first entries.
print(len(switches))
switches.head()

Now let's build a table of all connections, i.e. both switch-to-switch links and links between switches and computational nodes.

In [None]:
def get_matching_computational_nodes(switch):
    """List the computational nodes attached to a switch.

    params:
        switch -- string, name of the switch (parsed with switch_regex)
    returns:
        list of 8 strings, the names of the computational nodes
        connected to this switch

    A node name is 'n' + rack + second number + a two-digit node index.
    The switch's last number k selects the index range
    (k - 1) * 8 + 1 .. (k - 1) * 8 + 8; for example 'КГК.48.0.2' yields
    'n48009' .. 'n48016'.
    """
    parsed = switch_regex.match(switch)
    rack = parsed.group('rack')
    second_number = parsed.group('second_number')
    # index of the node just before this switch's first node
    index_base = (int(parsed.group('last_number')) - 1) * 8

    node_names = []
    for position in range(1, 9):
        node_names.append(
            'n{0}{1}{2:02d}'.format(rack, second_number, index_base + position)
        )
    return node_names

In [None]:
# One row per (computational node, switch it is attached to).
#
# `switches` is a DataFrame in the earlier cells and a Series of names in
# a later one. Iterating a DataFrame yields its column labels rather than
# the switch names, so pick the names explicitly for either shape.
switch_names_for_nodes = (
    switches["name"] if isinstance(switches, pd.DataFrame) else switches
)

# Build all records first and create the frame once, instead of
# concatenating one small frame per switch.
computational_nodes = pd.DataFrame(
    [
        {"computational_node": node, "switch": switch}
        for switch in switch_names_for_nodes
        for node in get_matching_computational_nodes(switch)
    ],
    columns=["computational_node", "switch"],
)
# Expected size of this data set: 8 nodes per switch, i.e. 192 switches.
assert len(computational_nodes) == 1536

In [None]:
# Preview both ends of the node table: first rows, then last rows.
display(computational_nodes.head(), computational_nodes.tail())

In [None]:
# let's make the table of all edges
all_edges = df \
    .drop(["switch1_rack", "switch2_rack"], axis=1) \
    .rename(columns={
        "switch1_name": "node1_name",
        "switch2_name": "node2_name",
        "cable_type": "connection_type"})
all_edges["node1_type"] = "switch"
all_edges["node2_type"] = "switch"
all_edges.head()

In [None]:
# Number of switch-to-switch edges.
len(all_edges)

In [None]:
# Stack the switch-to-switch edges and the node-to-switch edges into one
# table. `ignore_index=True` gives the result a fresh 0..n-1 index; without
# it the index labels of both inputs would be repeated.
# NOTE(review): the node-to-switch rows carry only node1_name/node2_name,
# so connection_type, node1_type and node2_type are NaN for them -- TODO
# decide which values they should get.
pd.concat(
    [
        all_edges,
        computational_nodes.rename(columns={
            "computational_node": "node1_name",
            "switch": "node2_name"
        }),
    ],
    ignore_index=True,
)