In [73]:
import re
from openpyxl import load_workbook # pip install --user openpyxl
from itertools import chain
import pandas as pd
from IPython.display import display

In [2]:
# Path to the "cable journal" Excel file.
# It is an MS Excel spreadsheet with a list of node connections
# in the Lomonosov 2 cluster.
# The file itself is not included: I am not allowed to share it.
SPREADSHEET_FILENAME = r'wire_journal_48_53.xlsx'

In [3]:
# Regex for parsing the rack number and the other numbers out of the
# spreadsheet cells that hold switch names, e.g. "КГК.63.2.4".
# Named groups: `rack`, `second_number`, `last_number`.
# The pattern is compiled with re.VERBOSE, so the whitespace and the
# `#` comments inside it are ignored by the regex engine.
switch_regex = re.compile(
    r"""
    КГК\.       # literally match what is written here
    (?P<rack>\d+)\.        # rack number is one or more digits, followed by dot
    (?P<second_number>\d+)\.            # then goes another non-negative integer followed by dot
    (?P<last_number>\d+)            # and another integer of the same form
    """,
    re.VERBOSE)

# Sanity check: all three numbers are captured, as strings.
assert switch_regex.match("КГК.63.2.4").groups() == ("63", "2", "4")

## Parse data from the spreadsheet using openpyxl

In [4]:
def get_column_name(column):
    """Return the letter name (for example 'C') of a worksheet column.

    parameters:
        column -- non-empty tuple/list of cells that belong to
            the same worksheet column
    returns:
        column letter as a string

    Newer openpyxl releases expose the letter as ``cell.column_letter``
    and make ``cell.column`` the 1-based integer index, while older
    releases stored the letter directly in ``cell.column``.  Both
    variants are handled so the result can always be compared with
    names such as 'C' or 'K'."""
    first_cell = column[0]
    letter = getattr(first_cell, "column_letter", None)
    if letter is not None:
        return letter
    name = first_cell.column
    if isinstance(name, int):
        # Integer index without a letter attribute: translate the
        # index into its letter with openpyxl's own helper.
        from openpyxl.utils import get_column_letter
        return get_column_letter(name)
    # Old-style cell: `column` already is the letter.
    return name

In [5]:
def extract_columns(worksheet, column_names):
    """Pick the requested columns out of a worksheet.

    parameters:
        worksheet -- worksheet whose ``columns`` are scanned
        column_names -- list of column names as strings,
            for example ['A', 'C', 'E']
    returns:
        list with the matching columns (each column is a tuple
        of cells), in the order the worksheet yields them

    An AssertionError is raised when the number of columns found
    differs from the number of names requested."""
    selected = []
    for column in worksheet.columns:
        if get_column_name(column) in column_names:
            selected.append(column)
    assert len(selected) == len(column_names)
    return selected

In [6]:
def columns_to_tuples(columns):
    """Transpose columns of cells into rows of plain values.

    parameters:
        columns -- tuple/list of columns, each column being a
            tuple/list of cells (objects with a ``value`` attribute)
    returns:
        list of lists; every inner list holds the values of one row,
        one value per input column.  Rows stop at the shortest column."""
    rows = []
    for cells_in_row in zip(*columns):
        rows.append([cell.value for cell in cells_in_row])
    return rows

In [7]:
def parse_switch_pairs(workbook):
    """Collect the connected switch pairs from every worksheet.

    Columns 'C' and 'K' of each worksheet in the openpyxl workbook
    are read row by row; a row [A, B] means that switches A and B
    are connected.

    Returns one flat list with a two-element list of cell values
    per row, over all worksheets."""
    rows_per_sheet = (
        columns_to_tuples(extract_columns(sheet, ['C', 'K']))
        for sheet in workbook
    )
    return list(chain.from_iterable(rows_per_sheet))

In [8]:
# Lomonosov2's racks are grouped into pairs.
# Switches in the same rack or in the same pair of racks are connected
# with copper wires; switches in different pairs of racks are connected
# with optical cable.
RACK_PAIRS = ((48, 49), (50, 51), (52, 53))

In [9]:
def get_rack(switch_name):
    """Determine the rack number from a switch name.

    parameters:
        switch_name -- string such as 'КГК.48.0.1'
    returns:
        rack number as int
    raises:
        ValueError -- if switch_name does not match switch_regex"""
    match = switch_regex.match(switch_name)
    if match is None:
        # Without this check a malformed name surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" error.
        raise ValueError(
            "cannot parse switch name: {!r}".format(switch_name))
    return int(match.group('rack'))

assert get_rack('КГК.48.0.1') == 48

In [18]:
def determine_material_between_switches(rack1, rack2, rack_pairs=None):
    """Determine the cable material between two switches.

    Switches in the same rack, or in the same pair of racks, are
    connected with copper; all other switches are connected with
    optic cable.

    parameters:
        rack1, rack2 -- rack numbers of the two switches
        rack_pairs -- optional iterable of rack groups (each one a
            container of rack numbers); defaults to RACK_PAIRS
    returns:
        'copper' or 'optic'"""
    if rack1 == rack2:
        # Same rack is always copper, even for a rack that is not
        # listed in any rack pair.
        return 'copper'
    if rack_pairs is None:
        rack_pairs = RACK_PAIRS
    for rack_pair in rack_pairs:
        if rack1 in rack_pair and rack2 in rack_pair:
            # they are in the same pair of racks
            return 'copper'
    return 'optic'

In [20]:
def get_cable_material_from_row(row):
    """Return the cable material for one row of the connections table.

    parameters:
        row -- row object whose ``.loc`` indexer provides
            "switch1_rack" and "switch2_rack"
    returns:
        result of determine_material_between_switches for
        these two racks"""
    return determine_material_between_switches(
        row.loc["switch1_rack"], row.loc["switch2_rack"])

In [21]:
# Quick check: two switches that sit in the same rack.
determine_material_between_switches(48, 48)

'copper'

In [22]:
# Open the cable-journal spreadsheet with openpyxl.
workbook = load_workbook(SPREADSHEET_FILENAME)

  warn(msg)


In [23]:
# Rows of connected switches, read from columns C and K of every worksheet.
x = parse_switch_pairs(workbook)

In [24]:
# One row per connection between two switches.
df = pd.DataFrame(x, columns=["switch1_name", "switch2_name"])

In [25]:
# Rack number of each endpoint, parsed out of the switch name.
df["switch1_rack"] = df["switch1_name"].apply(get_rack)
df["switch2_rack"] = df["switch2_name"].apply(get_rack)

In [27]:
# Classify every connection by the racks of its two endpoints
# (axis=1 hands each row to the function), then peek at the last rows.
df["cable_type"] = df.apply(get_cable_material_from_row, axis=1)
df.tail()

Unnamed: 0,switch1_name,switch2_name,switch1_rack,switch2_rack,cable_type
1531,КГК.51.1.3,КГК.53.6.2,51,53,optic
1532,КГК.51.2.1,КГК.53.5.4,51,53,optic
1533,КГК.51.2.3,КГК.53.5.2,51,53,optic
1534,КГК.51.3.1,КГК.53.4.4,51,53,optic
1535,КГК.51.3.3,КГК.53.4.2,51,53,optic


In [64]:
# Every distinct switch name that appears on either end of a connection,
# sorted by name. The index is kept from the unsorted series, so it is
# not in 0..n-1 order after sorting.
switches = pd.Series(list(frozenset(df["switch1_name"]).union(df["switch2_name"]))) \
    .rename("switch_name").sort_values()

In [65]:
# Number of distinct switches, followed by the first few names.
print(len(switches))
switches.head()

192


11     КГК.48.0.1
63     КГК.48.0.2
90     КГК.48.0.3
20     КГК.48.0.4
104    КГК.48.1.1
Name: switch_name, dtype: object

Now let's build a table of all connections, i.e. the links between switches and computational nodes.

In [67]:
# Empty table with "comp_node" and "switch" columns.
# NOTE(review): no later cell visible in this notebook refers to
# `computational_nodes`, and the tables built further down name their
# column "computational_node" instead -- confirm this cell is still needed.
computational_nodes = pd.DataFrame(columns=["comp_node", "switch"])
computational_nodes

Unnamed: 0,comp_node,switch


In [68]:
def get_matching_computational_nodes(switch, nodes_per_switch=8):
    """params:
        switch -- string, name of the switch, e.g. 'КГК.48.0.1'
        nodes_per_switch -- number of computational nodes connected
            to every switch (default 8)
    returns:
        list of strings which are names of computational
        nodes connected to this switch
    raises:
        ValueError -- if switch does not match switch_regex

    Node names have the form 'n<rack><second_number><NN>', where NN is
    a counter padded to at least two digits.  The switch whose last
    number is k owns the counters
    (k - 1) * nodes_per_switch + 1 .. k * nodes_per_switch."""
    match = switch_regex.match(switch)
    if match is None:
        # Fail with the offending name instead of an opaque
        # AttributeError on None.
        raise ValueError("cannot parse switch name: {!r}".format(switch))
    rack = match.group('rack')
    second_number = match.group('second_number')
    first_node = (int(match.group('last_number')) - 1) * nodes_per_switch + 1
    return [
        'n{0}{1}{2:02d}'.format(rack, second_number, node)
        for node in range(first_node, first_node + nodes_per_switch)
    ]

In [82]:
# Stack the per-switch node tables into one long table:
# one row per (computational node, switch) link.
bzz = pd.concat(
    [
        pd.DataFrame({
            "computational_node": get_matching_computational_nodes(switch),
            "switch": switch,
        })
        for switch in switches
    ],
    ignore_index=True,
)

In [75]:
# Preview only: build and display the node table for the first switch,
# then leave the loop.
# Note: this rebinds `x`, which an earlier cell used for the parsed
# switch pairs.
for switch in switches:
    # each switch has 8 computational nodes connected
    comp_nodes = get_matching_computational_nodes(switch)
    x = pd.DataFrame.from_dict(
        {"computational_node": comp_nodes, "switch": switch})
    display(x)
    break

Unnamed: 0,computational_node,switch
0,n48001,КГК.48.0.1
1,n48002,КГК.48.0.1
2,n48003,КГК.48.0.1
3,n48004,КГК.48.0.1
4,n48005,КГК.48.0.1
5,n48006,КГК.48.0.1
6,n48007,КГК.48.0.1
7,n48008,КГК.48.0.1
