##### Imports the necessary modules and sets the system path to locate them.

In [1]:
import sys
import networkx
import astra_sim_sdk.astra_sim_sdk as astra_sim_kit
sys.path.append("../../utils")
from astra_sim import AstraSim, Collective, NetworkBackend
from infragraph.infragraph_service import InfraGraphService
from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric
from infragraph.blueprints.devices.server import Server
from infragraph.blueprints.devices.generic_switch import Switch


##### Connects the client to the AstraSim gRPC server, initializes the AstraSim SDK, and creates a folder (tagged as specified) containing all configuration details, generated results, and logs.

In [2]:
astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "htsim_clos_3tier_trial")

Resetting test directory
All contents of the folder /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial have been deleted.
Successfully connected to gRPC server at 172.17.0.2:8989


##### Creating Infragraph for 3 tier clos fabric

In [3]:
server = Server()
switch = Switch(port_count=8)
clos_fat_tree = ClosFatTreeFabric(switch, server, 3,[])
astra.configuration.infragraph.infrastructure.deserialize(clos_fat_tree.serialize())
print(astra.configuration.infragraph.infrastructure)

description: Clos Fat Tree Fabric
devices:
- components:
  - choice: cpu
    count: 1
    description: Generic CPU
    name: cpu
  - choice: xpu
    count: 2
    description: Generic GPU/XPU
    name: xpu
  - choice: switch
    count: 1
    description: NVLink Switch
    name: nvlsw
  - choice: switch
    count: 1
    description: PCI Express Switch Gen 4
    name: pciesw
  - choice: nic
    count: 2
    description: Generic Nic
    name: nic
  - choice: custom
    count: 1
    custom:
      type: mgmt-nic
    description: Mgmt Nic
    name: mgmt
  description: A generic server with npu_factor * 4 xpu(s)
  edges:
  - ep1:
      component: mgmt
    ep2:
      component: cpu[0]
    link: pcie
    scheme: one2one
  - ep1:
      component: cpu
    ep2:
      component: cpu
    link: fabric
    scheme: many2many
  - ep1:
      component: xpu
    ep2:
      component: nvlsw
    link: nvlink
    scheme: many2many
  - ep1:
      component: cpu[0]
    ep2:
      component: pciesw[0]
    link: p

##### Initialize Infragraph service and Display Fabric

In [4]:
service = InfraGraphService()
service.set_graph(clos_fat_tree)
g = service.get_networkx_graph()
print(networkx.write_network_text(g, vertical_chains=True))

total_npus = 64

╙── server.25.mgmt.0
    │
    server.25.cpu.0
    │
    server.25.pciesw.0
    ├── server.25.xpu.0
    │   │
    │   server.25.nvlsw.0
    │   │
    │   server.25.xpu.1 ─ server.25.pciesw.0
    ├── server.25.nic.0
    │   │
    │   tier_0.12.port.2
    │   │
    │   tier_0.12.asic.0
    │   ├── tier_0.12.port.0
    │   │   │
    │   │   server.24.nic.0
    │   │   │
    │   │   server.24.pciesw.0
    │   │   ├── server.24.cpu.0
    │   │   │   │
    │   │   │   server.24.mgmt.0
    │   │   ├── server.24.xpu.0
    │   │   │   │
    │   │   │   server.24.nvlsw.0
    │   │   │   │
    │   │   │   server.24.xpu.1 ─ server.24.pciesw.0
    │   │   ├── server.24.nic.1
    │   │   │   │
    │   │   │   tier_0.12.port.1 ─ tier_0.12.asic.0
    │   │   └──  ...
    │   ├── tier_0.12.port.3
    │   │   │
    │   │   server.25.nic.1 ─ server.25.pciesw.0
    │   ├── tier_0.12.port.4
    │   │   │
    │   │   tier_1.12.port.0
    │   │   │
    │   │   tier_1.12.asic.0
    │   │   ├── tier_1.12.port.

##### Generates workload execution traces for each rank and configures the data size, which is mandatory for AstraSim workload configuration.

In [5]:
astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])

Generated 64 et in /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial/configuration/workload


##### Configure the system configurations

In [6]:
astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO
astra.configuration.common_config.system.endpoint_delay = 10
astra.configuration.common_config.system.active_chunks_per_dimension = 1
astra.configuration.common_config.system.preferred_dataset_splits = 4
astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]
astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]
astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]
astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE
astra.configuration.common_config.system.local_mem_bw = 1600
astra.configuration.common_config.system.peak_perf = 900
astra.configuration.common_config.system.roofline_enabled = 0
print(astra.configuration.common_config.system)



active_chunks_per_dimension: 1
all_gather_implementation:
- ring
all_reduce_implementation:
- ring
all_to_all_implementation:
- direct
collective_optimization: localBWAware
endpoint_delay: 10
local_mem_bw: 1600
local_reduction_delay: 0
peak_perf: 900
preferred_dataset_splits: 4
reduce_scatter_implementation:
- ring
roofline_enabled: 0
scheduling_policy: LIFO
trace_enabled: 0



##### Configure the remote memory configuration

In [7]:
astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION
print(astra.configuration.common_config.remote_memory)

memory_type: NO_MEMORY_EXPANSION
remote_mem_bw: 0
remote_mem_latency: 0



##### Configure the network backend choice and the topology choice for that backend


In [8]:
astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM
astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH
# astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)


##### Configure the protocol choice

In [9]:
astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP
print("Network backend set to", astra.configuration.network_backend.choice)
print("network topology choice set to:",astra.configuration.network_backend.htsim.topology.choice)
print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol)
astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus)

Network backend set to htsim
network topology choice set to: infragraph
protocol set to choice: tcp



##### Adding ASTRA-sim specific annotation

In [10]:
host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()
host_device_spec.device_bandwidth_gbps = 1000
host_device_spec.device_latency_ms = 0.005
host_device_spec.device_name = "server"
host_device_spec.device_type = "host"
astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)

switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()
switch_device_spec.device_bandwidth_gbps = 1000
switch_device_spec.device_latency_ms = 0.005
switch_device_spec.device_name = "switch"
switch_device_spec.device_type = "switch"
astra.configuration.infragraph.annotations.device_specifications.append(
    switch_device_spec
)

<astra_sim_sdk.astra_sim_sdk.AnnotationDeviceSpecificationsIter at 0x7cb086666300>

##### Configure ASTRA-sim cmd parameters

In [11]:
astra.configuration.common_config.cmd_parameters.comm_scale = 1
astra.configuration.common_config.cmd_parameters.injection_scale = 1
astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False

#### Start the simulation by providing the network backend name in uppercase letters.

In [12]:
astra.run_simulation(NetworkBackend.HTSIM)

Generating Configuration ZIP
output_path: /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial/config.zip
folder_path: /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial/configuration
  group message from schema - communicator group configuration empty'

message: Simulation started successfully

astra-sim server Status: running
astra-sim server Status: running
astra-sim server Status: running
Downloading Output files....
Transferring Files from ASTRA-sim server
Downloading file: simulation.log
All files downloaded Successfully
Simulation completed


##### Download all the configurations as a zip

In [13]:
astra.download_configuration()

Downloaded all configuration in /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial/server_configuration.zip


##### Save infragraph as a yaml

In [14]:
import yaml
import os
from common import FileFolderUtils
with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","2tier.yaml"),"w") as f:
    data = clos_fat_tree.serialize("dict")
    yaml.dump(data, f, default_flow_style=False, indent=4)

print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","3tier.yaml"))

saved yaml to: /workspaces/astra_sim_service/client-scripts/utils/../trial/htsim_clos_3tier_trial/output/../3tier.yaml
