##### Imports the necessary modules and sets the system path to locate them.

In [1]:
import sys
sys.path.append("../../utils")
from astra_sim import AstraSim, Collective, NetworkBackend
from infragraph.blueprints.devices.dgx import Dgx
from infragraph.blueprints.fabrics.single_tier_fabric import SingleTierFabric
import networkx
from infragraph.infragraph_service import InfraGraphService
import astra_sim_sdk.astra_sim_sdk as astra_sim_kit


##### Connects the client to the AstraSim gRPC server, initializes the AstraSim SDK, and creates a folder (tagged as specified) containing all configuration details, generated results, and logs.

In [2]:
# Connect to the AstraSim gRPC server; `tag` names the folder that holds this
# run's configuration, generated results, and logs.
astra = AstraSim(
    server_endpoint="172.17.0.2:8989",
    tag="infragraph_dgx_trial",
)

Resetting test directory
Successfully connected to gRPC server at 172.17.0.2:8989


##### Create an InfraGraph with 2 DGX hosts and 1 rack switch


In [3]:
# Build a single-tier fabric of DGX hosts and load its serialized form into
# the InfraGraph infrastructure configuration.
dgx_count = 2
npus_per_dgx = 8  # a DGX host has 8 NPUs
fabric = SingleTierFabric(Dgx(), dgx_count)
astra.configuration.infragraph.infrastructure.deserialize(fabric.serialize())

# Total NPUs across the fabric; used below to size the workload and traces.
total_npus = dgx_count * npus_per_dgx

##### Display Fabric

In [4]:
# Wrap the fabric in an InfraGraphService and render its networkx graph as a
# text tree.
service = InfraGraphService()
service.set_graph(fabric)

g = service.get_networkx_graph()
# write_network_text writes the tree to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" after the tree).
networkx.write_network_text(g, vertical_chains=True)

╟── dgx.0.cpu.0
╎   │
╎   dgx.0.cpu.1
╟── dgx.1.cpu.0
╎   │
╎   dgx.1.cpu.1
╙── dgx.1.xpu.4
    ├── dgx.1.nvlsw.0
    │   ├── dgx.1.xpu.0
    │   │   │
    │   │   dgx.1.pciesw.0
    │   │   ├── dgx.1.xpu.1 ─ dgx.1.nvlsw.0
    │   │   ├── dgx.1.nic.0
    │   │   │   │
    │   │   │   switch.0.port.8
    │   │   │   │
    │   │   │   switch.0.asic.0
    │   │   │   ├── switch.0.port.0
    │   │   │   │   │
    │   │   │   │   dgx.0.nic.0
    │   │   │   │   │
    │   │   │   │   dgx.0.pciesw.0
    │   │   │   │   ├── dgx.0.xpu.0
    │   │   │   │   │   │
    │   │   │   │   │   dgx.0.nvlsw.0
    │   │   │   │   │   ├── dgx.0.xpu.1 ─ dgx.0.pciesw.0
    │   │   │   │   │   ├── dgx.0.xpu.2
    │   │   │   │   │   │   │
    │   │   │   │   │   │   dgx.0.pciesw.1
    │   │   │   │   │   │   ├── dgx.0.xpu.3 ─ dgx.0.nvlsw.0
    │   │   │   │   │   │   ├── dgx.0.nic.2
    │   │   │   │   │   │   │   │
    │   │   │   │   │   │   │   switch.0.port.2 ─ switch.0.asic.0
    │   │   │   │   │   │   

##### Generates workload execution traces for each rank and configures the data size, which is mandatory for AstraSim workload configuration.

In [5]:
# Generate the ALLREDUCE execution traces for every rank in the NPU range and
# register the result as the workload configuration.
astra.configuration.common_config.workload = astra.generate_collective(
    collective=Collective.ALLREDUCE,
    coll_size=8 * 1024 * 1024,
    npu_range=[0, total_npus],
)

Generated 16 et in /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/configuration/workload


##### Configure the system configurations

In [6]:
# Short alias for the system section of the common configuration; the enum-like
# constants (LIFO, RING, ...) are read from the same object.
system_cfg = astra.configuration.common_config.system

system_cfg.scheduling_policy = system_cfg.LIFO
system_cfg.endpoint_delay = 10
system_cfg.active_chunks_per_dimension = 1

# One collective implementation entry per collective type.
system_cfg.all_gather_implementation = [system_cfg.RING]
system_cfg.all_to_all_implementation = [system_cfg.DIRECT]
system_cfg.all_reduce_implementation = [system_cfg.ONERING]

system_cfg.collective_optimization = system_cfg.LOCALBWAWARE
system_cfg.local_mem_bw = 1600

##### Configure the remote memory configuration

In [7]:
# Select the "no memory expansion" remote-memory model and print the resulting
# section to confirm what will be sent to the server.
remote_memory_cfg = astra.configuration.common_config.remote_memory
remote_memory_cfg.memory_type = remote_memory_cfg.NO_MEMORY_EXPANSION
print(remote_memory_cfg)

memory_type: NO_MEMORY_EXPANSION
remote_mem_bw: 0
remote_mem_latency: 0



##### Configure the network backend choice and the topology choice for that backend


In [8]:
# Pick the NS3 backend first, then configure its topology source and packet size.
network_backend_cfg = astra.configuration.network_backend
network_backend_cfg.choice = network_backend_cfg.NS3

ns3_backend = network_backend_cfg.ns3
ns3_backend.topology.choice = ns3_backend.topology.INFRAGRAPH
ns3_backend.network.packet_payload_size = 8192


##### Adding ns3 trace and logical dimension 

In [9]:
# A single logical dimension spanning every NPU, and tracing enabled for each
# NPU id in [0, total_npus).
ns3_cfg = astra.configuration.network_backend.ns3
ns3_cfg.logical_topology.logical_dimensions = [total_npus]
# Assign the full id list in one step instead of appending one id at a time
# through the attribute chain.
ns3_cfg.trace.trace_ids = list(range(total_npus))

##### Adding ASTRA-sim specific annotation

In [10]:
def _build_device_spec(device_name, device_type, bandwidth_gbps=100, latency_ms=0.05):
    """Create an ASTRA-sim device annotation.

    Args:
        device_name: value stored in the annotation's ``device_name`` field.
        device_type: value stored in the annotation's ``device_type`` field.
        bandwidth_gbps: value stored in ``device_bandwidth_gbps``.
        latency_ms: value stored in ``device_latency_ms``.

    Returns:
        A populated ``AnnotationDeviceSpecifications`` object.
    """
    spec = astra_sim_kit.AnnotationDeviceSpecifications()
    spec.device_bandwidth_gbps = bandwidth_gbps
    spec.device_latency_ms = latency_ms
    spec.device_name = device_name
    spec.device_type = device_type
    return spec


device_specifications = astra.configuration.infragraph.annotations.device_specifications

# Annotation for the DGX hosts.
host_device_spec = _build_device_spec("dgx", "host")
device_specifications.append(host_device_spec)

# Annotation for the rack switch.
switch_device_spec = _build_device_spec("switch", "switch")
device_specifications.append(switch_device_spec)

<astra_sim_sdk.astra_sim_sdk.AnnotationDeviceSpecificationsIter at 0x775cde222300>

##### Configure ASTRA-sim cmd parameters

In [11]:
# Command-line style parameters for the ASTRA-sim run.
cmd_params = astra.configuration.common_config.cmd_parameters
cmd_params.comm_scale = 1
cmd_params.injection_scale = 1
cmd_params.rendezvous_protocol = False

##### Start the simulation by passing the network backend as a `NetworkBackend` constant (here `NetworkBackend.NS3`).

In [12]:
# Run the simulation on the NS3 backend using the configuration built above.
# Per the captured output, this packages the configuration, polls the server
# status, then downloads and translates the result files.
astra.run_simulation(NetworkBackend.NS3)

Generating Configuration ZIP
output_path: /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/config.zip
folder_path: /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/configuration
  group message from schema - communicator group configuration empty'

message: Simulation started successfully

astra-sim server Status: running
astra-sim server Status: running
Downloading Output files....
Transferring Files from ASTRA-sim server
Downloading file: fct.txt
Downloading file: flow.txt
Downloading file: pfc.txt
Downloading file: qlen.txt
Downloading file: simulation.log
Downloading file: trace_out.tr
All files downloaded Successfully
Translating Metrics...
Generated fct.csv at:  /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/output/fct.csv
Generated: flow_stats.csv at:  /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/output/flow_stats.csv
All metrics translated succes

  df = pd.read_csv(
  df = pd.read_csv(


##### Download all the configurations as a zip

In [13]:
# Fetch the configuration from the server as a zip archive
# (the captured output reports it as server_configuration.zip in the run folder).
astra.download_configuration()

Downloaded all configuration in /workspaces/astra_sim_service/client-scripts/utils/../trial/infragraph_dgx_trial/server_configuration.zip


##### Read output files

In [14]:
import os

import pandas as pd

from common import FileFolderUtils

# Load the translated flow-completion-time metrics from the run's output
# folder and preview the first rows.
output_dir = FileFolderUtils.get_instance().OUTPUT_DIR
fct_csv_path = os.path.join(output_dir, "fct.csv")
df = pd.read_csv(fct_csv_path)
df.head()

Unnamed: 0,Source Hex ip,Destination Hex ip,Source Port,Destination Port,Data size (B),Start Time,FCT,Standalone FCT
0,0b000101,0b000201,10000,100,524288,10,63682,63498
1,0b000201,0b000301,10000,100,524288,10,63682,63498
2,0b000301,0b000401,10000,100,524288,10,63682,63498
3,0b000401,0b000501,10000,100,524288,10,63682,63498
4,0b000501,0b000601,10000,100,524288,10,63682,63498


In [15]:
# Load the translated per-flow statistics from the run's output folder and
# preview the first rows, mirroring the fct.csv cell.
df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv"))
df.head()
