**System Information**

In [1]:
!nvidia-smi

Tue Jun  3 06:45:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvidia-smi -q



Timestamp                                 : Tue Jun  3 06:53:33 2025
Driver Version                            : 550.54.15
CUDA Version                              : 12.4

Attached GPUs                             : 1
GPU 00000000:00:04.0
    Product Name                          : Tesla T4
    Product Brand                         : NVIDIA
    Product Architecture                  : Turing
    Display Mode                          : Enabled
    Display Active                        : Disabled
    Persistence Mode                      : Disabled
    Addressing Mode                       : None
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : N/A
        Pending                           : N/A
    Serial Number                         : 1564620003212
    GPU UUID  

In [4]:
# gpu topology
!nvidia-smi topo -m

	GPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID
GPU0	 X 	0-1	0		N/A

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks


In [7]:
%cd /usr/local

/usr/local


In [8]:
!ls

bin    cuda	cuda-12.5	  etc	 include  libexec     man  sbin   src
colab  cuda-12	dist_metrics.pxd  games  lib	  LICENSE.md  opt  share


In [51]:
%cd cuda-12.5

/usr/local/cuda-12.5


**Architecture Type**

In [69]:
%%writefile arch_query.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Enumerates the CUDA devices visible to the runtime and prints a summary of
// selected cudaDeviceProp fields for each one (compute capability, memory
// sizes, launch limits, clocks, feature flags).
//
// Returns 0 when every device was queried successfully, 1 if the device count
// could not be obtained or any per-device property query failed.
int main() {
    // Zero-initialise so the loop below is well-defined even if the query fails.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    if (deviceCount == 0) {
        fprintf(stderr, "No CUDA devices reported by the runtime\n");
    }

    int exitCode = 0;
    for (int device = 0; device < deviceCount; device++) {
        cudaDeviceProp prop;
        status = cudaGetDeviceProperties(&prop, device);
        if (status != cudaSuccess) {
            // Do not print from an unfilled struct; report and move on.
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    device, cudaGetErrorString(status));
            exitCode = 1;
            continue;
        }

        printf("=== Device %d: %s ===\n", device, prop.name);
        printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
        // totalGlobalMem is reported in bytes; convert to GiB for display.
        printf("Total Global Memory: %.2f GB\n", prop.totalGlobalMem / (1024.0*1024.0*1024.0));
        printf("Shared Memory per Block: %zu bytes\n", prop.sharedMemPerBlock);
        printf("Registers per Block: %d\n", prop.regsPerBlock);
        printf("Warp Size: %d\n", prop.warpSize);
        printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("Max Threads Dim: (%d, %d, %d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Max Grid Size: (%d, %d, %d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("Multiprocessor Count: %d\n", prop.multiProcessorCount);
        printf("Max Threads per Multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
        // memoryClockRate is divided by 1000 to print it in MHz.
        printf("Memory Clock Rate: %.2f MHz\n", prop.memoryClockRate / 1000.0);
        printf("Memory Bus Width: %d bits\n", prop.memoryBusWidth);
        printf("L2 Cache Size: %d bytes\n", prop.l2CacheSize);
        printf("Texture Alignment: %zu\n", prop.textureAlignment);
        printf("Concurrent Kernels: %s\n", prop.concurrentKernels ? "Yes" : "No");
        printf("ECC Enabled: %s\n", prop.ECCEnabled ? "Yes" : "No");
        printf("Unified Addressing: %s\n", prop.unifiedAddressing ? "Yes" : "No");
        printf("\n");
    }
    return exitCode;
}

Writing arch_query.cu


In [70]:
!nvcc -arch=sm_75 arch_query.cu -o arch_query

In [71]:
!./arch_query

=== Device 0: Tesla T4 ===
Compute Capability: 7.5
Total Global Memory: 14.74 GB
Shared Memory per Block: 49152 bytes
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dim: (1024, 1024, 64)
Max Grid Size: (2147483647, 65535, 65535)
Multiprocessor Count: 40
Max Threads per Multiprocessor: 1024
Memory Clock Rate: 5001.00 MHz
Memory Bus Width: 256 bits
L2 Cache Size: 4194304 bytes
Texture Alignment: 512
Concurrent Kernels: Yes
ECC Enabled: Yes
Unified Addressing: Yes



**Memory Architecture Analysis**

In [77]:
%%writefile memory_bandwidth.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

// Element-wise copy of n floats from src to dst, one element per thread.
//
// Launch layout: 1D grid of 1D blocks; only the x dimensions are used. Each
// thread handles at most one element, so the launch must supply at least n
// threads in total for the whole range to be copied. Threads whose index is
// >= n do nothing.
//
// Preconditions: dst and src must not overlap (both are declared __restrict__).
// src is only read.
__global__ void memory_copy_kernel(float* __restrict__ dst,
                                   const float* __restrict__ src, int n) {
    // Build the flat index in 64 bits: narrowing blockIdx.x * blockDim.x to a
    // 32-bit int can yield a negative value for very large grids, which would
    // pass an `idx < n` guard and index out of bounds.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n) {
        dst[idx] = src[idx];
    }
}

// Evaluates a CUDA runtime call; on failure prints the location and error
// string to stderr and makes the enclosing function return 1. Intended for use
// inside main() only (it expands to a `return 1;`). Buffers are not released
// on this path; the process is about to exit.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                    \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Measures average host-to-device, device-to-host and on-device copy
// throughput for a 256 MiB float buffer and prints the three figures.
//
// Host buffers come from malloc(), so the H2D/D2H numbers are for ordinary
// (non-pinned) host memory. Each transfer is timed with a host clock around
// the call plus a cudaDeviceSynchronize(). The on-device figure counts both
// the read and the write of every element (2 * bytes).
//
// Returns 0 on success, 1 on any allocation failure, CUDA error, or if the
// data copied by the kernel does not match the source.
int main() {
    const int N = 1024 * 1024 * 64; // 64M elements = 256MB of float data
    // Keep byte counts in size_t: an int is one doubling of N away from overflow.
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);
    const double gib = 1024.0 * 1024.0 * 1024.0;

    float *h_src, *h_dst;
    float *d_src, *d_dst;

    printf("Memory Bandwidth Test (Small Version)\n");
    printf("Data Size: %.2f MB\n", bytes / (1024.0*1024.0));

    // Allocate host memory
    h_src = (float*)malloc(bytes);
    h_dst = (float*)malloc(bytes);
    if (h_src == NULL || h_dst == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_src);
        free(h_dst);
        return 1;
    }

    // Initialize data
    for (int i = 0; i < N; i++) {
        h_src[i] = (float)i;
    }

    // Allocate device memory
    CUDA_CHECK(cudaMalloc(&d_src, bytes));
    CUDA_CHECK(cudaMalloc(&d_dst, bytes));

    // Launch configuration is loop-invariant: one thread per element, ceil-div.
    const int blockSize = 256;
    const int gridSize = (N + blockSize - 1) / blockSize;

    // Untimed warm-up pass so that any one-time first-use cost of the copy
    // path or the kernel is not charged to the first timed iteration.
    CUDA_CHECK(cudaMemcpy(d_src, h_src, bytes, cudaMemcpyHostToDevice));
    memory_copy_kernel<<<gridSize, blockSize>>>(d_dst, d_src, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Multiple runs for better accuracy
    const int num_runs = 5;
    double h2d_total = 0, d2h_total = 0, d2d_total = 0;

    for (int run = 0; run < num_runs; run++) {
        // H2D test
        auto start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(d_src, h_src, bytes, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();
        h2d_total += std::chrono::duration<double>(end - start).count();

        // D2H test
        start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(h_dst, d_src, bytes, cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaDeviceSynchronize());
        end = std::chrono::high_resolution_clock::now();
        d2h_total += std::chrono::duration<double>(end - start).count();

        // D2D test
        start = std::chrono::high_resolution_clock::now();
        memory_copy_kernel<<<gridSize, blockSize>>>(d_dst, d_src, N);
        CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
        CUDA_CHECK(cudaDeviceSynchronize());  // execution errors + wait for completion
        end = std::chrono::high_resolution_clock::now();
        d2d_total += std::chrono::duration<double>(end - start).count();
    }

    // Calculate averages
    double h2d_avg = h2d_total / num_runs;
    double d2h_avg = d2h_total / num_runs;
    double d2d_avg = d2d_total / num_runs;

    double h2d_bandwidth = (bytes / gib) / h2d_avg;
    double d2h_bandwidth = (bytes / gib) / d2h_avg;
    // The kernel reads and writes every element, hence twice the buffer size.
    double d2d_bandwidth = (2.0 * bytes / gib) / d2d_avg;

    printf("\n=== RESULTS (Average of %d runs) ===\n", num_runs);
    printf("Host to Device:   %.2f GB/s\n", h2d_bandwidth);
    printf("Device to Host:   %.2f GB/s\n", d2h_bandwidth);
    printf("Device to Device: %.2f GB/s\n", d2d_bandwidth);

    // Verify that the kernel really copied the data; a throughput figure from
    // a kernel that produced wrong output would be meaningless.
    int exitCode = 0;
    CUDA_CHECK(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++) {
        if (h_dst[i] != h_src[i]) {
            fprintf(stderr, "Verification failed at element %d: got %f, expected %f\n",
                    i, h_dst[i], h_src[i]);
            exitCode = 1;
            break;
        }
    }

    // Cleanup
    free(h_src);
    free(h_dst);
    CUDA_CHECK(cudaFree(d_src));
    CUDA_CHECK(cudaFree(d_dst));

    return exitCode;
}

Overwriting memory_bandwidth.cu


In [78]:
!nvcc -arch=sm_75 memory_bandwidth.cu -o memory_bandwidth

In [79]:
!./memory_bandwidth

Memory Bandwidth Test (Small Version)
Data Size: 256.00 MB

=== RESULTS (Average of 5 runs) ===
Host to Device:   4.06 GB/s
Device to Host:   2.95 GB/s
Device to Device: 218.15 GB/s
