In [22]:
import sys
import os
import sqlite3
import pandas as pd
import numpy as np

# Will do a grid search across problem size parameters for each of the 6 chosen benchmarks...
### For each run, will save the sqlite table to my hard disk
#### Will also create a copy of the deltas between GPU page faults to use as input to ML model

In [23]:
# Byte limit for GPU memory, going by the name. Not referenced by any visible cell;
# presumably the ceiling used to pick the largest problem sizes below -- TODO confirm.
MAX_BYTES_GPU = 25_443_893_248

# Profiled Using:
- ```nsys profile --stats=true --cuda-memory-usage=true --cuda-um-gpu-page-faults=true --cuda-um-cpu-page-faults=true --force-overwrite=true --output=<output_filename> ./<executable_name> [<executable_args>]*```

In [24]:
# Common prefix of every profiling command line. It deliberately ends with
# "--output=" so the report path can be appended directly, with no separator.
PROFILE_COMMAND = (
    "nsys profile"
    " --stats=true"
    " --cuda-memory-usage=true"
    " --cuda-um-gpu-page-faults=true"
    " --cuda-um-cpu-page-faults=true"
    " --force-overwrite=true"
    " --output="
)

In [25]:
# Directory prefix for profiler reports. It is concatenated as-is (hence the trailing
# slash) with the benchmark sub-directory and run name, both when building the nsys
# "--output=" path and when locating the resulting ".sqlite" file.
OUTPUT_DIR = "/mnt/storage/data/research/gpu_traces/"

In [26]:
# Directory prefix for the binary ".address_buffer" files. It is concatenated as-is
# (hence the trailing slash) with the benchmark sub-directory and run name.
ADDRESS_SEQ_DIR = "/home/shein/Documents/research/prefetching/data/gpu_traces/"

In [52]:
def generate_address_sequence(benchmark, filename, output_dir=None, address_seq_dir=None):
    """Export the GPU page-fault addresses of one profiled run to a raw binary file.

    Reads the ``address`` column of ``CUDA_UM_GPU_PAGE_FAULT_EVENTS`` (ordered by
    ``start``) from ``<output_dir><benchmark><filename>.sqlite`` and writes the
    values back-to-back as unsigned 64-bit integers in native byte order to
    ``<address_seq_dir><benchmark><filename>.address_buffer``.

    Parameters
    ----------
    benchmark : str
        Benchmark sub-directory, concatenated verbatim (e.g. ``"MVT/"``).
    filename : str
        Run name without extension.
    output_dir : str, optional
        Directory prefix holding the ``.sqlite`` file. Defaults to ``OUTPUT_DIR``.
    address_seq_dir : str, optional
        Directory prefix for the ``.address_buffer`` file. Defaults to
        ``ADDRESS_SEQ_DIR``.

    Raises
    ------
    FileNotFoundError
        If the ``.sqlite`` file does not exist (for example because the
        profiling run failed).
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    if address_seq_dir is None:
        address_seq_dir = ADDRESS_SEQ_DIR

    db_filename = output_dir + benchmark + filename + ".sqlite"
    # sqlite3.connect() would silently create an empty database for a missing
    # path, hiding a failed profiling run behind a "no such table" error.
    if not os.path.isfile(db_filename):
        raise FileNotFoundError("profiler database not found: " + db_filename)

    con = sqlite3.connect(db_filename)
    try:
        gpu_page_fault_df = pd.read_sql_query(
            "SELECT address FROM CUDA_UM_GPU_PAGE_FAULT_EVENTS ORDER BY start", con
        )
    finally:
        # Using the connection as a context manager would only manage the
        # transaction, so close it explicitly.
        con.close()

    addresses = gpu_page_fault_df["address"].astype(np.uint64).to_numpy()

    output_address_seq_filename = address_seq_dir + benchmark + filename + ".address_buffer"
    with open(output_address_seq_filename, "wb") as f:
        # One buffer write yields the same bytes as writing each 8-byte value in turn.
        f.write(addresses.tobytes())

In [53]:
# Switch the working directory to the benchmark executables folder; the profiling
# cells below launch executables by relative name (e.g. "./2DConvolution.exe").
os.chdir("/home/shein/Documents/research/prefetching/data/chosen_benchmarks/executables")

# *Streaming*

## 2DConv (RAN)

#### GPU Memory Usage: $8N_IN_J$ 

In [54]:
# Params
BENCHMARK = "2DCONV/"  # sub-directory appended to the output directory prefixes
EXECUTABLE_NAME = "./2DConvolution.exe"
# The sweep uses the same candidate sizes along both dimensions.
NI = [100, 500, 1_000, 2_500, 5_000, 10_000, 25_000, 50_000, 56_000]
NJ = list(NI)

In [None]:
# Profile every (NI, NJ) combination, then export the page-fault addresses of each run.
for i in NI:
    for j in NJ:
        # Build the CLI arguments here so the cell does not depend on an `args`
        # value left behind by another cell (NameError on a fresh kernel).
        # NOTE(review): assumes the executable takes NI and NJ as two positional
        # arguments, in that order -- confirm against the benchmark's main().
        args = str(i) + " " + str(j)
        filename = str(i) + "_" + str(j)
        command = PROFILE_COMMAND + OUTPUT_DIR + BENCHMARK + filename + " " + EXECUTABLE_NAME + " " + args
        os.system(command)
        generate_address_sequence(BENCHMARK, filename)

## 3DConv

#### GPU Memory Usage: $8N_IN_JN_K$ 

In [11]:
# Params
# Placeholder values: no sweep is set up for this benchmark in the visible cells.
# Note that this rebinds NI and NJ, which the 2DConv cell defined as lists.
NI = NJ = NK = 0

# *Non-Streaming*

## ATAX

#### GPU Memory Usage: $4N_XN_Y + 8N_Y + 4N_X$ 

In [10]:
# Params
# Placeholder values: no sweep is set up for this benchmark in the visible cells.
NX = NY = 0

## MVT (RAN)

#### GPU Memory Usage: $4N(N+4)$ 

In [48]:
# Params
BENCHMARK = "MVT/"  # sub-directory appended to the output directory prefixes
# NOTE(review): unlike the 2DConv cell there is no "./" prefix here; the recorded
# run output below shows the executable was still launched.
EXECUTABLE_NAME = "mvt.exe"
N = [65_000]  # problem sizes to sweep (a single size for now)

In [49]:
# Profile MVT once per problem size, then export that run's page-fault addresses.
for n in N:
    # The size is both the executable's only argument and the output file stem.
    args = filename = str(n)
    command = " ".join((PROFILE_COMMAND + OUTPUT_DIR + BENCHMARK + filename, EXECUTABLE_NAME, args))
    os.system(command)
    generate_address_sequence(BENCHMARK, filename)



setting device 0 with name NVIDIA GeForce RTX 3090
GPU Runtime: 42.838028s
CPU Runtime: 18.399707s
Non-Matching CPU-GPU Outputs Beyond Error Threshold of 0.05 Percent: 0
Generating '/tmp/nsys-report-bbbb.qdstrm'


SKIPPED: /mnt/storage/data/research/gpu_traces/MVT/65000.sqlite does not contain NV Tools Extension (NVTX) data.


[ 3/11] Executing 'nvtxsum' stats report
[ 4/11] Executing 'osrtsum' stats report

Operating System Runtime API Statistics:

 Time (%)  Total Time (ns)  Num Calls      Avg (ns)          Med (ns)         Min (ns)        Max (ns)       StdDev (ns)             Name         
 --------  ---------------  ---------  ----------------  ----------------  --------------  --------------  ---------------  ----------------------
     45.2  191,256,697,457     10,807      17,697,482.9      10,056,165.0           1,002     101,028,282     27,485,454.3  poll                  
     25.5  107,718,903,780         88   1,224,078,452.0     953,764,538.0      70,610,100  20,788,931,325  2,121,323,996.8  sem_wait              
     17.3   73,012,140,915        146     500,083,157.0     500,073,379.0     500,058,001     500,519,404         60,716.7  pthread_cond_timedwait
      6.8   28,984,046,463          1  28,984,046,463.0  28,984,046,463.0  28,984,046,463  28,984,046,463              0.0  pthread_cond_wai

## CORR

#### GPU Memory Usage: $8(M+1)(N+2)$ 

In [9]:
# Params
# Placeholder values: no sweep is set up for this benchmark in the visible cells.
# Note that this rebinds N, which the MVT cell defined as a list.
M = N = 0

## COV

#### GPU Memory Usage: $8(M+1)(N+1) + 4(M+1)$ 

In [None]:
# Params
# Placeholder values: no sweep is set up for this benchmark in the visible cells.
M = N = 0