In [1]:
import sqlite3
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [38]:
# Root directory holding the profiled traces, one sub-folder per benchmark.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
TRACE_PATH = "/mnt/storage/data/research/gpu_traces/"

In [44]:
# Root folder for the generated buffer files (relative path, resolved against
# the notebook's working directory).
OUTPUT_MAIN_FOLDER = "../gpu_traces/"

## Get DB Containing Trace Details

In [39]:
# Benchmark under analysis; its traces live in a same-named sub-folder of
# TRACE_PATH.
BENCHMARK = "2DCONV"
BENCHMARK_PATH = f"{TRACE_PATH}{BENCHMARK}/"

In [45]:
# Per-benchmark output directory. Built from OUTPUT_MAIN_FOLDER, the output
# root defined in the configuration cell above (the previously used name
# `OUTPUT_FOLDER` is not defined and raised NameError on a fresh kernel).
OUTPUT_DIR = OUTPUT_MAIN_FOLDER + BENCHMARK + "/"

In [40]:
# Identifier of the trace to analyse; used as the base name of both the
# .sqlite input file and the generated buffer files.
chosen_size = "5000_2500"

In [5]:
# Full path of the SQLite trace database for the chosen benchmark and size.
db_filename = f"{BENCHMARK_PATH}{chosen_size}.sqlite"

# Profiled Using: 
- ```nsys profile --stats=true --cuda-memory-usage=true --cuda-um-gpu-page-faults=true --cuda-um-cpu-page-faults=true --output=<output_filename> ./<executable_name>```

## Memory Transfer and Page Fault Tables
- CUPTI_ACTIVITY_KIND_MEMCPY
- CUDA_UM_GPU_PAGE_FAULT_EVENTS
- CUDA_GPU_MEMORY_USAGE_EVENTS
- CUDA_UM_CPU_PAGE_FAULT_EVENTS

### Other Possibly Relevant Tables
- ANALYSIS_DETAILS
- COMPOSITE_EVENTS
- CUPTI_ACTIVITY_KIND_KERNEL
- CUPTI_ACTIVITY_KIND_RUNTIME
- CUPTI_ACTIVITY_KIND_SYNCHRONIZATION
- OSRT_API
- OSRT_CALLCHAINS
- PROFILER_OVERHEAD
- SAMPLING_CALLCHAINS
- SCHED_EVENTS
- TARGET_INFO_CUDA_STREAM
- TARGET_INFO_GPU
- TARGET_INFO_SYSTEM_ENV

In [16]:
# Open the trace database; `con` is shared by every query cell below.
# NOTE: sqlite3.connect() silently creates an empty database when the file
# does not exist, so a wrong path only shows up later as a "no such table"
# error from the queries.
con = sqlite3.connect(db_filename)

In [17]:
# Start/stop timestamps and total duration of the profiling session.
analysis_overview = pd.read_sql_query(
    sql="SELECT globalVid, startTime, stopTime, duration FROM ANALYSIS_DETAILS",
    con=con,
)

In [18]:
# Memory-copy records, ordered by start timestamp.
mem_transfer_df = pd.read_sql_query(
    sql="SELECT start, end, virtualAddress, migrationCause, bytes, copyKind, srcKind, dstKind FROM CUPTI_ACTIVITY_KIND_MEMCPY ORDER BY start",
    con=con,
)

In [19]:
# GPU page-fault events, ordered by start timestamp.
gpu_page_fault_df = pd.read_sql_query(
    sql="SELECT start, end, address, numberOfPageFaults, faultAccessType FROM CUDA_UM_GPU_PAGE_FAULT_EVENTS ORDER BY start",
    con=con,
)

In [10]:
# gpu_memory_usage_df = pd.read_sql_query("SELECT start, address, bytes, memKind, memoryOperationType, correlationId FROM CUDA_GPU_MEMORY_USAGE_EVENTS", con)

In [11]:
# cpu_page_fault_df = pd.read_sql_query("SELECT start, address, originalFaultPc FROM CUDA_UM_CPU_PAGE_FAULT_EVENTS ORDER BY start", con)

In [12]:
# kernel_overview = pd.read_sql_query("SELECT start, end, gridX, gridY, gridZ, blockX, blockY, blockZ, registersPerThread, localMemoryTotal, sharedMemoryExecuted FROM CUPTI_ACTIVITY_KIND_KERNEL", con)

# Analyze the Trace

In [20]:
# Session duration converted to seconds (the stored value is scaled down by
# 1e9, i.e. it is treated as nanoseconds).
runtime = analysis_overview["duration"].div(1e9)

In [21]:
# Legend for the migrationCause column:
#   0: Unknown
#   1: User (e.g. cudaMemPrefetchAsync)
#   2: Coherence (to guarantee data coherence for CPU/GPU)
#   3: Prefetch (driver enabled for performance)
#   4: Eviction from GPU (full)
# Preview the first rows of the memory-transfer table.
mem_transfer_df.head()

Unnamed: 0,start,end,virtualAddress,migrationCause,bytes,copyKind,srcKind,dstKind
0,446167008,446170783,140621222313984,2,49152,11,7,7
1,446170784,446172672,140621222367232,2,12288,11,7,7
2,446284704,446287199,140621222248448,2,20480,11,7,7
3,446287200,446289919,140621222273024,2,32768,11,7,7
4,446289920,446291487,140621222309888,2,4096,11,7,7


In [22]:
# Legend for the faultAccessType column:
#   0: Unknown
#   1: Read
#   2: Write
#   3: Atomic
#   4: Prefetch
# Preview the first rows of the GPU page-fault table.
gpu_page_fault_df.head()

Unnamed: 0,start,end,address,numberOfPageFaults,faultAccessType
0,445847230,446215200,140621222313984,53,1
1,446217952,446314048,140621222248448,5,1
2,446315648,446441729,140621222268928,42,1
3,446443361,446571842,140621222420480,10,1
4,446574498,446711395,140621155151872,1,2


In [69]:
# Signed distance between each faulting address and the previous one.
# The first row has no predecessor, so its delta is set to 0. Subtracting the
# shifted column (which pandas promotes to float64 to hold the leading NaN)
# keeps the result signed even when "address" is stored as an unsigned dtype.
gpu_page_fault_df["delta"] = (
    gpu_page_fault_df["address"]
    .sub(gpu_page_fault_df["address"].shift(1))
    .fillna(0)
    .astype(np.int64)
)

In [53]:
# Store the addresses as unsigned 64-bit integers; the address buffer written
# further down is dumped from this column as uint64.
gpu_page_fault_df["address"] = gpu_page_fault_df["address"].astype(np.uint64)

In [49]:
# Destination files for the raw binary dumps of the delta and address columns.
OUTPUT_FILENAME_DELTA = f"{OUTPUT_DIR}{chosen_size}.delta_buffer"
OUTPUT_FILENAME_ADDRESS = f"{OUTPUT_DIR}{chosen_size}.address_buffer"

In [50]:
# Display the resolved delta output path as a sanity check.
OUTPUT_FILENAME_DELTA

'../gpu_traces/2DCONV/5000_2500.delta_buffer'

In [66]:
# Dump the delta column as a flat stream of native-endian int64 values
# (8 bytes per fault event, in row order).
# - The dtype is requested explicitly so the on-disk format does not depend on
#   which cells were executed earlier.
# - A single vectorised write replaces the per-row loop, which relied on
#   label-based `series[i]` lookups and therefore on a default 0..n-1 index.
# - The `with` block closes the file even if the write raises.
with open(OUTPUT_FILENAME_DELTA, 'wb') as delta_file:
    delta_file.write(gpu_page_fault_df["delta"].to_numpy(dtype=np.int64).tobytes())

In [67]:
# Dump the address column as a flat stream of native-endian uint64 values
# (8 bytes per fault event, in row order).
# - The dtype is requested explicitly so the file is uint64 regardless of
#   whether the column was already cast in an earlier cell.
# - A single vectorised write replaces the per-row loop, which relied on
#   label-based `series[i]` lookups and therefore on a default 0..n-1 index.
# - The `with` block closes the file even if the write raises.
with open(OUTPUT_FILENAME_ADDRESS, 'wb') as address_file:
    address_file.write(gpu_page_fault_df["address"].to_numpy(dtype=np.uint64).tobytes())

In [68]:
# Display the computed deltas (pandas elides the middle rows of a long Series).
gpu_page_fault_df["delta"]

0             0
1         65536
2        -20480
3       -151552
4      67268608
         ...   
317      -98304
318      -65536
319     -139264
320      -98304
321      -81920
Name: delta, Length: 322, dtype: int64