In [2]:
import os
import sqlite3

import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [3]:
# Relative path to the directory holding the GPU traces; the trailing slash is
# required because later paths are built by plain string concatenation.
TRACE_PATH = "../gpu_traces/"

## Get DB Containing Trace Details

In [4]:
# Benchmark whose traces are analysed; its folder sits directly under TRACE_PATH.
BENCHMARK = "gemm"
# Trailing slash kept so file names can be appended by plain concatenation.
BENCHMARK_PATH = f"{TRACE_PATH}{BENCHMARK}/"

In [5]:
# NOTE(review): presumably the matrix sizes for which a trace database exists
# under BENCHMARK_PATH — confirm before relying on this list.
matrix_sizes = [512, 4096, 32768, 50000, 65536]

In [6]:
# Size whose trace is loaded: it becomes the stem of the ".sqlite" file name.
# NOTE(review): presumably expected to be one of matrix_sizes — confirm.
chosen_size = 32768

In [7]:
# SQLite trace for the selected size, i.e. "<BENCHMARK_PATH><chosen_size>.sqlite".
db_filename = f"{BENCHMARK_PATH}{chosen_size}.sqlite"

# Profiled Using:
- ```nsys profile --stats=true --cuda-memory-usage=true --cuda-um-gpu-page-faults=true --cuda-um-cpu-page-faults=true --output=<output_filename> ./<executable_name>```
- `--stats=true` also exports the `.sqlite` database that this notebook reads.

## Memory Transfer and Page Fault Tables
- CUPTI_ACTIVITY_KIND_MEMCPY
- CUDA_UM_GPU_PAGE_FAULT_EVENTS
- CUDA_GPU_MEMORY_USAGE_EVENTS
- CUDA_UM_CPU_PAGE_FAULT_EVENTS

### Other Possibly Relevant Tables
- ANALYSIS_DETAILS
- COMPOSITE_EVENTS
- CUPTI_ACTIVITY_KIND_KERNEL
- CUPTI_ACTIVITY_KIND_RUNTIME
- CUPTI_ACTIVITY_KIND_SYNCHRONIZATION
- OSRT_API
- OSRT_CALLCHAINS
- PROFILER_OVERHEAD
- SAMPLING_CALLCHAINS
- SCHED_EVENTS
- TARGET_INFO_CUDA_STREAM
- TARGET_INFO_GPU
- TARGET_INFO_SYSTEM_ENV

In [8]:
# Open the trace database for the queries below.
# sqlite3.connect() silently creates a new, empty database when the path does
# not exist, which would only surface later as a confusing "no such table"
# error (and leave a stray file behind). Fail early with a clear message.
if not os.path.isfile(db_filename):
    raise FileNotFoundError(f"Trace database not found: {db_filename}")
con = sqlite3.connect(db_filename)

In [9]:
# Capture-level metadata (start/stop timestamps and total duration) for the run.
analysis_overview = pd.read_sql_query(
    sql="SELECT globalVid, startTime, stopTime, duration FROM ANALYSIS_DETAILS",
    con=con,
)

In [10]:
# Memory-copy records, sorted by their start timestamp.
mem_transfer_df = pd.read_sql_query(
    sql=(
        "SELECT start, end, virtualAddress, migrationCause, bytes, "
        "copyKind, srcKind, dstKind "
        "FROM CUPTI_ACTIVITY_KIND_MEMCPY "
        "ORDER BY start"
    ),
    con=con,
)

In [11]:
# GPU page-fault events, sorted by their start timestamp.
gpu_page_fault_df = pd.read_sql_query(
    sql=(
        "SELECT start, end, address, numberOfPageFaults, faultAccessType "
        "FROM CUDA_UM_GPU_PAGE_FAULT_EVENTS "
        "ORDER BY start"
    ),
    con=con,
)

In [12]:
# gpu_memory_usage_df = pd.read_sql_query("SELECT start, address, bytes, memKind, memoryOperationType, correlationId FROM CUDA_GPU_MEMORY_USAGE_EVENTS", con)

In [13]:
# cpu_page_fault_df = pd.read_sql_query("SELECT start, address, originalFaultPc FROM CUDA_UM_CPU_PAGE_FAULT_EVENTS ORDER BY start", con)

In [14]:
# kernel_overview = pd.read_sql_query("SELECT start, end, gridX, gridY, gridZ, blockX, blockY, blockZ, registersPerThread, localMemoryTotal, sharedMemoryExecuted FROM CUPTI_ACTIVITY_KIND_KERNEL", con)

# Analyze the Trace

In [15]:
# Capture duration converted to seconds (divides by 1e9, i.e. the raw value is
# treated as nanoseconds). The result is a pandas Series with one entry per
# ANALYSIS_DETAILS row, not a scalar.
runtime = analysis_overview["duration"].div(1e9)

In [18]:
# Preview the first memory-transfer records.
# Legend for the migrationCause column:
#   0: Unknown
#   1: User (e.g. cudaMemPrefetchAsync)
#   2: Coherence (to guarantee data coherence for CPU/GPU)
#   3: Prefetch (driver enabled for performance)
#   4: Eviction from GPU (full)
mem_transfer_df.head()

Unnamed: 0,start,end,virtualAddress,migrationCause,bytes,copyKind,srcKind,dstKind
0,4876800429,4876802603,140656118857728,2,4096,11,7,7
1,4876802604,4876804460,140656118861824,3,4096,11,7,7
2,4876804461,4876806507,140656118865920,2,4096,11,7,7
3,4876806508,4876810284,140656118870016,3,53248,11,7,7
4,4876810285,4876812012,140656118988800,3,4096,11,7,7


In [17]:
# Preview the first GPU page-fault events (the query sorted them by start).
gpu_page_fault_df.head()

Unnamed: 0,start,end,address,numberOfPageFaults,faultAccessType
0,4876422187,4876905901,140656118857728,3,1
1,4876908973,4877231535,140660413825024,7,1
2,4877233422,4877321231,140664708792320,31,1
3,4877322543,4877389935,140660413956096,13,1
4,4877390383,4877436496,140660414087168,3,1
