# CANデータGPU処理ベンチマーク（プロファイリング版）

メモリプロファイリングとGPU使用率モニタリングを含む詳細なベンチマーク

## 1. 環境設定とプロファイリングツールのセットアップ

In [None]:
import os
import numpy as np
import pandas as pd
import cudf
import cupy as cp
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import psutil
import rmm
from datetime import datetime
import threading
from collections import deque

# GPU monitoring tools
try:
    import pynvml
except ImportError:
    !pip install nvidia-ml-py3
    import pynvml

# Memory profiling
from rmm.statistics import ProfilerRecords, statistics

# Import decoders
from gpu_can_decoder import GPUCANDecoder
from cpu_can_decoder import CPUCANDecoder

# Initialize NVML so the GPU can be queried for its name, memory and utilization.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# Get GPU info.
gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
# Older pynvml bindings return bytes while newer ones return str; calling
# .decode() unconditionally raises AttributeError on the latter.
if isinstance(gpu_name, bytes):
    gpu_name = gpu_name.decode('utf-8')
gpu_mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
print(f"GPU: {gpu_name}")
print(f"GPU Memory: {gpu_mem_info.total / (1024**3):.1f} GB")

# Plot settings
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12
sns.set_style("whitegrid")

## 2. GPU使用率モニタリングクラス

In [None]:
class GPUMonitor:
    """Sample GPU utilization and memory usage on a background thread.

    Samples are taken every ``interval`` seconds between ``start()`` and
    ``stop()`` and kept in bounded deques (at most 10000 samples each).
    Each ``start()`` begins a fresh recording: timestamps are measured from
    the moment the sampling loop starts, so samples from a previous run are
    discarded rather than mixed with the new ones.
    """

    def __init__(self, interval=0.1):
        """Create a monitor for GPU index 0.

        Args:
            interval: Seconds to sleep between two samples.
        """
        self.interval = interval
        self.gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        self.monitoring = False
        self.timestamps = deque(maxlen=10000)
        self.gpu_utils = deque(maxlen=10000)
        self.mem_utils = deque(maxlen=10000)
        self.mem_used = deque(maxlen=10000)
        self.thread = None

    def reset(self):
        """Discard all recorded samples."""
        for buffer in (self.timestamps, self.gpu_utils,
                       self.mem_utils, self.mem_used):
            buffer.clear()

    def _monitor_loop(self):
        """Sampling loop; runs on the background thread until stopped."""
        start_time = time.time()
        while self.monitoring:
            current_time = time.time() - start_time

            # GPU utilization (percent)
            util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handle)
            self.gpu_utils.append(util.gpu)

            # Memory usage (percent of total, and absolute in GB)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
            self.mem_utils.append(100 * mem_info.used / mem_info.total)
            self.mem_used.append(mem_info.used / (1024**3))  # GB

            self.timestamps.append(current_time)
            time.sleep(self.interval)

    def start(self):
        """Start a fresh recording on a background thread.

        Calling ``start()`` while a recording is already running is a no-op,
        so a second sampling thread is never spawned.
        """
        if self.thread is not None and self.thread.is_alive():
            return
        # Timestamps restart at zero for every recording, so old samples
        # would otherwise overlap the new ones and skew the statistics.
        self.reset()
        self.monitoring = True
        # Daemon thread: a forgotten stop() must not keep the process alive.
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop the recording and wait for the sampling thread to finish."""
        self.monitoring = False
        if self.thread:
            self.thread.join()
            self.thread = None

    def get_data(self):
        """Return the recorded samples as a dict of plain lists.

        Returns:
            dict with keys ``timestamps`` (seconds since start),
            ``gpu_utils`` (percent), ``mem_utils`` (percent) and
            ``mem_used_gb`` (GB).
        """
        return {
            'timestamps': list(self.timestamps),
            'gpu_utils': list(self.gpu_utils),
            'mem_utils': list(self.mem_utils),
            'mem_used_gb': list(self.mem_used)
        }

    def plot(self, title="GPU Monitoring Results"):
        """Plot utilization and memory over time and print summary statistics.

        Args:
            title: Prefix for the title of the utilization panel.
        """
        data = self.get_data()
        if not data['timestamps']:
            print("No monitoring data available")
            return

        # Use the device's real capacity instead of assuming a 24 GB card.
        total_gb = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handle).total / (1024**3)

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        # GPU utilization
        ax1.plot(data['timestamps'], data['gpu_utils'], 'b-', linewidth=1.5)
        ax1.set_ylabel('GPU Utilization (%)')
        ax1.set_ylim(0, 105)
        ax1.grid(True, alpha=0.3)
        ax1.set_title(f"{title} - GPU Utilization")

        # Memory usage
        ax2.plot(data['timestamps'], data['mem_used_gb'], 'r-', linewidth=1.5, label='Used')
        ax2.axhline(y=total_gb, color='k', linestyle='--', alpha=0.5,
                    label=f'Total ({total_gb:.0f}GB)')
        ax2.set_xlabel('Time (seconds)')
        ax2.set_ylabel('GPU Memory (GB)')
        # Leave a little headroom above the capacity line.
        ax2.set_ylim(0, total_gb * 1.08)
        ax2.grid(True, alpha=0.3)
        ax2.legend()
        ax2.set_title("GPU Memory Usage")

        plt.tight_layout()
        plt.show()

        # Summary statistics
        print(f"\nGPU Utilization Statistics:")
        print(f"  Average: {np.mean(data['gpu_utils']):.1f}%")
        print(f"  Maximum: {np.max(data['gpu_utils']):.1f}%")
        print(f"  Minimum: {np.min(data['gpu_utils']):.1f}%")

        print(f"\nGPU Memory Statistics:")
        print(f"  Average: {np.mean(data['mem_used_gb']):.2f} GB")
        print(f"  Maximum: {np.max(data['mem_used_gb']):.2f} GB")
        print(f"  Minimum: {np.min(data['mem_used_gb']):.2f} GB")

# Create the shared monitor instance, sampling every 50 ms.
# Constructing it only looks up the NVML device handle; no sampling starts here.
monitor = GPUMonitor(interval=0.05)
print("GPU Monitor initialized")

## 3. RMM メモリプロファイリング設定

In [None]:
# Re-initialize RMM first. reinitialize() installs a new device memory
# resource, so a statistics adaptor enabled before this call would be dropped.
rmm.reinitialize(
    managed_memory=False,  # start with plain (non-managed) GPU memory
    pool_allocator=True,
    initial_pool_size=2<<30,    # 2GB
    maximum_pool_size=22<<30,   # 22GB (sized for a 24GB GPU)
    logging=True,
    # logging=True needs a log file: honor RMM_LOG_FILE when it is set,
    # otherwise fall back to a file in the working directory.
    log_file_name=os.environ.get("RMM_LOG_FILE", "rmm_log.csv"),
)

# Enable RMM statistics on top of the freshly created pool resource.
rmm.statistics.enable_statistics()

def print_rmm_statistics():
    """Print the RMM allocation statistics of the current memory resource.

    Prints a short notice instead of failing when statistics are not
    enabled, in which case ``rmm.statistics.get_statistics()`` returns None.
    """
    stats = rmm.statistics.get_statistics()
    print("\nRMM Memory Statistics:")
    if stats is None:
        print("  RMM statistics are not enabled")
        return

    gib = 1024**3
    print(f"  Current allocated: {stats.current_bytes / gib:.2f} GB")
    print(f"  Peak allocated: {stats.peak_bytes / gib:.2f} GB")
    print(f"  Total allocations: {stats.total_count}")
    # Deallocations are not tracked directly; every allocation that is no
    # longer outstanding has been freed.
    print(f"  Total deallocations: {stats.total_count - stats.current_count}")

# CuPy memory pool information
def print_cupy_memory_info():
    """Print the used and total sizes of CuPy's default memory pool in GB."""
    bytes_per_gb = 1024**3
    pool = cp.get_default_memory_pool()
    print("\nCuPy Memory Pool:")
    for label, n_bytes in (("Used", pool.used_bytes), ("Total", pool.total_bytes)):
        print(f"  {label}: {n_bytes() / bytes_per_gb:.2f} GB")

# Report the baseline memory state right after setup, before any benchmark runs.
print("Memory profiling setup complete")
print_rmm_statistics()
print_cupy_memory_info()

## 4. プロファイリング付きデータ生成関数

In [None]:
def generate_synthetic_can_data_profiled(n_messages):
    """Generate synthetic CAN messages and print how long generation took.

    A fixed share of the messages uses the addresses in
    ``address_distribution``; the rest draw uniformly from a small pool of
    other addresses. Address order is shuffled. Payloads are built with
    vectorized NumPy operations so that large message counts stay practical.

    Args:
        n_messages: Number of messages to generate.

    Returns:
        Tuple ``(timestamps, addresses, data_bytes)``:
        ``timestamps`` is a float64 array of shape ``(n_messages,)`` evenly
        spaced over 46408.0..46468.0, ``addresses`` is an int64 array of
        shape ``(n_messages,)``, and ``data_bytes`` is a uint8 array of
        shape ``(n_messages, 8)``.
    """
    # Share of the traffic taken by each fixed address.
    address_distribution = {
        170: 0.037,  # four wheel speeds
        37: 0.037,   # steering
        36: 0.037,
        740: 0.044,
        608: 0.022,
        180: 0.018,
    }
    other_address_pool = [452, 466, 467, 705, 321, 562]
    steering_payload = np.array(
        [0x00, 0x00, 0x10, 0x00, 0xC0, 0x00, 0x00, 0xFD], dtype=np.uint8
    )

    print(f"\nGenerating {n_messages:,} messages...")
    start_time = time.perf_counter()

    # Fixed-share addresses, then random filler for the remainder.
    address_blocks = [
        np.full(int(n_messages * prob), addr, dtype=np.int64)
        for addr, prob in address_distribution.items()
    ]
    remaining = n_messages - sum(len(block) for block in address_blocks)
    address_blocks.append(
        np.random.choice(other_address_pool, remaining).astype(np.int64)
    )
    addresses = np.concatenate(address_blocks)
    np.random.shuffle(addresses)

    # Timestamps spanning roughly 60 seconds.
    timestamps = np.linspace(46408.0, 46468.0, n_messages)

    # Start with random payloads everywhere, then overwrite the rows whose
    # address has a defined layout.
    data_bytes = np.random.randint(0, 256, (n_messages, 8), dtype=np.uint8)

    # Wheel speed: four big-endian 16-bit values, raw = (km/h + 67.67) / 0.01.
    wheel_rows = addresses == 170
    speeds_kmh = np.random.uniform(55, 65, (int(wheel_rows.sum()), 4))
    raw_values = ((speeds_kmh + 67.67) / 0.01).astype(np.int64)
    data_bytes[wheel_rows, 0::2] = ((raw_values >> 8) & 0xFF).astype(np.uint8)
    data_bytes[wheel_rows, 1::2] = (raw_values & 0xFF).astype(np.uint8)

    # Steering: constant payload.
    data_bytes[addresses == 37] = steering_payload

    gen_time = time.perf_counter() - start_time
    data_size_mb = (timestamps.nbytes + addresses.nbytes + data_bytes.nbytes) / (1024**2)
    # Guard against a zero elapsed time on coarse timers / tiny inputs.
    throughput = n_messages / gen_time / 1e6 if gen_time > 0 else float('inf')

    print(f"  Generation time: {gen_time:.2f} seconds")
    print(f"  Data size: {data_size_mb:.1f} MB")
    print(f"  Throughput: {throughput:.1f} Mmessages/sec")

    return timestamps, addresses, data_bytes

## 5. プロファイリング付きベンチマーク実行

In [None]:
# Initialize the decoders under test.
gpu_decoder = GPUCANDecoder(batch_size=1_000_000)
cpu_decoder = CPUCANDecoder()

# Message counts to benchmark.
test_sizes = [100_000, 1_000_000, 10_000_000]
benchmark_results = []

for n_messages in test_sizes:
    print(f"\n{'='*60}")
    print(f"Testing with {n_messages:,} messages")
    print(f"{'='*60}")

    # Release memory left over from the previous run.
    gc.collect()
    cp.get_default_memory_pool().free_all_blocks()

    # Memory state before the run.
    print("\nInitial memory state:")
    print_rmm_statistics()
    print_cupy_memory_info()

    # Generate the input data.
    timestamps, addresses, data_bytes = generate_synthetic_can_data_profiled(n_messages)

    # GPU processing (with monitoring).
    print("\n--- GPU Processing ---")
    # Use a fresh monitor per run so samples from earlier sizes cannot leak
    # into this run's plot and statistics.
    monitor = GPUMonitor(interval=monitor.interval)
    monitor.start()

    gpu_start = time.perf_counter()
    try:
        # `statistics` imported from rmm.statistics is a context-manager
        # function, not the module, so the profiler is reached via rmm.statistics.
        with rmm.statistics.profiler(name="GPU_decode"):
            gpu_results = gpu_decoder.decode_batch(timestamps, addresses, data_bytes)
            # Wait for outstanding GPU work so the timing covers it.
            cp.cuda.Stream.null.synchronize()
        gpu_time = time.perf_counter() - gpu_start
    finally:
        # Always stop the sampling thread, even if decoding raised.
        monitor.stop()

    # Count decoded rows across all returned frames.
    n_decoded_gpu = sum(len(df) for df in gpu_results.values() if df is not None)

    print(f"\nGPU Results:")
    print(f"  Processing time: {gpu_time:.3f} seconds")
    print(f"  Throughput: {n_messages / gpu_time / 1e6:.1f} Mmessages/sec")
    print(f"  Decoded messages: {n_decoded_gpu:,}")

    # Memory state after the GPU run.
    print("\nPost-GPU memory state:")
    print_rmm_statistics()
    print_cupy_memory_info()

    # Plot GPU utilization for this run.
    monitor.plot(title=f"GPU Processing - {n_messages:,} messages")

    # CPU processing (small datasets only).
    cpu_results = None
    cpu_time = None
    if n_messages <= 1_000_000:
        print("\n--- CPU Processing ---")
        cpu_start = time.perf_counter()
        cpu_results = cpu_decoder.decode_batch(timestamps, addresses, data_bytes)
        cpu_time = time.perf_counter() - cpu_start

        n_decoded_cpu = sum(len(df) for df in cpu_results.values() if df is not None)

        print(f"\nCPU Results:")
        print(f"  Processing time: {cpu_time:.3f} seconds")
        print(f"  Throughput: {n_messages / cpu_time / 1e6:.1f} Mmessages/sec")
        print(f"  Decoded messages: {n_decoded_cpu:,}")
        print(f"\nSpeedup: {cpu_time / gpu_time:.1f}x")
    else:
        print("\nCPU processing skipped for large dataset")

    # A very short run may finish before the monitor takes a sample;
    # record NaN in that case instead of failing on an empty sequence.
    monitor_data = monitor.get_data()
    gpu_util_samples = monitor_data['gpu_utils']
    mem_used_samples = monitor_data['mem_used_gb']
    has_samples = bool(gpu_util_samples)
    nan = float('nan')

    # Record the results of this run.
    result = {
        'n_messages': n_messages,
        'gpu_time': gpu_time,
        'cpu_time': cpu_time,
        'speedup': cpu_time / gpu_time if cpu_time is not None else None,
        'gpu_throughput_mmsg': n_messages / gpu_time / 1e6,
        'n_decoded_gpu': n_decoded_gpu,
        'gpu_util_avg': np.mean(gpu_util_samples) if has_samples else nan,
        'gpu_util_max': np.max(gpu_util_samples) if has_samples else nan,
        'mem_used_avg_gb': np.mean(mem_used_samples) if has_samples else nan,
        'mem_used_max_gb': np.max(mem_used_samples) if has_samples else nan
    }
    benchmark_results.append(result)

    # Cleanup.
    del timestamps, addresses, data_bytes, gpu_results, cpu_results
    gc.collect()
    cp.get_default_memory_pool().free_all_blocks()

## 6. 結果の分析と可視化

In [None]:
# Turn the per-run result dicts into a table and print it.
benchmark_df = pd.DataFrame(benchmark_results)

rule = "=" * 60
print("\n" + rule)
print("BENCHMARK SUMMARY")
print(rule)
print(benchmark_df.to_string(index=False))

# 2x2 grid of performance plots, all against the number of messages.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
sizes = benchmark_df['n_messages']
marker_style = {'linewidth': 2, 'markersize': 8}

# GPU utilization vs data size
ax1.plot(sizes, benchmark_df['gpu_util_avg'], 'b-o', label='Average', **marker_style)
ax1.plot(sizes, benchmark_df['gpu_util_max'], 'r--o', label='Maximum', **marker_style)

# Memory usage vs data size
ax2.plot(sizes, benchmark_df['mem_used_avg_gb'], 'g-o', label='Average', **marker_style)
ax2.plot(sizes, benchmark_df['mem_used_max_gb'], 'm--o', label='Maximum', **marker_style)

# Throughput vs data size
ax3.plot(sizes, benchmark_df['gpu_throughput_mmsg'], 'c-o', **marker_style)

# Processing time scaling; the CPU curve covers only the runs that have a CPU time.
ax4.plot(sizes, benchmark_df['gpu_time'], 'b-o', label='GPU', **marker_style)
cpu_rows = benchmark_df['cpu_time'].notna()
if cpu_rows.any():
    ax4.plot(benchmark_df.loc[cpu_rows, 'n_messages'],
             benchmark_df.loc[cpu_rows, 'cpu_time'],
             'r--o', label='CPU', **marker_style)
ax4.set_yscale('log')

# Shared axis decoration: (axis, y label, title, whether to draw a legend).
panels = (
    (ax1, 'GPU Utilization (%)', 'GPU Utilization vs Data Size', True),
    (ax2, 'GPU Memory (GB)', 'GPU Memory Usage vs Data Size', True),
    (ax3, 'Throughput (Mmessages/sec)', 'GPU Throughput Scaling', False),
    (ax4, 'Processing Time (seconds)', 'Processing Time Scaling', True),
)
for axis, y_label, heading, show_legend in panels:
    axis.set_xlabel('Number of Messages')
    axis.set_ylabel(y_label)
    axis.set_title(heading)
    axis.set_xscale('log')
    if show_legend:
        axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. ボトルネック分析

In [None]:
print("=== Performance Bottleneck Analysis ===")

# GPU utilization analysis
avg_gpu_util = benchmark_df['gpu_util_avg'].mean()
print(f"\n1. GPU Utilization:")
print(f"   Average across all tests: {avg_gpu_util:.1f}%")

if avg_gpu_util < 50:
    print("   ⚠️  Low GPU utilization detected!")
    print("   Potential causes:")
    print("   - Small batch sizes")
    print("   - CPU-GPU data transfer overhead")
    print("   - Kernel launch overhead")
    print("   Recommendations:")
    print("   - Increase batch size")
    print("   - Use pinned memory for transfers")
    print("   - Implement kernel fusion")
elif avg_gpu_util < 80:
    print("   ⚠️  Moderate GPU utilization")
    print("   Room for optimization exists")
else:
    print("   ✓ Good GPU utilization")

# Iterate the columns directly instead of iterrows(): iterrows() upcasts
# every row to a single dtype, which turns the integer message counts into
# floats and prints them as e.g. "100,000.0".
run_sizes = [int(count) for count in benchmark_df['n_messages']]
run_gpu_times = list(benchmark_df['gpu_time'])

# Memory bandwidth estimate
print(f"\n2. Memory Bandwidth Analysis:")
bytes_per_message = 24  # 8 (timestamp) + 8 (address) + 8 (payload) bytes of input
theoretical_max = 900  # GB/s for modern GPUs
for msg_count, run_gpu_time in zip(run_sizes, run_gpu_times):
    data_size_gb = (msg_count * bytes_per_message) / (1024**3)
    bandwidth_gb = data_size_gb / run_gpu_time
    efficiency = (bandwidth_gb / theoretical_max) * 100

    print(f"   {msg_count:,} messages:")
    print(f"     Achieved: {bandwidth_gb:.1f} GB/s")
    print(f"     Efficiency: {efficiency:.1f}%")

# Kernel efficiency analysis
print(f"\n3. Kernel Efficiency:")
print(f"   Messages per kernel launch: {gpu_decoder.batch_size:,}")
print(f"   Estimated kernel launches:")
for msg_count in run_sizes:
    # Ceiling division: a partial final batch still needs one launch.
    n_launches = (msg_count + gpu_decoder.batch_size - 1) // gpu_decoder.batch_size
    print(f"     {msg_count:,} messages: {n_launches} launches")

# Recommendations
print(f"\n4. Optimization Recommendations:")
print("   Based on the profiling results:")

if avg_gpu_util < 50:
    print("   - Priority: Increase GPU utilization")
    print("   - Consider using CUDA streams for overlap")
    print("   - Implement double buffering")

if benchmark_df['mem_used_max_gb'].max() > 20:
    print("   - Memory usage is high, consider:")
    print("     - Streaming processing for large datasets")
    print("     - More aggressive memory pooling")

print("\n   - General optimizations:")
print("     - Use CUDA graphs for kernel launch overhead reduction")
print("     - Implement kernel fusion for related operations")
print("     - Consider mixed precision (FP16) where applicable")

## 8. 詳細なRMMプロファイリング結果

In [None]:
# Detailed view of the RMM profiler records.
print("=== Detailed RMM Memory Profile ===")

# Blocks profiled with rmm.statistics.profiler() are collected in the
# module-level default_profiler_records; its `records` property maps each
# profiled name to a record with num_calls, memory_total and memory_peak.
records = rmm.statistics.default_profiler_records.records

if records:
    gib = 1024**3
    # Convert the records into a DataFrame (sizes in GB).
    profile_data = [
        {
            'name': name,
            'num_calls': record.num_calls,
            'total_bytes': record.memory_total / gib,
            'peak_bytes': record.memory_peak / gib,
            'avg_bytes': (record.memory_total / record.num_calls / gib)
                         if record.num_calls > 0 else 0,
        }
        for name, record in records.items()
    ]

    profile_df = pd.DataFrame(profile_data)
    profile_df = profile_df.sort_values('total_bytes', ascending=False)

    print("\nMemory allocation by operation:")
    print(profile_df.to_string(index=False))

    # Visualization
    if not profile_df.empty:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Total bytes by operation
        profile_df.plot(x='name', y='total_bytes', kind='bar', ax=ax1)
        ax1.set_ylabel('Total Memory (GB)')
        ax1.set_title('Total Memory Allocation by Operation')
        ax1.tick_params(axis='x', rotation=45)

        # Peak bytes by operation
        profile_df.plot(x='name', y='peak_bytes', kind='bar', ax=ax2, color='orange')
        ax2.set_ylabel('Peak Memory (GB)')
        ax2.set_title('Peak Memory Usage by Operation')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()
else:
    print("No profiling records available")

# Final memory state as seen by RMM and CuPy.
print("\n=== Final Memory State ===")
print_rmm_statistics()
print_cupy_memory_info()

# Final check of device memory as reported by NVML.
mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
bytes_per_gb = 1024**3
total_gb = mem_info.total / bytes_per_gb
used_gb = mem_info.used / bytes_per_gb
free_gb = mem_info.free / bytes_per_gb
print(f"\nGPU Memory (NVML):")
print(f"  Total: {total_gb:.1f} GB")
print(f"  Used: {used_gb:.2f} GB")
print(f"  Free: {free_gb:.2f} GB")