# 🔍 Sanity Check

This notebook verifies that the Dev Container (Python 3.13 + CUDA 13) can access the GPU through PyTorch and perform GPU-accelerated computations via RAPIDS.

### GPU, CUDA, RAPIDS Sanity Check

In [None]:
# Load the cudf.pandas IPython extension.
# This must run BEFORE pandas is imported in this kernel; the import cell
# further down (which does `import pandas as pd`) relies on that ordering.
%load_ext cudf.pandas

In [None]:
# List the line and cell magics currently registered in this kernel
# (presumably to eyeball that the cudf.pandas magics were added by the extension above).
%lsmagic

In [None]:
import IPython
import pandas as pd
import cudf
import cudf.pandas
import cupy
import torch
import platform

# --- Interpreter / IPython -------------------------------------------------
print("Python:                  ", platform.python_version())
print("IPython:                 ", IPython.__version__)
# True when the `%%cudf.pandas.profile` cell magic is registered in this kernel.
print("Magics present:          ","cudf.pandas.profile" in get_ipython().magics_manager.magics['cell'])
print()

# Query CUDA availability once and reuse the answer below.
cuda_available = torch.cuda.is_available()

# --- GPU / RAPIDS details (only meaningful when a CUDA device is visible) ---
if cuda_available:
    print("GPU:                     ", torch.cuda.get_device_name(0))
    print("CUDA runtime version:    ", torch.version.cuda)
    print("cuDF:                    ", cudf.__version__)
    # Only checks that cudf.pandas exposes an `install` attribute.
    print("cudf.pandas installed:   ", hasattr(cudf.pandas, "install"))
    # cupy.show_config() writes the configuration to stdout itself and returns
    # None, so it must not be passed to print() (that would append "None").
    print("cupy config:")
    cupy.show_config()

    print()

# --- PyTorch ----------------------------------------------------------------
# Reported unconditionally so that a missing GPU shows up as "False" here
# instead of the whole section being silently skipped.
print("PyTorch:                 ", torch.__version__)
print("CUDA available (torch):  ", cuda_available)

### cuDF

In [None]:
%%cudf.pandas.profile

# cuDF accelerated data frame
df = cudf.DataFrame({"a": range(1_000_000)})
print("sum (cuDF):", int(df["a"].sum()))

In [None]:
%%cudf.pandas.profile

df = pd.DataFrame({"a": range(1_000_000)})
print("sum (pandas-accelerated):", df["a"].sum())

In [None]:
%%cudf.pandas.profile

# cuDF implicit via pandas-accelerated
print("pandas version:", pd.__version__)
df = pd.DataFrame({"x": range(10_000_000)})
_ = df.groupby(pd.cut(df["x"], 100)).x.mean()

### cuML

In [None]:
import time
import cupy as cp
import statistics as stats

from cuml.model_selection import train_test_split
from cuml.linear_model import LogisticRegression

#### Benchmark Helper

In [None]:
def benchmark_gpu(fn, *args, label=None, **kwargs):
    """
    Time a single call of ``fn`` with a host wall clock and a pair of CUDA events.

    The device is synchronized before timing starts. The wall clock is stopped
    only after the closing CUDA event has been synchronized. The three timings
    are printed (preceded by ``[label]`` when a label is given) and returned.

    Args:
        fn: Callable to time (e.g., clf.fit).
        *args, **kwargs: Forwarded unchanged to ``fn``.
        label: Optional heading printed above the timings.

    Returns:
        (result, metrics): ``result`` is whatever ``fn`` returned; ``metrics``
        is a dict with keys 'cpu_ms', 'gpu_ms' and 'overhead_ms', where
        'overhead_ms' is 'cpu_ms' minus 'gpu_ms'.
    """
    evt_begin, evt_end = cp.cuda.Event(), cp.cuda.Event()
    # Start from an idle device before either timer begins.
    cp.cuda.Device().synchronize()

    wall_begin = time.perf_counter()
    evt_begin.record()

    result = fn(*args, **kwargs)

    evt_end.record()
    evt_end.synchronize()  # wait for the closing event before reading the wall clock
    wall_end = time.perf_counter()

    cpu_ms = (wall_end - wall_begin) * 1000  # seconds -> milliseconds
    gpu_ms = cp.cuda.get_elapsed_time(evt_begin, evt_end)
    metrics = {"cpu_ms": cpu_ms, "gpu_ms": gpu_ms, "overhead_ms": cpu_ms - gpu_ms}

    if label:
        print(f"\n[{label}]")
    report_rows = (
        ("CPU wall time :", "cpu_ms"),
        ("GPU kernel time:", "gpu_ms"),
        ("Overhead      :", "overhead_ms"),
    )
    for caption, key in report_rows:
        print(f"{caption} {metrics[key]:8.3f} ms")

    return result, metrics

#### cuML Benchmark on GPU

In [None]:
# Data generation
N, D = 200_000, 20
SEED = 42
cp.random.seed(SEED)

X = cp.random.rand(N, D, dtype=cp.float32)
# Label a row 1 when the sum of its D features exceeds D / 2.
# The threshold is derived from D so it stays consistent if D is changed
# (for D = 20 this is the original threshold of 10).
y = (X.sum(axis=1) > D / 2).astype(cp.int32)
# Pin the split explicitly so the train/test partition is reproducible.
# NOTE(review): assumes train_test_split does not draw from the global CuPy RNG
# seeded above — passing random_state makes the split reproducible either way.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Logistic Regression fit benchmark
clf = LogisticRegression(max_iter=100)
_, fit_metrics = benchmark_gpu(clf.fit, X_train, y_train, label="Fit")

# Logistic Regression predict benchmark
y_pred, pred_metrics = benchmark_gpu(clf.predict, X_test, label="Predict")
# Fraction of test rows whose predicted label equals the true label.
acc = float((y_pred == y_test).mean())
print(f"\nAccuracy: {acc:.4f}")

cpu_times, gpu_times, overhead_times = [], [], []

# Repeat multiple runs (skip first for warm-up)
for i in range(6):
    _, metrics = benchmark_gpu(clf.fit, X_train, y_train, label=f"Fit {i}")
    if i == 0:
        continue  # skip warm-up: run 0 is printed but excluded from the statistics
    cpu_times.append(metrics["cpu_ms"])
    gpu_times.append(metrics["gpu_ms"])
    overhead_times.append(metrics["overhead_ms"])

def summarize(label, data):
    """
    Format one summary line (mean, median, min, max) for a series of timings.

    Args:
        label: Row name; right-aligned in a 10-character column.
        data: Non-empty sequence of numbers (milliseconds).

    Returns:
        A single string with the four statistics separated by " | ".
    """
    fields = (
        f"mean={stats.mean(data):8.3f} ms",
        f"median={stats.median(data):8.3f} ms",
        f"min={min(data):8.3f}",
        f"max={max(data):8.3f}",
    )
    return f"{label:>10}: " + " | ".join(fields)

# Header line, then one summary row per collected timing series
print(f"\nResults over {len(cpu_times)} runs (excluding warm-up):")
for series_label, series in (
    ("CPU", cpu_times),
    ("GPU", gpu_times),
    ("Overhead", overhead_times),
):
    print(summarize(series_label, series))

### Torch Sanity Check

In [None]:
# Run a GPU matrix multiplication to confirm that GPU compute works end to end.
if torch.cuda.is_available():
    # Dedicated names: reusing `x`/`y`/`z` would overwrite (and the `del` below
    # would then remove) the `y` label array created in the cuML section.
    lhs = torch.rand((10000, 10000), device="cuda")
    rhs = torch.rand((10000, 10000), device="cuda")
    product = torch.mm(lhs, rhs)
    # CUDA work is queued asynchronously. Wait for it to finish so that a failing
    # kernel raises here, before success is reported.
    torch.cuda.synchronize()
    print("✅ GPU matrix multiplication completed successfully.")
    # Drop the references to the three large tensors.
    del lhs, rhs, product
else:
    print("⚠️  Skipping GPU compute test (CUDA not available).")