# Install cuplyr on Google Colab

GPU-accelerated dplyr for R, installed in ~5 minutes.

## Before running:
1. **Runtime → Change runtime type**
2. Set **Runtime type = R**, **Hardware accelerator = T4 GPU**
3. Click **Save**, then run the cells for Steps 1–3 in order. Step 4 onward must wait until after the runtime restart.

## Step 1: Pre-flight check

In [None]:
# Verify GPU is attached.
# The first output line of `nvidia-smi -L` is reported as the GPU. system()
# raises an R error when the command cannot be run at all, and attaches a
# 'status' attribute to the captured output when the command exits non-zero;
# both cases are folded into the same "no GPU" message below.
gpu_check <- tryCatch(
  system('nvidia-smi -L', intern = TRUE),
  error = function(e) character(0)
)
gpu_status <- attr(gpu_check, 'status')
gpu_missing <- length(gpu_check) == 0 ||
  (!is.null(gpu_status) && gpu_status != 0) ||
  grepl('error|fail', gpu_check[1], ignore.case = TRUE)
if (gpu_missing) {
  stop('No GPU detected! Go to Runtime -> Change runtime type -> GPU', call. = FALSE)
}
cat('GPU:', gpu_check[1], '\n')

# Find CUDA: CUDA_HOME becomes the first candidate directory that contains
# include/cuda.h (candidates are tried in the order listed).
cuda_candidates <- c('/usr/local/cuda', '/usr/local/cuda-12.8', '/usr/local/cuda-12.4', '/usr/local/cuda-12')
cuda_found <- cuda_candidates[file.exists(file.path(cuda_candidates, 'include', 'cuda.h'))]
if (length(cuda_found) > 0) {
  Sys.setenv(CUDA_HOME = cuda_found[1])
} else if (!nzchar(Sys.getenv('CUDA_HOME'))) {
  # Nothing found and nothing inherited from the environment: say so instead
  # of printing an empty "CUDA:" line. A warning (not an error) is used so the
  # remaining cells can still be attempted.
  warning(
    'No CUDA toolkit found (looked for include/cuda.h under: ',
    paste(cuda_candidates, collapse = ', '),
    '). CUDA_HOME is not set.',
    call. = FALSE
  )
}
cat('CUDA:', Sys.getenv('CUDA_HOME'), '\n')

# Set up C++20 for R via the user-level Makevars file.
r_makevars <- path.expand('~/.R/Makevars')
dir.create(dirname(r_makevars), showWarnings = FALSE, recursive = TRUE)
cxx20_lines <- c('CXX20=g++', 'CXX20STD=-std=gnu++20', 'CXX20FLAGS=-O2 -fPIC')

# Keep whatever is already in the file. Only earlier assignments to CXX20,
# CXX20STD or CXX20FLAGS are dropped, so re-running this cell replaces them
# rather than stacking duplicates.
if (file.exists(r_makevars)) {
  kept_lines <- readLines(r_makevars, warn = FALSE)
  kept_lines <- kept_lines[
    !grepl('^[[:space:]]*CXX20(STD|FLAGS)?[[:space:]]*[:+?]?=', kept_lines)
  ]
  cxx20_lines <- c(kept_lines, cxx20_lines)
}
writeLines(cxx20_lines, r_makevars)

## Step 2: Install mamba (package manager for RAPIDS)

In [None]:
# Install miniforge (provides mamba) unless the mamba binary is already there.
miniforge_dir <- '/opt/miniforge'
mamba <- file.path(miniforge_dir, 'bin', 'mamba')

if (!file.exists(mamba)) {
  cat('Installing Miniforge...\n')
  # system() returns the command's exit status; a failed download is reported
  # right away instead of surfacing later as a generic install failure.
  dl_status <- system('wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh')
  if (dl_status != 0) {
    stop(
      'Miniforge download failed (wget exit status ', dl_status, '). ',
      'Check network connection.',
      call. = FALSE
    )
  }
  run_status <- system(sprintf('bash /tmp/miniforge.sh -b -p %s 2>&1', miniforge_dir))
  if (run_status != 0) {
    # The presence of the mamba binary (checked below) stays the final word
    # on success; the exit status is surfaced only as extra context.
    warning('Miniforge installer exited with status ', run_status, call. = FALSE)
  }
}

stopifnot('Miniforge install failed' = file.exists(mamba))

# Add mamba to PATH so install_cuplyr() can find it.
# Any existing occurrence of the directory is removed before prepending, so
# re-running this cell does not grow PATH with duplicates.
old_path <- Sys.getenv('PATH')
miniforge_bin <- file.path(miniforge_dir, 'bin')
path_entries <- strsplit(old_path, ':', fixed = TRUE)[[1]]
path_entries <- path_entries[path_entries != miniforge_bin]
Sys.setenv(PATH = paste(c(miniforge_bin, path_entries), collapse = ':'))
cat('mamba ready\n')

## Step 3: Install cuplyr and configure environment

This clones cuplyr, installs RAPIDS via conda, builds the package, and configures R to use the RAPIDS libraries. Takes 3-5 minutes on first run.

In [None]:
# Fetch a shallow clone of the cuplyr install branch, replacing any old copy
repo_dir <- '/content/cuplyr'
repo_branch <- 'install'
repo_url <- 'https://github.com/bbtheo/cuplyr.git'
if (dir.exists(repo_dir)) {
  unlink(repo_dir, recursive = TRUE)
}

clone_args <- c('clone', '--depth', '1', '-b', repo_branch, repo_url, repo_dir)
status <- system2('git', clone_args, stdout = FALSE, stderr = FALSE)
if (status != 0) {
  stop('Git clone failed. Check network connection or branch name.', call. = FALSE)
}

# Load the installer helpers straight from the clone
for (helper_file in c('check-deps.R', 'install.R')) {
  source(file.path(repo_dir, 'R', helper_file))
}

# Run the installer with Colab-specific settings (conda method, prefix
# /opt/rapids); CUPLYR_ENV is set first so it is visible to the installer.
Sys.setenv(CUPLYR_ENV = 'colab')
install_cuplyr(repo = repo_dir, method = 'conda', conda_prefix = '/opt/rapids', verbose = TRUE)

cat('\n=== Configuring R startup environment ===\n\n')

# ~/.Renviron gets the library search paths, with the RAPIDS directory listed
# ahead of the NVIDIA/CUDA directories and any previous value.
renviron <- path.expand('~/.Renviron')
env_lines <- c(
  '# cuplyr environment configuration',
  'LD_LIBRARY_PATH="/opt/rapids/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"',
  'R_LD_LIBRARY_PATH="/opt/rapids/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${R_LD_LIBRARY_PATH}"'
)

# Carry over what is already in the file, minus any cuplyr entries written by
# an earlier run. Existing content is kept only if something other than blank
# lines remains after that filtering.
if (file.exists(renviron)) {
  prior_lines <- grep(
    'cuplyr environment configuration|^LD_LIBRARY_PATH=.*rapids|^R_LD_LIBRARY_PATH=.*rapids',
    readLines(renviron, warn = FALSE),
    invert = TRUE,
    value = TRUE
  )
  if (any(nzchar(prior_lines))) {
    env_lines <- c(prior_lines, "", env_lines)
  }
}

writeLines(env_lines, renviron)
cat('✓ Wrote', renviron, '\n')
cat('  RAPIDS libraries will be loaded first when R starts\n\n')

# Also configure R's library config (in case .Renviron isn't processed).
#
# The RAPIDS paths are appended to the existing ldpaths file as a marked
# block rather than replacing the file: the stock content is what sets up
# R's own library directories, and the block added here is what actually
# puts /opt/rapids/lib at the front of both LD_LIBRARY_PATH and
# R_LD_LIBRARY_PATH. The markers let a re-run replace the block in place.
r_ldpaths <- '/usr/local/lib/R/etc/ldpaths'
if (dir.exists(dirname(r_ldpaths))) {
  ld_begin <- '# >>> cuplyr RAPIDS paths >>>'
  ld_end <- '# <<< cuplyr RAPIDS paths <<<'
  ld_content <- c(
    ld_begin,
    # ${VAR:+:${VAR}} appends the old value only when it is non-empty, which
    # avoids a trailing ':' in the resulting search path.
    'R_LD_LIBRARY_PATH="/opt/rapids/lib:/usr/local/nvidia/lib64${R_LD_LIBRARY_PATH:+:${R_LD_LIBRARY_PATH}}"',
    'LD_LIBRARY_PATH="/opt/rapids/lib:/usr/local/nvidia/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"',
    'export LD_LIBRARY_PATH',
    'export R_LD_LIBRARY_PATH',
    ld_end
  )

  write_ok <- tryCatch({
    ld_stock <- character(0)
    if (file.exists(r_ldpaths)) {
      ld_stock <- readLines(r_ldpaths, warn = FALSE)
      # One-time backup of the file as it was before this cell touched it.
      file.copy(r_ldpaths, paste0(r_ldpaths, '.cuplyr-bak'), overwrite = FALSE)
      # Drop a block written by an earlier run, but only when its markers are
      # balanced; otherwise leave the file content untouched.
      is_begin <- ld_stock == ld_begin
      is_end <- ld_stock == ld_end
      if (any(is_begin) && sum(is_begin) == sum(is_end)) {
        in_block <- (cumsum(is_begin) - cumsum(is_end)) > 0 | is_end
        ld_stock <- ld_stock[!in_block]
      }
    }
    writeLines(c(ld_stock, ld_content), r_ldpaths)
    TRUE
  }, error = function(e) {
    # Best effort only: .Renviron remains the primary configuration.
    cat('Note: could not update', r_ldpaths, '-', conditionMessage(e), '\n\n')
    FALSE
  })

  if (write_ok) {
    cat('✓ Wrote', r_ldpaths, '\n\n')
  }
}

# Boxed reminder that the runtime has to be restarted before the next cells
banner_blank <- '║                                                            ║'
banner <- c(
  '╔════════════════════════════════════════════════════════════╗',
  banner_blank,
  '║  Installation complete! Please RESTART THE RUNTIME now.   ║',
  banner_blank,
  '║  Go to: Runtime → Restart runtime                         ║',
  banner_blank,
  '║  After restart, run the cells below to use cuplyr.        ║',
  banner_blank,
  '╚════════════════════════════════════════════════════════════╝'
)
cat(paste0(banner, '\n'), sep = '')

# Save the clone location to a file so later cells can read it back
writeLines(repo_dir, '/tmp/cuplyr_repo_dir.txt')

---

## ⚠️ RESTART RUNTIME NOW

**Go to: Runtime → Restart runtime**

Then run the cells below.

---

## Step 4: Load cuplyr (after runtime restart)

In [None]:
# Confirm the library search path is configured before loading the package
cat('Checking environment...\n')
ld_path <- Sys.getenv('LD_LIBRARY_PATH')
if (ld_path == '') {
  stop(
    paste0(
      'LD_LIBRARY_PATH not set.\n',
      'Did you restart the runtime? Go to: Runtime → Restart runtime\n',
      'Then run this cell again.'
    ),
    call. = FALSE
  )
}

# The RAPIDS directory is expected to be the first search-path entry;
# anything else is reported as a warning, not an error.
ld_parts <- unlist(strsplit(ld_path, ':', fixed = TRUE))
first_entry <- ld_parts[1]
if (first_entry != '/opt/rapids/lib') {
  warning(paste0(
    'LD_LIBRARY_PATH does not start with /opt/rapids/lib.\n',
    'First entry: ', first_entry, '\n',
    'This may cause library loading issues.'
  ))
}

cat('  ✓ LD_LIBRARY_PATH:', first_entry, '\n\n')

# Attach the package; library() errors out if it cannot be loaded
cat('Loading cuplyr...\n')
library(cuplyr)
cat('  ✓ cuplyr loaded successfully!\n\n')

# Verify installation.
# Preferred path: source R/verify.R from the cloned repository (whose location
# is read from /tmp/cuplyr_repo_dir.txt) and call verify_installation().
# If the saved path, or the script itself, is missing, fall back to a minimal
# tbl_gpu() -> collect() round trip so that some check always runs.
repo_dir_file <- '/tmp/cuplyr_repo_dir.txt'
verify_script <- character(0)
if (file.exists(repo_dir_file)) {
  repo_dir <- readLines(repo_dir_file, n = 1, warn = FALSE)
  verify_script <- file.path(repo_dir, 'R', 'verify.R')
}

# length() guard: an empty path file yields a zero-length path, which would
# otherwise make the `if` condition fail.
if (length(verify_script) == 1 && file.exists(verify_script)) {
  source(verify_script)
  verify_installation()
} else {
  cat('Note: Skipping verify_installation() (source files not found)\n')
  cat('Testing basic functionality...\n')
  test_result <- tryCatch({
    gpu <- tbl_gpu(data.frame(x = 1:5))
    collect(gpu)
    cat('  ✓ Basic GPU operations work\n')
    TRUE
  }, error = function(e) {
    cat('  ✗ Basic GPU test failed:', conditionMessage(e), '\n')
    FALSE
  })
}

## Step 5: Try cuplyr

GPU-accelerated dplyr — same syntax, runs on GPU.

In [None]:
# Filter rows and add a derived column, then pick columns, sort, and collect.
# The final collect() is the cell's displayed value.
efficient_cars <- tbl_gpu(mtcars) |>
  filter(mpg > 20) |>
  mutate(kpl = mpg * 0.425)

efficient_cars |>
  select(mpg, kpl, cyl, hp) |>
  arrange(desc(mpg)) |>
  collect()

In [None]:
# Per-cylinder summary: row count plus mean mpg and mean hp, ordered by cyl
cars_by_cyl <- group_by(tbl_gpu(mtcars), cyl)

cars_by_cyl |>
  summarise(count = n(), avg_mpg = mean(mpg), avg_hp = mean(hp)) |>
  arrange(cyl) |>
  collect()

In [None]:
# Same verbs on a table created with lazy = TRUE (lazy evaluation with query
# optimization); collect() at the end returns the result.
lazy_cars <- tbl_gpu(mtcars, lazy = TRUE)

power_by_cyl <- lazy_cars |>
  filter(mpg > 15) |>
  mutate(power_weight = hp / wt) |>
  group_by(cyl) |>
  summarise(avg_pw = mean(power_weight))

power_by_cyl |>
  arrange(desc(avg_pw)) |>
  collect()

## Troubleshooting

If the GPU is not detected, run the cell below (it loads and calls `diagnostics()`) and paste its output in a [GitHub issue](https://github.com/bbtheo/cuplyr/issues).

In [None]:
# Full diagnostics — paste this output in bug reports.
# The diagnostics script is sourced from the cloned repository, whose location
# is read from /tmp/cuplyr_repo_dir.txt.
repo_dir_file <- '/tmp/cuplyr_repo_dir.txt'
have_repo <- FALSE
if (file.exists(repo_dir_file)) {
  repo_dir <- readLines(repo_dir_file, n = 1, warn = FALSE)
  # An empty path file yields a zero-length value; treat that as "no repo".
  have_repo <- length(repo_dir) == 1 && nzchar(repo_dir)
}

if (have_repo) {
  diag_file <- file.path(repo_dir, 'R', 'diagnostics.R')
  if (file.exists(diag_file)) {
    source(diag_file)
    diagnostics(redact = FALSE)
  } else {
    cat('Diagnostics script not found at:', diag_file, '\n')
  }
} else {
  cat('Cannot find cuplyr source directory.\n')
  cat('Re-run Step 3 to clone the repository.\n')
}

# Dependency check.
# check_deps() may not be defined in this session (for example after a runtime
# restart, or when loading cuplyr failed), so it is sourced from the clone's
# R/check-deps.R when missing instead of failing with "could not find function".
if (!exists('check_deps', mode = 'function') && have_repo) {
  deps_file <- file.path(repo_dir, 'R', 'check-deps.R')
  if (file.exists(deps_file)) {
    source(deps_file)
  }
}
if (exists('check_deps', mode = 'function')) {
  check_deps()
} else {
  cat('check_deps() is not available in this session.\n')
  cat('Re-run Step 3 to clone the repository.\n')
}