In [None]:
!nvidia-smi

Wed Jan 14 15:41:23 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
Добавить комментарии ко всем практическим работам

In [None]:
%%writefile practical_4.cu

#include <cuda_runtime.h>            // Include CUDA runtime API

#include <algorithm>                 // Include algorithm utilities
#include <chrono>                    // Include timing utilities
#include <cstdlib>                   // Include std::exit / EXIT_FAILURE
#include <iostream>                  // Include standard input/output stream
#include <random>                    // Include random number generation
#include <vector>                    // Include vector container

#define BLOCK_SIZE 256               // Define number of threads per CUDA block
#define SORT_SIZE 16                 // Define size of subarrays for bubble sort

// Fills the first n elements of arr with random floats drawn from a uniform
// distribution over [0, 100).
//
// arr - destination vector; grown to n elements when it is shorter than n,
//       otherwise its size is left unchanged and only the first n elements
//       are overwritten.
// n   - number of elements to generate; values <= 0 leave arr untouched.
//
// The generator is seeded from std::random_device, so every call produces a
// different sequence.
void generateRandomArray(std::vector<float>& arr, int n) {
    if (n <= 0) {
        return;                       // Nothing requested
    }

    // Writing past the end of a too-short vector would be undefined
    // behaviour, so make sure the destination can hold n elements.
    const std::vector<float>::size_type count =
        static_cast<std::vector<float>::size_type>(n);
    if (arr.size() < count) {
        arr.resize(count);
    }

    std::random_device rd;            // Non-deterministic seed source
    std::mt19937 gen(rd());           // Mersenne Twister generator
    std::uniform_real_distribution<float> dis(0.0f, 100.0f);

    std::generate_n(arr.begin(), count, [&]() { return dis(gen); });
}

// Naive summation kernel: each thread whose global index is below n adds its
// own element of d_in directly to the single accumulator *d_out with
// atomicAdd.
//
// d_in  - device array holding at least n floats.
// d_out - device pointer to one float accumulator; this kernel only adds to
//         it, so the caller has to zero it before the launch.
// n     - number of valid elements in d_in.
//
// Launch layout: 1D grid of 1D blocks providing at least n threads in total.
__global__ void sumGlobalKernel(float* d_in, float* d_out, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in the partially filled last block have nothing to add.
    if (gid >= n) {
        return;
    }

    atomicAdd(d_out, d_in[gid]);
}

// Block-level summation kernel: every block loads its slice of d_in into a
// shared buffer, reduces it with a halving-stride tree, and thread 0 of the
// block adds the block's partial sum to *d_out with a single atomicAdd.
//
// d_in  - device array holding at least n floats.
// d_out - device pointer to one float accumulator; this kernel only adds to
//         it, so the caller has to zero it before the launch.
// n     - number of valid elements in d_in.
//
// Launch layout: 1D grid of 1D blocks. The shared buffer has BLOCK_SIZE
// entries, so blockDim.x must not exceed BLOCK_SIZE; the halving loop only
// covers every element when blockDim.x is a power of two.
__global__ void sumSharedKernel(float* d_in, float* d_out, int n) {
    __shared__ float partial[BLOCK_SIZE];

    const unsigned int lane = threadIdx.x;
    const unsigned int gidx = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute the additive identity.
    float value = 0.0f;
    if (gidx < n) {
        value = d_in[gidx];
    }
    partial[lane] = value;
    __syncthreads();

    // Tree reduction: the active half folds the upper half into itself.
    unsigned int stride = blockDim.x / 2;
    while (stride > 0) {
        if (lane < stride) {
            partial[lane] += partial[lane + stride];
        }
        __syncthreads();   // Reached by every thread, outside the branch
        stride >>= 1;
    }

    // One atomic per block publishes the block's partial sum.
    if (lane == 0) {
        atomicAdd(d_out, partial[0]);
    }
}

// Sorts d_data in independent chunks of SORT_SIZE consecutive elements.
// Each thread copies one chunk into a per-thread array, bubble-sorts it in
// ascending order and writes it back to the same location.
//
// d_data - device array holding at least n floats; modified in place.
// n      - number of valid elements in d_data. A trailing remainder of
//          n % SORT_SIZE elements is left untouched.
//
// Launch layout: 1D grid of 1D blocks providing at least n / SORT_SIZE
// threads in total; surplus threads exit immediately.
__global__ void bubbleSortLocalKernel(float* d_data, int n) {
    if (n < SORT_SIZE) {
        return;                         // No complete chunk (also covers n <= 0)
    }

    // Index arithmetic is done in size_t: with int, tid * SORT_SIZE (or the
    // subsequent + SORT_SIZE) can overflow for surplus threads of a large
    // launch and wrongly pass the bounds check.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t start_idx = tid * SORT_SIZE;

    if (start_idx + SORT_SIZE > static_cast<size_t>(n)) {
        return;                         // This thread has no complete chunk
    }

    float local_arr[SORT_SIZE];         // Per-thread working copy of the chunk

    for (int i = 0; i < SORT_SIZE; i++) {
        local_arr[i] = d_data[start_idx + i];
    }

    // Classic bubble sort: after pass i the largest i + 1 values are in
    // their final positions at the end of the chunk.
    for (int i = 0; i < SORT_SIZE - 1; i++) {
        for (int j = 0; j < SORT_SIZE - i - 1; j++) {
            if (local_arr[j] > local_arr[j + 1]) {
                float temp = local_arr[j];
                local_arr[j] = local_arr[j + 1];
                local_arr[j + 1] = temp;
            }
        }
    }

    for (int i = 0; i < SORT_SIZE; i++) {
        d_data[start_idx + i] = local_arr[i];
    }
}

// Prints a diagnostic and terminates the program when a CUDA runtime call
// fails. Without this, a failed launch or copy goes unnoticed and the
// reported timings are meaningless.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Runs one of the summation kernels twice: an untimed warm-up launch, so
// one-time initialisation cost does not land in the measurement, followed by
// the timed launch. Stores the resulting sum in *h_sum and returns the
// elapsed wall-clock time of the timed launch in milliseconds.
static double timeSumKernel(void (*kernel)(float*, float*, int),
                            float* d_in, float* d_out, int n, int blocks,
                            float* h_sum) {
    checkCuda(cudaMemset(d_out, 0, sizeof(float)), "cudaMemset (warm-up)");
    kernel<<<blocks, BLOCK_SIZE>>>(d_in, d_out, n);
    checkCuda(cudaGetLastError(), "sum kernel launch (warm-up)");
    checkCuda(cudaDeviceSynchronize(), "sum kernel execution (warm-up)");

    // The kernels only accumulate, so the output must be zeroed again.
    checkCuda(cudaMemset(d_out, 0, sizeof(float)), "cudaMemset");
    const auto begin = std::chrono::high_resolution_clock::now();
    kernel<<<blocks, BLOCK_SIZE>>>(d_in, d_out, n);
    checkCuda(cudaGetLastError(), "sum kernel launch");
    checkCuda(cudaDeviceSynchronize(), "sum kernel execution");
    const auto end = std::chrono::high_resolution_clock::now();

    checkCuda(cudaMemcpy(h_sum, d_out, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy sum to host");
    return std::chrono::duration<double, std::milli>(end - begin).count();
}

// Runs the full benchmark for an array of N random floats:
//   Task 1 - generate the input on the host and copy it to the device;
//   Task 2 - time the global-memory and shared-memory summation kernels and
//            print both GPU sums next to a CPU reference;
//   Task 3 - time the chunked bubble sort and verify every chunk on the host.
// Any CUDA failure terminates the program with a diagnostic.
// N <= 0 prints a notice and returns without touching the GPU.
void runTests(int N) {
    std::cout << "\n==========================================" << std::endl;
    std::cout << "TESTING ARRAY SIZE: N = " << N << std::endl;
    std::cout << "==========================================" << std::endl;

    if (N <= 0) {
        // A zero-block launch is an invalid configuration, so bail out early.
        std::cout << "Nothing to test: N must be positive." << std::endl;
        return;
    }

    std::vector<float> h_in(N);
    generateRandomArray(h_in, N);
    std::cout << "[Task 1] Random array generated." << std::endl;

    // CPU reference sum, accumulated in double to limit rounding error.
    double reference = 0.0;
    for (float value : h_in) {
        reference += value;
    }

    const size_t size = static_cast<size_t>(N) * sizeof(float);
    float* d_in = nullptr;
    float* d_out = nullptr;
    checkCuda(cudaMalloc(&d_in, size), "cudaMalloc d_in");
    checkCuda(cudaMalloc(&d_out, sizeof(float)), "cudaMalloc d_out");
    checkCuda(cudaMemcpy(d_in, h_in.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy input to device");

    // Ceil-division written so that it cannot overflow for large N.
    const int blocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);

    std::cout << "\n[Task 2] Reduction (Summation):" << std::endl;

    float globalSum = 0.0f;
    float sharedSum = 0.0f;
    const double globalMs =
        timeSumKernel(sumGlobalKernel, d_in, d_out, N, blocks, &globalSum);
    const double sharedMs =
        timeSumKernel(sumSharedKernel, d_in, d_out, N, blocks, &sharedSum);

    std::cout << "  - Global Memory Time: " << globalMs << " ms" << std::endl;
    std::cout << "  - Shared Memory Time: " << sharedMs << " ms" << std::endl;
    // Float atomics add in a nondeterministic order, so the sums are shown
    // for comparison rather than tested for exact equality.
    std::cout << "  - GPU sum (global): " << globalSum
              << ", GPU sum (shared): " << sharedSum
              << ", CPU reference: " << reference << std::endl;

    std::cout << "\n[Task 3] Local Bubble Sort (Subarrays of 16):" << std::endl;

    const int num_subarrays = N / SORT_SIZE;
    if (num_subarrays == 0) {
        std::cout << "  - Skipped: array is shorter than one subarray." << std::endl;
    } else {
        const int sort_threads = 128;
        const int sort_blocks = (num_subarrays + sort_threads - 1) / sort_threads;

        // Warm-up launch, then restore the unsorted input so the timed run
        // sorts the same data.
        bubbleSortLocalKernel<<<sort_blocks, sort_threads>>>(d_in, N);
        checkCuda(cudaGetLastError(), "sort kernel launch (warm-up)");
        checkCuda(cudaDeviceSynchronize(), "sort kernel execution (warm-up)");
        checkCuda(cudaMemcpy(d_in, h_in.data(), size, cudaMemcpyHostToDevice),
                  "cudaMemcpy input to device (restore)");

        const auto s3 = std::chrono::high_resolution_clock::now();
        bubbleSortLocalKernel<<<sort_blocks, sort_threads>>>(d_in, N);
        checkCuda(cudaGetLastError(), "sort kernel launch");
        checkCuda(cudaDeviceSynchronize(), "sort kernel execution");
        const auto e3 = std::chrono::high_resolution_clock::now();

        std::cout << "  - Local Memory Sorting Time: "
                  << std::chrono::duration<double, std::milli>(e3 - s3).count()
                  << " ms" << std::endl;

        // Verify on the host that every complete chunk came back ascending.
        std::vector<float> h_sorted(N);
        checkCuda(cudaMemcpy(h_sorted.data(), d_in, size, cudaMemcpyDeviceToHost),
                  "cudaMemcpy sorted data to host");
        bool allSorted = true;
        for (int s = 0; s < num_subarrays && allSorted; s++) {
            const size_t first = static_cast<size_t>(s) * SORT_SIZE;
            allSorted = std::is_sorted(h_sorted.begin() + first,
                                       h_sorted.begin() + first + SORT_SIZE);
        }
        std::cout << "  - Subarray order check: "
                  << (allSorted ? "OK" : "FAILED") << std::endl;
    }

    checkCuda(cudaFree(d_in), "cudaFree d_in");
    checkCuda(cudaFree(d_out), "cudaFree d_out");
}

// Program entry point: prints the banner, then runs the full test suite once
// for each benchmark array size, in increasing order. Always returns 0.
int main() {
    const int sizes[] = {10000, 100000, 1000000};
    const int sizeCount = static_cast<int>(sizeof(sizes) / sizeof(sizes[0]));

    std::cout << "PRACTICAL WORK №4: GPU MEMORY OPTIMIZATION" << std::endl;

    for (int k = 0; k < sizeCount; ++k) {
        runTests(sizes[k]);
    }

    return 0;
}


Writing practical_4.cu


In [None]:
!ls

cuda_sorts  cuda_sorts.cu  main  practical_4.cu  runTest  sample_data


In [None]:
!nvcc practical_4.cu -o main

In [None]:
!./main

PRACTICAL WORK №4: GPU MEMORY OPTIMIZATION

TESTING ARRAY SIZE: N = 10000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 49.3795 ms
  - Shared Memory Time: 0.004032 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.004556 ms

TESTING ARRAY SIZE: N = 100000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 0.023309 ms
  - Shared Memory Time: 0.004622 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.002546 ms

TESTING ARRAY SIZE: N = 1000000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 0.03978 ms
  - Shared Memory Time: 0.004831 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.003069 ms
