In [None]:
!nvidia-smi

Wed Dec 24 14:39:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile practical_4.cu
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <random> // For random number generation
#include <vector>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define BLOCK_SIZE 256
#define SORT_SIZE 16

// --- ЗАДАНИЕ 1: ПОДГОТОВКА ДАННЫХ ---
/**
 * Fills the first n elements of arr with random floats drawn from a uniform
 * distribution constructed over the bounds 0.0f and 100.0f.
 *
 * The generator is seeded from std::random_device, so the values differ
 * between runs.
 *
 * @param arr Destination vector. It is grown to n elements when it is shorter
 *            than n; it is never shrunk, and elements at index >= n are left
 *            untouched.
 * @param n   Number of elements to generate. Values <= 0 are a no-op.
 */
void generateRandomArray(std::vector<float>& arr, int n) {
    if (n <= 0) {
        return;
    }

    // Writing through operator[] past size() is undefined behaviour, so make
    // sure the destination can actually hold n elements before filling it.
    const std::vector<float>::size_type count =
        static_cast<std::vector<float>::size_type>(n);
    if (arr.size() < count) {
        arr.resize(count);
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(0.0f, 100.0f);

    std::generate_n(arr.begin(), count, [&]() { return dis(gen); });
}

// --- ЗАДАНИЕ 2: РЕДУКЦИЯ ---

/**
 * Baseline reduction: every element is added straight into *d_out with one
 * atomicAdd per element (no shared-memory staging).
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread walks the input
 * with a stride equal to the total number of launched threads, so the result
 * does not depend on the grid covering n exactly.
 *
 * @param d_in  Device pointer to n input floats.
 * @param d_out Device pointer to a single float accumulator; the caller must
 *              zero it before the launch.
 * @param n     Number of input elements; n <= 0 adds nothing.
 */
__global__ void sumGlobalKernel(float* d_in, float* d_out, int n) {
    // 64-bit indices: blockIdx.x * blockDim.x can exceed the range of int.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (; i < n; i += stride) {
        atomicAdd(d_out, d_in[i]);
    }
}

/**
 * Block-level reduction through shared memory: each block sums its share of
 * the input in a __shared__ buffer and thread 0 adds the block total to
 * *d_out with a single atomicAdd.
 *
 * Launch layout: 1D grid of 1D blocks.
 * Precondition: blockDim.x <= BLOCK_SIZE (the static shared buffer holds
 * BLOCK_SIZE floats). The block size does not have to be a power of two and
 * the grid does not have to cover n exactly.
 *
 * @param d_in  Device pointer to n input floats.
 * @param d_out Device pointer to a single float accumulator; the caller must
 *              zero it before the launch.
 * @param n     Number of input elements; n <= 0 adds 0 to the accumulator.
 */
__global__ void sumSharedKernel(float* d_in, float* d_out, int n) {
    __shared__ float sdata[BLOCK_SIZE];
    const unsigned int tid = threadIdx.x;

    // Per-thread partial sum. With a grid that covers n exactly each thread
    // reads at most one element; a smaller grid simply loops. Signed 64-bit
    // indices avoid both overflow and the unsigned-vs-int comparison with n.
    float partial = 0.0f;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;
         i < n; i += stride) {
        partial += d_in[i];
    }
    sdata[tid] = partial;
    __syncthreads();

    // Tree reduction over the first `active` slots. Rounding the half up keeps
    // the middle element when `active` is odd, so no value is dropped for
    // block sizes that are not powers of two. For power-of-two sizes this is
    // the usual halving sequence. The loop bounds depend only on blockDim.x,
    // so every thread reaches every barrier.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid < active - half) {
            sdata[tid] += sdata[tid + half];
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        atomicAdd(d_out, sdata[0]);
    }
}

// --- ЗАДАНИЕ 3: СОРТИРОВКА (ЛОКАЛЬНАЯ ПАМЯТЬ) ---

/**
 * Sorts d_data in independent chunks of SORT_SIZE consecutive floats, one
 * chunk per thread, in ascending order.
 *
 * Each thread copies its chunk into a thread-private array, bubble-sorts it
 * there and writes it back. Whether that array ends up in registers or in
 * local memory is decided by the compiler.
 *
 * Launch layout: 1D grid of 1D blocks; thread g handles elements
 * [g * SORT_SIZE, (g + 1) * SORT_SIZE). Threads whose chunk would extend past
 * n do nothing, so a trailing partial chunk (n % SORT_SIZE elements) is left
 * as it was.
 *
 * @param d_data Device pointer to n floats, sorted chunk-wise in place.
 * @param n      Number of elements in d_data.
 */
__global__ void bubbleSortLocalKernel(float* d_data, int n) {
    const int chunk = blockIdx.x * blockDim.x + threadIdx.x;
    const int base = chunk * SORT_SIZE;

    // Only complete chunks are processed.
    if (base + SORT_SIZE > n) {
        return;
    }

    // Stage the chunk in a thread-private buffer.
    float buf[SORT_SIZE];
    for (int k = 0; k < SORT_SIZE; ++k) {
        buf[k] = d_data[base + k];
    }

    // Bubble sort: after each pass the largest remaining value sits at
    // index `last`, so the next pass stops one position earlier.
    for (int last = SORT_SIZE - 1; last > 0; --last) {
        for (int k = 0; k < last; ++k) {
            const float left = buf[k];
            const float right = buf[k + 1];
            if (left > right) {
                buf[k] = right;
                buf[k + 1] = left;
            }
        }
    }

    // Publish the sorted chunk.
    for (int k = 0; k < SORT_SIZE; ++k) {
        d_data[base + k] = buf[k];
    }
}

// --- ТЕСТЫ ---

void runTests(int N) {
    std::cout << "\n==========================================" << std::endl;
    std::cout << "TESTING ARRAY SIZE: N = " << N << std::endl;
    std::cout << "==========================================" << std::endl;

    // 1. Генерация данных
    std::vector<float> h_in(N);
    generateRandomArray(h_in, N);
    std::cout << "[Task 1] Random array generated." << std::endl;

    size_t size = N * sizeof(float);
    float *d_in, *d_out;
    cudaMalloc(&d_in, size);
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in.data(), size, cudaMemcpyHostToDevice);

    int blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // 2. Тест редукции (Задание 2)
    std::cout << "\n[Task 2] Reduction (Summation):" << std::endl;

    // Global
    cudaMemset(d_out, 0, sizeof(float));
    auto s1 = std::chrono::high_resolution_clock::now();
    sumGlobalKernel<<<blocks, BLOCK_SIZE>>>(d_in, d_out, N);
    cudaDeviceSynchronize();
    auto e1 = std::chrono::high_resolution_clock::now();

    // Shared
    cudaMemset(d_out, 0, sizeof(float));
    auto s2 = std::chrono::high_resolution_clock::now();
    sumSharedKernel<<<blocks, BLOCK_SIZE>>>(d_in, d_out, N);
    cudaDeviceSynchronize();
    auto e2 = std::chrono::high_resolution_clock::now();

    std::cout << "  - Global Memory Time: " << std::chrono::duration<double, std::milli>(e1 - s1).count() << " ms" << std::endl;
    std::cout << "  - Shared Memory Time: " << std::chrono::duration<double, std::milli>(e2 - s2).count() << " ms" << std::endl;

    // 3. Тест сортировки (Задание 3)
    std::cout << "\n[Task 3] Local Bubble Sort (Subarrays of 16):" << std::endl;
    int num_subarrays = N / SORT_SIZE;
    int sort_threads = 128;
    int sort_blocks = (num_subarrays + sort_threads - 1) / sort_threads;

    auto s3 = std::chrono::high_resolution_clock::now();
    bubbleSortLocalKernel<<<sort_blocks, sort_threads>>>(d_in, N);
    cudaDeviceSynchronize();
    auto e3 = std::chrono::high_resolution_clock::now();

    std::cout << "  - Local Memory Sorting Time: " << std::chrono::duration<double, std::milli>(e3 - s3).count() << " ms" << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
}

/**
 * Entry point: prints the banner and runs the full test suite once for each
 * array size required by the assignment.
 */
int main() {
    // Task 4 asks for arrays of 10,000, 100,000 and 1,000,000 elements.
    const int sizes[] = {10000, 100000, 1000000};
    const int size_count = static_cast<int>(sizeof(sizes) / sizeof(sizes[0]));

    std::cout << "PRACTICAL WORK №4: GPU MEMORY OPTIMIZATION" << std::endl;

    for (int idx = 0; idx < size_count; ++idx) {
        runTests(sizes[idx]);
    }

    return 0;
}

Overwriting practical_4.cu


In [None]:
!ls

cuda_sorts  cuda_sorts.cu  main  practical_4.cu  runTest  sample_data


In [None]:
# Build native code for the Tesla T4 (compute capability 7.5) instead of relying on driver PTX JIT.
!nvcc -arch=sm_75 practical_4.cu -o main

In [None]:
!./main

PRACTICAL WORK №4: GPU MEMORY OPTIMIZATION

TESTING ARRAY SIZE: N = 10000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 8.63765 ms
  - Shared Memory Time: 0.004893 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.002685 ms

TESTING ARRAY SIZE: N = 100000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 0.023735 ms
  - Shared Memory Time: 0.003495 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.002434 ms

TESTING ARRAY SIZE: N = 1000000
[Task 1] Random array generated.

[Task 2] Reduction (Summation):
  - Global Memory Time: 0.050972 ms
  - Shared Memory Time: 0.004835 ms

[Task 3] Local Bubble Sort (Subarrays of 16):
  - Local Memory Sorting Time: 0.002358 ms
