Parallel Code: .cu file

In [84]:
%%writefile heatdiffusion.cu
// heat_diffusion.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>

#define SIZE 512
#define STEPS 100
#define ALPHA 0.25f

double get_time() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}

// CUDA kernel for heat diffusion
__global__ void diffusion_kernel(float* grid, float* new_grid, int size) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < size && y < size) {
        int idx = y * size + x;

        if (x > 0 && x < size-1 && y > 0 && y < size-1) {
            new_grid[idx] = grid[idx] + ALPHA * (
                grid[idx-1] + grid[idx+1] +
                grid[idx-size] + grid[idx+size] - 4*grid[idx]
            );
        } else {
            new_grid[idx] = grid[idx];  // Copy boundary
        }
    }
}

// CPU version of diffusion
void diffusion_cpu(float* grid, float* new_grid, int size) {
    // Copy entire grid first
    for (int i = 0; i < size*size; i++) {
        new_grid[i] = grid[i];
    }

    // Update interior points
    for (int y = 1; y < size-1; y++) {
        for (int x = 1; x < size-1; x++) {
            int idx = y * size + x;
            new_grid[idx] = grid[idx] + ALPHA * (
                grid[idx-1] + grid[idx+1] +
                grid[idx-size] + grid[idx+size] - 4*grid[idx]
            );
        }
    }
}

void init_grid(float* grid, int size) {
    for (int i = 0; i < size*size; i++) grid[i] = 0.0f;

    int center = size / 2;
    int radius = size / 8;

    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int dx = x - center;
            int dy = y - center;
            if (dx*dx + dy*dy <= radius*radius) {
                grid[y * size + x] = 1.0f;
            }
        }
    }
}

void save_ppm(float* grid, int size, const char* filename) {
    FILE* f = fopen(filename, "wb");
    fprintf(f, "P6\n%d %d\n255\n", size, size);

    for (int i = 0; i < size*size; i++) {
        unsigned char val = (unsigned char)(grid[i] * 255);
        unsigned char rgb[3] = {val, 0, 255-val};  // Red to Blue
        fwrite(rgb, 1, 3, f);
    }
    fclose(f);
}

double run_gpu(float* h_grid, int size) {
    printf("\n=== Running GPU simulation ===\n");

    float *d_grid, *d_new;
    size_t bytes = size * size * sizeof(float);

    cudaMalloc(&d_grid, bytes);
    cudaMalloc(&d_new, bytes);
    cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid_dim((size + 15)/16, (size + 15)/16);

    double start = get_time();

    for (int step = 0; step < STEPS; step++) {
        diffusion_kernel<<<grid_dim, block>>>(d_grid, d_new, size);

        float* tmp = d_grid;
        d_grid = d_new;
        d_new = tmp;

        cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);
        char filename[64];
        sprintf(filename, "frames/frame_%04d.ppm", step);
        save_ppm(h_grid, size, filename);
    }

    cudaDeviceSynchronize();
    double end = get_time();
    double elapsed = end - start;

    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_grid);
    cudaFree(d_new);

    printf("GPU: Done in %.4f seconds\n", elapsed);
    return elapsed;
}

double run_cpu(float* grid, float* new_grid, int size) {
    printf("\n=== Running CPU simulation ===\n");

    double start = get_time();

    for (int step = 0; step < STEPS; step++) {
        diffusion_cpu(grid, new_grid, size);

        // Swap buffers
        float* tmp = grid;
        grid = new_grid;
        new_grid = tmp;

        char filename[64];
        sprintf(filename, "frames/frame_%04d.ppm", step);
        save_ppm(grid, size, filename);
    }

    double end = get_time();
    double elapsed = end - start;

    printf("CPU: Done in %.4f seconds\n", elapsed);
    return elapsed;
}

int main() {
    system("mkdir -p frames");

    float* h_grid = (float*)malloc(SIZE * SIZE * sizeof(float));
    float* h_new = (float*)malloc(SIZE * SIZE * sizeof(float));

    // GPU simulation
    init_grid(h_grid, SIZE);
    double gpu_time = run_gpu(h_grid, SIZE);

    // CPU simulation (overwrites frames)
    init_grid(h_grid, SIZE);
    double cpu_time = run_cpu(h_grid, h_new, SIZE);

    printf("\n=== Performance Comparison ===\n");
    printf("GPU Time: %.4f seconds\n", gpu_time);
    printf("CPU Time: %.4f seconds\n", cpu_time);
    printf("Speedup: %.2fx faster on GPU\n", cpu_time / gpu_time);

    printf("\n=== All done! ===\n");
    printf("Convert to video with:\n");
    printf("ffmpeg -framerate 30 -i frames/frame_%%04d.ppm -c:v libx264 -pix_fmt yuv420p output.mp4\n");

    free(h_grid);
    free(h_new);
    return 0;
}

Overwriting heatdiffusion.cu


In [85]:
!nvcc heatdiffusion.cu -o heatdiffusion

          unsigned char rgb[3] = {val, 0, 255-val};
                                          ^


          unsigned char rgb[3] = {val, 0, 255-val};
                                          ^


[01m[Kheatdiffusion.cu:[m[K In function ‘[01m[Kvoid save_ppm(float*, int, const char*)[m[K’:
   79 |         unsigned char rgb[3] = {va[01;35m[Kl, 0, 255[m[K-val};  // Red to Blue
      |                                   [01;35m[K~~~~^~~~~[m[K


In [86]:
!./heatdiffusion


=== Running GPU simulation ===
GPU: Done in 1.0262 seconds

=== Running CPU simulation ===
CPU: Done in 1.7125 seconds

=== Performance Comparison ===
GPU Time: 1.0262 seconds
CPU Time: 1.7125 seconds
Speedup: 1.67x faster on GPU

=== All done! ===
Convert to video with:
ffmpeg -framerate 30 -i frames/frame_%04d.ppm -c:v libx264 -pix_fmt yuv420p output.mp4


In [87]:
!ffmpeg -framerate 30 -i frames/frame_%04d.ppm -c:v libx264 -pix_fmt yuv420p output.mp4

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab