<a href="https://colab.research.google.com/github/Zamo98/Taller03-Arquitectura-de-computadores-ll/blob/main/Taller03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Refresh the package index, then install the host C++ toolchain and the CUDA toolkit.
!apt-get update -qq
!apt-get install -y -qq build-essential
!apt-get install -y -qq nvidia-cuda-toolkit
# Print compiler versions; `|| true` forces a zero exit status even if nvcc is missing.
!nvcc --version || true
!g++ --version

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Extracting templates from packages: 100%
Preconfiguring packages ...
Selecting previously unselected package libdebuginfod-common.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../00-libdebuginfod-common_0.186-1ubuntu0.1_all.deb ...
Unpacking libdebuginfod-common (0.186-1ubuntu0.1) ...
Selecting previously unselected package libatspi2.0-0:amd64.
Preparing to unpack .../01-libatspi2.0-0_2.44.0-3_amd64.deb ...
Unpacking libatspi2.0-0:amd64 (2.44.0-3) ...
Selecting previously unselected package libxtst6:amd64.
Preparing to unpack .../02-libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package session-migration.
Preparing to unpack .../03-session-migration_0.3.6_amd64.deb ...
Unpacking s

In [None]:
%%writefile case_converter_SIMD.cu
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <cuda.h>

/**
 * Swaps the case of every ASCII letter in `input` and stores the result in
 * `output`; every other byte value is copied unchanged.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers all `n` bytes. No shared memory is used.
 *
 * @param input  buffer of `n` bytes to convert (read only)
 * @param output buffer receiving the `n` converted bytes
 * @param n      number of bytes to process
 */
__global__
void caseConverterKernel(const unsigned char* input, unsigned char* output, size_t n) {
    // Widen before multiplying: blockIdx.x, blockDim.x and gridDim.x are
    // 32-bit, so their products wrap for launches of 2^32 or more threads.
    // A wrapped stride of 0 would make the loop below spin forever.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread handles several elements, `stride` apart.
    for (size_t i = first; i < n; i += stride) {
        const unsigned char c = input[i];
        output[i] = (c >= 'a' && c <= 'z') ? (unsigned char)(c - 0x20) :
                   ((c >= 'A' && c <= 'Z') ? (unsigned char)(c + 0x20) : c);
    }
}

/**
 * Command-line entry point: reads `input_file`, swaps the case of every ASCII
 * letter on the GPU and writes the result to `output_file`.
 *
 * On success prints "cuda_time_ms <t>" to stdout, where <t> is the time
 * between the two CUDA events recorded around the kernel launch (the
 * host<->device copies are outside that interval).
 *
 * Exit codes: 0 success, 1 bad usage, 2 input file cannot be opened or read,
 * 3 host allocation failed, 4/5 device allocation failed, 6 host-to-device
 * copy failed, 7 device-to-host copy failed, 8 kernel launch or event timing
 * failed, 9 output file cannot be written.
 */
int main(int argc, char** argv) {
    if (argc < 3) {
        printf("usage: cuda_converter input_file output_file\n");
        return 1;
    }
    const char* infile = argv[1];
    const char* outfile = argv[2];

    std::ifstream ifs(infile, std::ios::binary | std::ios::ate);
    if (!ifs.is_open()) {
        printf("Cannot open input file\n");
        return 2;
    }
    // tellg() reports -1 on failure; converting that to size_t would request
    // an absurdly large allocation, so reject it explicitly.
    const std::streamoff endPos = ifs.tellg();
    if (endPos < 0) {
        printf("Cannot determine input file size\n");
        return 2;
    }
    const size_t size = (size_t)endPos;
    ifs.seekg(0, std::ios::beg);

    // Empty input: nothing to convert. A zero-block launch is an invalid
    // configuration and malloc(0) may return NULL, so handle it up front.
    if (size == 0) {
        std::ofstream emptyOut(outfile, std::ios::binary);
        if (!emptyOut.is_open()) {
            printf("Cannot write output file\n");
            return 9;
        }
        printf("cuda_time_ms %.6f\n", 0.0);
        return 0;
    }

    unsigned char* h_in = (unsigned char*)malloc(size);
    unsigned char* h_out = (unsigned char*)malloc(size);
    unsigned char *d_in = nullptr, *d_out = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;

    // Releases whatever has been acquired so far; used on every exit path.
    auto cleanup = [&]() {
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
        if (d_in) cudaFree(d_in);
        if (d_out) cudaFree(d_out);
        free(h_in);
        free(h_out);
    };
    // Reports a failed CUDA call, cleans up and yields the process exit code.
    auto fail = [&](const char* what, cudaError_t e, int code) {
        printf("%s failed: %s\n", what, cudaGetErrorString(e));
        cleanup();
        return code;
    };

    if (!h_in || !h_out) {
        printf("Host malloc failed\n");
        cleanup();
        return 3;
    }
    ifs.read((char*)h_in, (std::streamsize)size);
    if (!ifs || (size_t)ifs.gcount() != size) {
        printf("Cannot read input file\n");
        cleanup();
        return 2;
    }
    ifs.close();

    cudaError_t err;
    err = cudaMalloc((void**)&d_in, size);
    if (err != cudaSuccess) return fail("cudaMalloc d_in", err, 4);
    err = cudaMalloc((void**)&d_out, size);
    if (err != cudaSuccess) return fail("cudaMalloc d_out", err, 5);

    err = cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return fail("cudaMemcpy H2D", err, 6);

    const int threads = 256;
    // Ceil-div without risking overflow of `size + threads - 1`.
    const size_t wantedBlocks = size / threads + (size % threads != 0 ? 1 : 0);
    // Keep blocks * threads below 2^32 so 32-bit index arithmetic inside the
    // kernel cannot wrap; the kernel's stride loop covers any remaining bytes.
    // For inputs under 4 GiB this yields the same grid as a plain ceil-div.
    const size_t maxBlocks = 0xFFFFFFFFull / threads;
    const int blocks = (int)(wantedBlocks < maxBlocks ? wantedBlocks : maxBlocks);

    err = cudaEventCreate(&start);
    if (err != cudaSuccess) return fail("cudaEventCreate", err, 8);
    err = cudaEventCreate(&stop);
    if (err != cudaSuccess) return fail("cudaEventCreate", err, 8);

    err = cudaEventRecord(start);
    if (err != cudaSuccess) return fail("cudaEventRecord", err, 8);

    caseConverterKernel<<<blocks, threads>>>(d_in, d_out, size);
    // A launch does not return a status; configuration errors show up here.
    err = cudaGetLastError();
    if (err != cudaSuccess) return fail("Kernel launch", err, 8);

    err = cudaEventRecord(stop);
    if (err != cudaSuccess) return fail("cudaEventRecord", err, 8);
    // Waiting on `stop` also surfaces errors raised while the kernel ran.
    err = cudaEventSynchronize(stop);
    if (err != cudaSuccess) return fail("Kernel execution", err, 8);

    float ms = 0.0f;
    err = cudaEventElapsedTime(&ms, start, stop);
    if (err != cudaSuccess) return fail("cudaEventElapsedTime", err, 8);

    err = cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) return fail("cudaMemcpy D2H", err, 7);

    std::ofstream ofs(outfile, std::ios::binary);
    if (ofs.is_open()) {
        ofs.write((char*)h_out, (std::streamsize)size);
    }
    ofs.close();  // sets failbit if the file was never opened or the flush failed
    if (!ofs) {
        printf("Cannot write output file\n");
        cleanup();
        return 9;
    }

    printf("cuda_time_ms %.6f\n", (double)ms);

    cleanup();
    return 0;
}

Writing case_converter_SIMD.cu


In [None]:
%%writefile case_converter_serial.cpp
#include <iostream>
#include <fstream>
#include <chrono>
#include <cstdlib>

using namespace std;

// Copies `size` bytes from `input` to `output`, swapping the case of ASCII
// letters ('a'-'z' <-> 'A'-'Z'); all other byte values pass through unchanged.
void convertSerial(const unsigned char* input, unsigned char* output, size_t size) {
    const unsigned char* src = input;
    const unsigned char* const stop = input + size;
    unsigned char* dst = output;

    while (src != stop) {
        const unsigned char ch = *src++;
        const bool isLower = (ch >= 'a' && ch <= 'z');
        const bool isUpper = (ch >= 'A' && ch <= 'Z');
        // Upper- and lower-case ASCII letters differ only in bit 0x20, so
        // flipping that bit is the same as the -0x20 / +0x20 adjustment.
        *dst++ = (isLower || isUpper) ? (unsigned char)(ch ^ 0x20) : ch;
    }
}

// Command-line entry point: reads `input_file`, swaps the case of every ASCII
// letter with convertSerial() and writes the result to `output_file`.
//
// Prints "serial_time_ms <t>" to stdout, where <t> is the wall-clock duration
// of the convertSerial() call only (file I/O is not timed).
//
// Exit codes: 0 success, 1 bad usage, 2 input file cannot be opened or read,
// 3 allocation failed, 4 output file cannot be written.
int main(int argc, char** argv) {
    if (argc < 3) {
        std::cerr << "Usage: serial_converter input_file output_file\n";
        return 1;
    }
    const char* infile = argv[1];
    const char* outfile = argv[2];

    std::ifstream ifs(infile, std::ios::binary | std::ios::ate);
    if (!ifs.is_open()) {
        std::cerr << "Cannot open input file\n";
        return 2;
    }
    // tellg() reports -1 on failure; converting that to size_t would request
    // an absurdly large allocation, so reject it explicitly.
    const std::streamoff endPos = ifs.tellg();
    if (endPos < 0) {
        std::cerr << "Cannot determine input file size\n";
        return 2;
    }
    const size_t size = (size_t)endPos;
    ifs.seekg(0, std::ios::beg);

    // malloc(0) may legally return NULL; ask for at least one byte so an empty
    // input is not misreported as an allocation failure.
    const size_t allocSize = size ? size : 1;
    unsigned char* inbuf = (unsigned char*)std::malloc(allocSize);
    unsigned char* outbuf = (unsigned char*)std::malloc(allocSize);
    if (!inbuf || !outbuf) {
        std::cerr << "malloc failed\n";
        std::free(inbuf);   // free(NULL) is a no-op
        std::free(outbuf);
        return 3;
    }

    ifs.read((char*)inbuf, (std::streamsize)size);
    if (!ifs || (size_t)ifs.gcount() != size) {
        std::cerr << "Cannot read input file\n";
        std::free(inbuf);
        std::free(outbuf);
        return 2;
    }
    ifs.close();

    auto start = std::chrono::high_resolution_clock::now();
    convertSerial(inbuf, outbuf, size);
    auto end = std::chrono::high_resolution_clock::now();

    double ms = std::chrono::duration<double, std::milli>(end - start).count();
    std::cout << "serial_time_ms " << ms << std::endl;

    std::ofstream ofs(outfile, std::ios::binary);
    if (ofs.is_open()) {
        ofs.write((char*)outbuf, (std::streamsize)size);
    }
    ofs.close();  // sets failbit if the file was never opened or the flush failed
    const bool written = !ofs.fail();

    std::free(inbuf);
    std::free(outbuf);

    if (!written) {
        std::cerr << "Cannot write output file\n";
        return 4;
    }
    return 0;
}

Writing case_converter_serial.cpp


In [None]:
%%writefile generator.py
import random
import sys

def generate_bytes(size, percent, aligned):
    """Build a random payload of ``size`` printable-ASCII bytes.

    Each byte is, with probability ``percent``/100, a uniformly chosen ASCII
    letter; otherwise it is a uniformly chosen code in 32..126 (which may
    itself happen to be a letter). When ``aligned`` is truthy the payload is
    right-padded with spaces up to the next multiple of 32 bytes.

    Returns the payload as a ``bytes`` object.
    """
    letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def next_byte():
        # One uniform draw picks the category, a second draw picks the value.
        if random.random() < percent / 100.0:
            return ord(random.choice(letters))
        return random.randint(32, 126)

    payload = bytearray(size)
    for pos in range(size):
        payload[pos] = next_byte()

    if aligned:
        # CUDA alignment: pad to a 32-byte boundary.
        padding = -len(payload) % 32
        payload.extend(b" " * padding)
    return bytes(payload)

if __name__ == "__main__":
    # CLI: generator.py outpath size percent aligned(0/1)
    cli_args = sys.argv[1:]
    if len(cli_args) < 4:
        print("Usage: python3 generator.py outpath size percent aligned(0/1)")
        sys.exit(1)

    outpath = cli_args[0]
    size, percent, aligned = int(cli_args[1]), float(cli_args[2]), int(cli_args[3])

    buf = generate_bytes(size, percent, aligned)
    with open(outpath, "wb") as f:
        f.write(buf)
    # len(buf) may exceed `size` when alignment padding was added.
    print(f"Generated: {outpath} | Size: {len(buf)} bytes | Aligned: {aligned} | Original: {size} bytes")

Overwriting generator.py


In [None]:
%%writefile run_experiments.py
import os
import subprocess
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil

# Output locations: OUTDIR is the experiment directory, GRAPHDIR its figures subfolder.
OUTDIR = "exp_full"
GRAPHDIR = os.path.join(OUTDIR, "graphs")
for _directory in (OUTDIR, GRAPHDIR):
    os.makedirs(_directory, exist_ok=True)

# Experiment grid: 50 evenly spaced sizes, letter percentages 0..90, unaligned/aligned.
SIZES = np.linspace(100000, 10000000, num=50).astype(int)
PERCENTS = list(range(0, 100, 10))
ALIGNMENTS = [0, 1]

# External programs used by the experiment driver.
SERIAL_EXE = "./serial_converter"
CUDA_EXE = "./cuda_converter"
GEN = "python3 generator.py"

def check_gpu():
    """Report whether ``nvidia-smi`` can be run.

    Prints a one-line status and returns True when the command succeeds and
    its output decodes, False on any exception.
    """
    available = False
    try:
        subprocess.check_output(["nvidia-smi"], stderr=subprocess.STDOUT).decode()
        print("nvidia-smi OK")
        available = True
    except Exception as exc:
        print("nvidia-smi failed or not present:", exc)
    return available

def compile_all():
    """Build each converter executable that does not exist yet.

    An existing executable is left untouched (no rebuild). A failing compiler
    raises ``subprocess.CalledProcessError`` because of ``check=True``.
    """
    builds = (
        (SERIAL_EXE, "Compiling serial...",
         ["g++", "case_converter_serial.cpp", "-O2", "-o", "serial_converter"]),
        (CUDA_EXE, "Compiling cuda...",
         ["nvcc", "-std=c++17", "case_converter_SIMD.cu", "-O2", "-o", "cuda_converter"]),
    )
    for executable, banner, command in builds:
        if os.path.exists(executable):
            continue
        print(banner)
        subprocess.run(command, check=True)

def _parse_metric(stdout, key):
    """Return the float after ``key`` on the last stdout line containing it, else None."""
    value = None
    for line in stdout.splitlines():
        if key in line:
            try:
                value = float(line.split()[1])
            except (IndexError, ValueError):
                value = None
    return value


def _timed_run(exe, infile, outfile, key):
    """Run one converter; return (wall_ms, self-reported ms or None, exited_ok)."""
    t0 = time.perf_counter()
    proc = subprocess.run([exe, infile, outfile], capture_output=True, text=True)
    wall_ms = (time.perf_counter() - t0) * 1000.0
    ok = proc.returncode == 0
    if not ok:
        # Surface the failure instead of silently recording a missing timing.
        detail = (proc.stdout + proc.stderr).strip()
        print(f"[WARN] {exe} exited with code {proc.returncode}: {detail}")
    return wall_ms, _parse_metric(proc.stdout, key), ok


def _read_bytes(path):
    """Return the file's contents, or None when it cannot be read."""
    try:
        with open(path, "rb") as fh:
            return fh.read()
    except OSError:
        return None


def _write_diff(path, serial_bytes, cuda_bytes):
    """Best-effort debug file: first differing byte and/or a length mismatch."""
    lines = []
    first = next(
        (i for i, (x, y) in enumerate(zip(serial_bytes, cuda_bytes)) if x != y),
        None,
    )
    if first is not None:
        lines.append(
            f"mismatch_index {first} serial_byte {serial_bytes[first]} cuda_byte {cuda_bytes[first]}\n"
        )
    if len(serial_bytes) != len(cuda_bytes):
        lines.append(
            f"length_mismatch serial_len {len(serial_bytes)} cuda_len {len(cuda_bytes)}\n"
        )
    if not lines:
        return
    try:
        with open(path, "w") as dbgf:
            dbgf.writelines(lines)
    except OSError as exc:
        print(f"[WARN] could not write {path}: {exc}")


def run_case(s, p, a):
    """Run one experiment: generate an input, convert it both ways, compare.

    Args:
        s: requested input size in bytes.
        p: percentage passed to the generator.
        a: alignment flag (0/1) passed to the generator.

    Returns:
        dict with keys ``size``, ``percent``, ``alignment``, ``serial_ms``,
        ``cuda_ms`` (times reported by the converters, or None),
        ``serial_wall``, ``cuda_wall`` (process wall time in ms), ``speedup``
        (serial_ms / cuda_ms, or None) and ``valid`` (both converters exited
        with status 0 and produced byte-identical outputs).

    Raises:
        subprocess.CalledProcessError: if the input generator fails.
    """
    base = f"S{s}_P{p}_A{a}"
    infile = os.path.join(OUTDIR, base + ".in")
    out_serial = os.path.join(OUTDIR, base + ".serial")
    out_cuda = os.path.join(OUTDIR, base + ".cuda")

    # generate input
    subprocess.run(["python3", "generator.py", infile, str(s), str(p), str(a)], check=True)

    serial_wall, serial_evt, serial_ok = _timed_run(SERIAL_EXE, infile, out_serial, "serial_time_ms")
    cuda_wall, cuda_evt, cuda_ok = _timed_run(CUDA_EXE, infile, out_cuda, "cuda_time_ms")

    # validate (byte-wise); each output is read once and reused for the diff
    serial_bytes = _read_bytes(out_serial)
    cuda_bytes = _read_bytes(out_cuda)
    have_both = serial_bytes is not None and cuda_bytes is not None
    # A failed converter may leave a stale output from an earlier run behind,
    # so a non-zero exit status invalidates the case regardless of file content.
    valid = serial_ok and cuda_ok and have_both and serial_bytes == cuda_bytes

    if not valid and have_both:
        _write_diff(os.path.join(OUTDIR, base + ".diff.txt"), serial_bytes, cuda_bytes)

    return {
        "size": int(s),
        "percent": int(p),
        "alignment": int(a),
        "serial_ms": serial_evt,
        "cuda_ms": cuda_evt,
        "serial_wall": serial_wall,
        "cuda_wall": cuda_wall,
        "speedup": (serial_evt / cuda_evt) if (serial_evt and cuda_evt and cuda_evt > 0) else None,
        "valid": bool(valid),
    }

def main():
    """Run the full experiment grid, save the CSV and render the figures.

    For every (size, percent, alignment) combination ``run_case`` is executed
    and its row collected. Results go to ``results_full.csv`` in OUTDIR; one
    normalized timing plot per percentage and a CUDA-time heatmap go to
    GRAPHDIR. Plots for which no data exists are skipped with a warning.
    """
    print("Check GPU presence:")
    has_gpu = check_gpu()
    if not has_gpu:
        print("[WARN] no GPU detected; CUDA timings will probably be missing")
    compile_all()

    rows = []
    total = len(SIZES) * len(PERCENTS) * len(ALIGNMENTS)
    c = 0
    for s in SIZES:
        for p in PERCENTS:
            for a in ALIGNMENTS:
                c += 1
                print(f"[{c}/{total}] size={s}, percent={p}, aligned={a}")
                rows.append(run_case(s, p, a))

    df = pd.DataFrame(rows)
    csvp = os.path.join(OUTDIR, "results_full.csv")
    df.to_csv(csvp, index=False)
    print("CSV saved to", csvp)

    # A column made only of None is stored as dtype=object, on which the
    # division and pivot below raise. Coerce to float so missing timings
    # become NaN and are simply skipped when plotting.
    for col in ("serial_ms", "cuda_ms", "speedup"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Generate graphs for all 10 percentages
    def safe_plot(x, y, label):
        # Drop points whose y value is missing; x and y stay paired.
        pairs = [(xx, yy) for xx, yy in zip(list(x), list(y)) if pd.notna(yy)]
        if len(pairs) == 0:
            print("[WARN] no data for", label)
            return
        xs, ys = zip(*pairs)
        plt.plot(xs, ys, "-o", label=label)

    for P in PERCENTS:
        print("Plotting P =", P)
        df_s = df[df.percent == P].sort_values("size").reset_index(drop=True)
        df_ca = df[(df.percent == P) & (df.alignment == 1)].sort_values("size").reset_index(drop=True)
        df_cu = df[(df.percent == P) & (df.alignment == 0)].sort_values("size").reset_index(drop=True)

        if df_s.serial_ms.notnull().sum() == 0:
            print("[WARN] no serial data for percent", P)
            continue

        # Every curve is normalized by the slowest serial run of this
        # percentage; fall back to 1.0 when that maximum is zero.
        peak = df_s.serial_ms.max()
        max_serial = peak if peak else 1.0
        norm_s = df_s.serial_ms / max_serial
        norm_ca = df_ca.cuda_ms / max_serial if len(df_ca) > 0 else pd.Series([], dtype=float)
        norm_cu = df_cu.cuda_ms / max_serial if len(df_cu) > 0 else pd.Series([], dtype=float)

        plt.figure(figsize=(10,6))
        safe_plot(df_s["size"], norm_s, "Serial")
        safe_plot(df_ca["size"], norm_ca, "CUDA Aligned 32-byte")
        safe_plot(df_cu["size"], norm_cu, "CUDA Unaligned")
        plt.title(f"Comparativa serial vs cuda {P}%")
        plt.xlabel("Input Size")
        plt.ylabel("Execution time")
        plt.grid(True)
        plt.legend()
        plt.savefig(os.path.join(GRAPHDIR, f"normalized_{P}.png"), dpi=200)
        plt.close()

    # heatmap for CUDA times (percent vs size)
    if df.cuda_ms.notna().any():
        pivot = df.pivot_table(values="cuda_ms", index="size", columns="percent")
        plt.figure(figsize=(10,6))
        plt.imshow(pivot, aspect='auto', cmap="viridis")
        plt.colorbar(label="ms")
        # Label columns with the real percentages instead of 0..N-1 indices.
        plt.xticks(range(len(pivot.columns)), pivot.columns)
        plt.xlabel("Percent letters")
        plt.ylabel("Input size index")
        plt.title("CUDA Exec Time Heatmap")
        plt.savefig(os.path.join(GRAPHDIR, "heatmap_cuda.png"), dpi=200)
        plt.close()
    else:
        print("[WARN] no CUDA timings available; skipping heatmap")

    print("Graphs saved to", GRAPHDIR)
    print("Done.")

# Run the experiment driver only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Writing run_experiments.py


In [None]:
# Build with -O2, matching the flags compile_all() in run_experiments.py uses.
# compile_all() skips rebuilding when these binaries already exist, so whatever
# is built here is what actually gets benchmarked.
# -arch=sm_75 targets compute capability 7.5; adjust it for a different GPU.
!/usr/local/cuda/bin/nvcc -O2 -arch=sm_75 -o cuda_converter case_converter_SIMD.cu
!g++ -O2 -o serial_converter case_converter_serial.cpp -std=c++11
!python run_experiments.py

Check GPU presence:
nvidia-smi OK
[1/1000] size=100000, percent=0, aligned=0
Generated: exp_full/S100000_P0_A0.in | Size: 100000 bytes | Aligned: 0 | Original: 100000 bytes
[2/1000] size=100000, percent=0, aligned=1
Generated: exp_full/S100000_P0_A1.in | Size: 100000 bytes | Aligned: 1 | Original: 100000 bytes
[3/1000] size=100000, percent=10, aligned=0
Generated: exp_full/S100000_P10_A0.in | Size: 100000 bytes | Aligned: 0 | Original: 100000 bytes
[4/1000] size=100000, percent=10, aligned=1
Generated: exp_full/S100000_P10_A1.in | Size: 100000 bytes | Aligned: 1 | Original: 100000 bytes
[5/1000] size=100000, percent=20, aligned=0
Generated: exp_full/S100000_P20_A0.in | Size: 100000 bytes | Aligned: 0 | Original: 100000 bytes
[6/1000] size=100000, percent=20, aligned=1
Generated: exp_full/S100000_P20_A1.in | Size: 100000 bytes | Aligned: 1 | Original: 100000 bytes
[7/1000] size=100000, percent=30, aligned=0
Generated: exp_full/S100000_P30_A0.in | Size: 100000 bytes | Aligned: 0 | Origin