In [None]:
!git clone https://github.com/andreinechaev/nvcc4jupyter.git
!pip install ./nvcc4jupyter
%load_ext nvcc4jupyter

In [9]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#define N 10000000
#define MAX_ERR 1e-6
/**
 * Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
 *
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads, so the whole range is covered exactly once for
 * any 1D launch configuration. With a single block (e.g. <<<1, 256>>>) this
 * is the same per-thread work split as striding by blockDim.x.
 *
 * @param out destination array, at least n elements
 * @param a   first operand, at least n elements
 * @param b   second operand, at least n elements
 * @param n   number of elements to add; nothing is written when n <= 0
 */
__global__ void vector_add(float *out, float *a, float *b, int n) {
    // 64-bit signed indices: blockIdx.x * blockDim.x and i + stride cannot
    // wrap, and the comparison against a negative n stays well-defined.
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = first; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}
// What else to change?
// Executing kernel
// vector_add<<<1,256>>>(d_out, d_a, d_b, N);
// Report a failed CUDA runtime call and terminate; does nothing on cudaSuccess.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Adds two N-element host vectors on the GPU and verifies the result on the host.
 *
 * Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE when a host
 * allocation or the verification fails. A failing CUDA call terminates the
 * process through check_cuda().
 */
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    int status = EXIT_SUCCESS;

    // Allocate host memory
    a   = (float*)malloc(bytes);
    b   = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc d_out");

    // Transfer data from host to device memory
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Executing kernel (1 block of 256 threads)
    vector_add<<<1,256>>>(d_out, d_a, d_b, N);
    // A kernel launch returns nothing; configuration errors surface here
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory (blocking copy, also reports kernel faults)
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

    // Verification: explicit check so it still runs when NDEBUG disables assert()
    for(int i = 0; i < N; i++){
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n", i, out[i], a[i] + b[i]);
            status = EXIT_FAILURE;
            break;
        }
    }
    if (status == EXIT_SUCCESS) {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    }

    // Deallocate device memory
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    check_cuda(cudaFree(d_out), "cudaFree d_out");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return status;
}

out[0] = 3.000000
PASSED



In [None]:
# # check cuda version
# !nvcc --version

# # check compute capability
# !nvidia-smi --query-gpu=compute_cap --format=csv

In [None]:
%%writefile kernal.cu

// Ahmed Sayed
// Sec:    1 - BN:     3
// Code:   9202111

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define MAX_ERR 1e-6
#define BLOCK_SIZE 5
#define MAX_LEVEL 4
#define debug 0
/**
 * @brief Binary search for an element in the vector
 * @param level : current level of the recursion calls
 * @param start : start index of the segment
 * @param end : end index of the segment
 * @param res : the final result of the binary search: -1 if not found, otherwise an index between 0 and n-1
 * @param search_value : value to be searched
 * @param numbers : input numbers
 */
// __global__ void binary_search(float*numbers,float* search_value,int* res,int start , int end ,int level)
// {
//     if(end<start)
//         return;
//     if(level==0)
//       *res =-1;

//     __syncthreads();
//     int mid = (start+end)/2;
//     if(fabs(numbers[mid]-*val)<=MAX_ERR)
//     {
//         *res=mid;
//         return;
//     }
//     else if(numbers[mid] < *val){
//         start = mid+1;
//     }else {
//         end= mid-1;
//     }
//     binary_search<<< 1 , 1 >>>(numbers,search_value,res,start,end,level );
//     __syncthreads();

// }

/**
 * @brief Multi-way search for a value in an ascending array, narrowing the
 *        range with one recursive kernel launch per level.
 *
 * At every level below MAX_LEVEL the inclusive range [start, end] is split
 * into blockDim.x segments and each thread probes the last element of its
 * segment. A probe within MAX_ERR of the value stores its index in *res.
 * Otherwise the block finds the first probe larger than the value and
 * thread 0 launches the kernel again on that segment. At MAX_LEVEL a plain
 * sequential binary search finishes the remaining range.
 *
 * Only threadIdx.x is used to divide the work, and the first launch must pass
 * level == 1 so that *res is initialised to -1. The kernel launches itself
 * from device code, so it has to be built with relocatable device code
 * (-rdc=true -lcudadevrt).
 *
 * @param numbers      input values; the comparisons assume ascending order
 * @param search_value pointer to the value to look for
 * @param res          output: index of a matching element, or -1 if none
 * @param start        first index of the range to search (inclusive)
 * @param end          last index of the range to search (inclusive)
 * @param level        current recursion level, 1 for the initial launch
 */
__global__ void binary_search(float *numbers, float *search_value, int *res, int start, int end, int level)
{
    // The initial launch resets the result to "not found".
    if (level == 1 && threadIdx.x == 0)
    {
        *res = -1;
    }
    // Empty range. start/end are kernel arguments, so all threads leave together.
    if (end < start)
        return;

    // Lowest thread index whose probe is larger than the searched value;
    // defaults to the last thread, i.e. the last segment.
    __shared__ int first_larger;
    if (threadIdx.x == 0)
    {
        first_larger = blockDim.x - 1;
    }
    // Publishes first_larger (and the reset of *res) before any thread uses them.
    __syncthreads();

    if (level == MAX_LEVEL)
    {
        // Deepest level: ordinary binary search over what is left of the range.
        int l = start, r = end;
        int mid;
        while (l <= r)
        {
            mid = (l + r) / 2;

            if (fabs(numbers[mid] - *search_value) <= MAX_ERR)
            {
                *res = mid;
                return;
            }
            else if (numbers[mid] > *search_value)
            {
                r = mid - 1;
            }
            else
            {
                l = mid + 1;
            }
        }
    }
    else
    {
        const int tid = threadIdx.x;
        const int last_thread = blockDim.x - 1;
        int size = end - start + 1;
        int seg_size = size / blockDim.x;
        // Each thread probes the upper boundary of its segment; the last
        // thread takes `end` so the remainder of the division is covered.
        int index = (tid < last_thread) ? (start + (tid + 1) * seg_size) : end;

        if (fabs(numbers[index] - *search_value) <= MAX_ERR)
        {
            *res = index;
        }
        else if (numbers[index] > *search_value)
        {
            // One atomic min replaces a spin lock around a compare-and-store:
            // a busy-wait lock contended by threads of the same warp can hang
            // on GPUs that execute a warp in lockstep.
            atomicMin(&first_larger, tid);
        }
        // All probes are done: *res and first_larger are final for this level.
        __syncthreads();
        if (*res != -1)
            return;

        int new_end = (first_larger < last_thread) ? (start + (first_larger + 1) * seg_size) : end;
        int new_start = start + first_larger * seg_size;
        // The deepest level runs a sequential search, so one thread is enough there.
        int blockdim = (level + 1 < MAX_LEVEL) ? blockDim.x : 1;

        if (tid == 0)
        {
            binary_search<<<gridDim.x, blockdim>>>(numbers, search_value, res, new_start, new_end, level + 1);
        }
    }
}

// Print a diagnostic and terminate if a CUDA runtime call did not succeed.
static void cuda_check(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Usage: <program> <file> <value>
 *
 * Reads whitespace-separated floats from <file>, searches them on the GPU for
 * <value> with binary_search, and prints the index that was found, or -1.
 * Returns 0 on success and 1 on a usage, file or host-allocation error; a
 * failing CUDA call terminates the process through cuda_check().
 */
int main(int argc, char *argv[])
{
    if (argc < 3)
    {
        fprintf(stderr, "number of parameters must be 3\n");
        return 1;
    }

    // read the numbers from the file
    FILE *data = fopen(argv[1], "r");
    if (data == NULL)
    {
        fprintf(stderr, "Cannot open the file\n");
        return 1;
    }

    // read the value to be searched
    float value = atof(argv[2]);

    // Count the number of floats in the file
    int size = 0;
    float temp;
    while (fscanf(data, "%f", &temp) == 1)
    {
        size++;
    }

    // Allocate memory for the array. malloc(0) is allowed to return NULL, so an
    // empty file still requests room for one element instead of looking like
    // an allocation failure.
    const size_t bytes = (size_t)size * sizeof(float);
    const size_t alloc_bytes = (bytes > 0) ? bytes : sizeof(float);
    float *numbers = (float *)malloc(alloc_bytes);
    int *res = (int *)malloc(1 * sizeof(int));
    if (numbers == NULL || res == NULL)
    {
        fprintf(stderr, "Memory allocation failed\n");
        // free(NULL) is a no-op, so whichever allocation succeeded is released
        free(numbers);
        free(res);
        fclose(data);
        return 1;
    }

    // Reset the file pointer to the beginning of the file
    rewind(data);

    // Read the floats into the array
    for (int i = 0; i < size; i++)
    {
        if (fscanf(data, "%f", &numbers[i]) != 1)
        {
            fprintf(stderr, "Error reading the file\n");
            free(numbers);
            free(res);
            fclose(data);
            return 1;
        }
    }
    // Close the file
    fclose(data);

    /* CUDA */
    float *cuda_numbers;
    float *cuda_value;
    int *cuda_out;
    cuda_check(cudaMalloc((void **)&cuda_numbers, alloc_bytes), "cudaMalloc numbers");
    if (bytes > 0)
    {
        cuda_check(cudaMemcpy(cuda_numbers, numbers, bytes, cudaMemcpyHostToDevice), "copy numbers to device");
    }
    cuda_check(cudaMalloc((void **)&cuda_value, 1 * sizeof(float)), "cudaMalloc value");
    cuda_check(cudaMemcpy(cuda_value, &value, 1 * sizeof(float), cudaMemcpyHostToDevice), "copy value to device");
    cuda_check(cudaMalloc((void **)&cuda_out, 1 * sizeof(int)), "cudaMalloc result");

    // call the kernel
    binary_search<<<1, BLOCK_SIZE>>>(cuda_numbers, cuda_value, cuda_out, 0, size - 1, 1);
    // launches return nothing; configuration errors are reported here
    cuda_check(cudaGetLastError(), "binary_search launch");

    // copy the result from the gpu to cpu (blocking, also reports kernel faults)
    cuda_check(cudaMemcpy(res, cuda_out, sizeof(int), cudaMemcpyDeviceToHost), "copy result to host");

    printf("%d", *res);

    // only for debug: search for every element of the input and count the misses

    if (debug == 1)
    {
        int error = 0;
        for (int i = 0; i < size; i++)
        {
            value = numbers[i];
            cuda_check(cudaMemcpy(cuda_value, &value, 1 * sizeof(float), cudaMemcpyHostToDevice), "copy value to device");
            // call the kernel
            binary_search<<<1, BLOCK_SIZE>>>(cuda_numbers, cuda_value, cuda_out, 0, size - 1, 1);
            cuda_check(cudaGetLastError(), "binary_search launch");

            // copy the result from the gpu to cpu
            cuda_check(cudaMemcpy(res, cuda_out, sizeof(int), cudaMemcpyDeviceToHost), "copy result to host");

            // A result of -1 means "not found"; it must not be used as an index.
            if (*res < 0 || fabs(numbers[*res] - numbers[i]) > MAX_ERR)
            {
                printf("error at index %d and value %f\n", i, value);
                error++;
            }
        }
        printf("\nnumber of errors is %d\n", error);
    }

    // free gpu memory
    cuda_check(cudaFree(cuda_numbers), "cudaFree numbers");
    cuda_check(cudaFree(cuda_value), "cudaFree value");
    cuda_check(cudaFree(cuda_out), "cudaFree result");

    // free cpu memory
    free(numbers);
    free(res);

    return 0;
}

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>

#define N 10000000
#define MAX_ERR 1e-6
// Serial baseline: every thread that runs this kernel walks all n elements
// and stores a[k] + b[k] into out[k]. main launches it with <<<1,1>>>.
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int k = 0;
    while (k < n) {
        out[k] = a[k] + b[k];
        ++k;
    }
}
// Report a failed CUDA runtime call and terminate; does nothing on cudaSuccess.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Runs the serial vector_add kernel on N elements and verifies the result on
 * the host.
 *
 * Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE when a host
 * allocation or the verification fails. A failing CUDA call terminates the
 * process through check_cuda().
 */
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    int status = EXIT_SUCCESS;

    // Allocate host memory
    a   = (float*)malloc(bytes);
    b   = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc d_out");

    // Transfer data from host to device memory
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Executing kernel (1block, 1thread: no parallelism)
    vector_add<<<1,1>>>(d_out, d_a, d_b, N);
    // A kernel launch returns nothing; configuration errors surface here
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory (blocking copy, also reports kernel faults)
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

    // Verification: explicit check so it still runs when NDEBUG disables assert()
    for(int i = 0; i < N; i++){
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n", i, out[i], a[i] + b[i]);
            status = EXIT_FAILURE;
            break;
        }
    }
    if (status == EXIT_SUCCESS) {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    }

    // Deallocate device memory
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    check_cuda(cudaFree(d_out), "cudaFree d_out");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return status;
}

In [None]:
%%cu
#include <iostream>
// Smoke test for the toolchain: writes a fixed greeting to standard output.
int main()
{
    const char *greeting = "Hello world\n";
    std::cout << greeting;
    return 0;
}


In [None]:
import random

def generate_sorted_floats(n, filename):
    """Write ``n`` random floats to ``filename`` in ascending order, one per line.

    Each value is drawn with ``random.uniform(0, 100)`` and rounded to two
    decimal places. An existing file is overwritten.
    """
    # Draw the values one at a time
    drawn = []
    for _ in range(n):
        drawn.append(round(random.uniform(0, 100), 2))

    # Ascending order, as required by the binary search that reads the file
    ordered = sorted(drawn)

    # One value per line
    with open(filename, 'w') as out:
        out.writelines(f"{value}\n" for value in ordered)

def binary_search(value, filename):
    """Look up ``value`` in a file of ascending floats, one number per line.

    Returns the 0-based position of an element equal to ``value``, or -1 when
    no element compares equal.
    """
    numbers = []
    with open(filename, 'r') as handle:
        for raw in handle:
            numbers.append(float(raw.strip()))

    lo = 0
    hi = len(numbers) - 1

    # Halve the inclusive window [lo, hi] until it is empty
    while not lo > hi:
        mid = (lo + hi) // 2
        probe = numbers[mid]
        if probe == value:
            return mid
        if probe < value:
            lo = mid + 1
        else:
            hi = mid - 1

    return -1

if __name__ == "__main__":
    # Step 1: generate the sorted data file with the requested number of values
    n = int(input("Enter the number of float numbers to generate: "))
    filename = "float_numbers.txt"
    generate_sorted_floats(n, filename)

    # Step 2: look up a user-supplied value in that file
    search_value = float(input("Enter the value to search: "))
    pos = binary_search(search_value, filename)

    # Step 3: report the outcome (-1 means the value is not in the file)
    if pos == -1:
        print(f"Value {search_value} not found.")
    else:
        print(f"Value {search_value} found at position {pos}.")

In [None]:
# Note: essential compilation flags for cdp
!nvcc kernal.cu -o run -rdc=true -lcudadevrt
!nvprof ./run float_numbers.txt 1.28

In [None]:
!pwd
