In [None]:
#include <cuda_runtime.h>

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Device helper: naive substring search.
//
// Returns true when the first `pat_len` bytes of `pattern` occur contiguously
// somewhere within the first `word_len` bytes of `word`.
//   - A pattern longer than the word never matches (no start position is tried).
//   - A pattern of length zero matches any word with word_len >= 0, because
//     there is nothing to compare at the first start position.
__device__ bool contains_pattern(const char* word, const char* pattern, int word_len, int pat_len) {
    const int last_start = word_len - pat_len;
    for (int start = 0; start <= last_start; ++start) {
        // Advance while the characters agree; stop at the first mismatch.
        int matched = 0;
        while (matched < pat_len && word[start + matched] == pattern[matched]) {
            ++matched;
        }
        // Every pattern character was consumed -> full match at `start`.
        if (matched >= pat_len) {
            return true;
        }
    }
    return false;
}

// Kernel: one thread per word.
//
// Each thread computes a flat 1D global index from blockIdx.x / blockDim.x /
// threadIdx.x, and for indices below `num_words` writes 1 to
// match_counts[index] if words[index] (of length word_lens[index]) contains
// `pattern` (of length `pat_len`), otherwise 0. Threads whose index is at or
// beyond `num_words` do nothing, so the grid may be rounded up past the data.
__global__ void pattern_match(char** words, int* word_lens, char* pattern, int pat_len, int* match_counts, int num_words) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_words) {
        return;  // tail threads of the last block have no word to examine
    }

    const bool found = contains_pattern(words[tid], pattern, word_lens[tid], pat_len);
    match_counts[tid] = found ? 1 : 0;
}

// Splits `text` into words.
//
// A word is a maximal run of characters that are alphanumeric (per isalnum in
// the current C locale) or '-'. Every other character acts as a separator and
// is discarded. Empty runs are never emitted, so consecutive separators do not
// produce empty words.
//
// Returns the words in the order they appear in `text`.
vector<string> split_words(const string& text) {
    vector<string> words;
    string word;
    for (char c : text) {
        // isalnum takes an int that must be representable as unsigned char (or
        // EOF); passing a negative plain char (any byte >= 0x80 on signed-char
        // platforms, e.g. UTF-8 text) is undefined behaviour, hence the cast.
        if (isalnum(static_cast<unsigned char>(c)) || c == '-') {
            word += c;
        } else if (!word.empty()) {
            words.push_back(word);
            word.clear();
        }
    }
    // Flush the trailing word when the text does not end with a separator.
    if (!word.empty()) words.push_back(word);
    return words;
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` is a short description of the operation, used only in the message.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error while " << what << ": " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

// Entry point.
//
// Usage: ./pattern_cuda <input_file> <%pattern%>
//
// Reads the whole input file, splits it into words with split_words(), and
// counts on the GPU how many words contain the text between the two '%'
// delimiters of the pattern argument. Prints the match count and the elapsed
// time measured with CUDA events around the kernel launch.
//
// Returns 0 on success and 1 on bad arguments or an unreadable input file;
// any CUDA runtime failure aborts via check_cuda().
int main(int argc, char** argv) {
    if (argc != 3) {
        cerr << "Usage: ./pattern_cuda <input_file> <%pattern%>" << endl;
        return 1;
    }

    const string filename = argv[1];
    const string pattern_raw = argv[2];

    // The pattern must be wrapped in '%' on both sides. Validating here also
    // prevents substr(1, ...) from throwing on an empty argument and stops the
    // first/last characters of an unwrapped pattern being silently dropped.
    if (pattern_raw.length() < 2 || pattern_raw.front() != '%' || pattern_raw.back() != '%') {
        cerr << "Pattern must be wrapped in '%' characters, e.g. %abc%" << endl;
        return 1;
    }
    const string pattern = pattern_raw.substr(1, pattern_raw.length() - 2); // remove %

    ifstream file(filename);
    if (!file) {
        cerr << "Failed to open input file." << endl;
        return 1;
    }

    stringstream buffer;
    buffer << file.rdbuf();
    file.close();
    const string text = buffer.str();

    const vector<string> words = split_words(text);
    const int num_words = static_cast<int>(words.size());
    const int pat_len = static_cast<int>(pattern.length());

    int total_matches = 0;
    float milliseconds = 0;

    // With no words there is nothing to search; a zero-block launch and
    // zero-byte managed allocations are invalid, so skip the GPU work entirely.
    if (num_words > 0) {
        // Total bytes needed to pack every word back-to-back (no terminators;
        // lengths are passed separately).
        size_t total_chars = 0;
        for (const string& w : words) total_chars += w.length();

        char** d_words = nullptr;
        int* d_word_lens = nullptr;
        int* d_match_counts = nullptr;
        char* d_text = nullptr;
        char* d_pattern = nullptr;

        // Managed (unified) memory: written by the host below, read by the kernel.
        check_cuda(cudaMallocManaged(&d_words, num_words * sizeof(char*)), "allocating word pointers");
        check_cuda(cudaMallocManaged(&d_word_lens, num_words * sizeof(int)), "allocating word lengths");
        check_cuda(cudaMallocManaged(&d_match_counts, num_words * sizeof(int)), "allocating match counts");
        // One packed buffer for all word characters instead of one allocation
        // per word. Words are never empty, so total_chars > 0 here.
        check_cuda(cudaMallocManaged(&d_text, total_chars * sizeof(char)), "allocating word text");
        // Allocate at least one byte so an empty pattern ("%%") still gets a
        // valid pointer; the kernel never dereferences it when pat_len == 0.
        check_cuda(cudaMallocManaged(&d_pattern, (pat_len > 0 ? pat_len : 1) * sizeof(char)), "allocating pattern");

        // Copy pattern
        if (pat_len > 0) memcpy(d_pattern, pattern.data(), pat_len);

        // Pack the words and point each d_words[i] at its slice of d_text.
        size_t offset = 0;
        for (int i = 0; i < num_words; ++i) {
            const size_t len = words[i].length();
            memcpy(d_text + offset, words[i].data(), len);
            d_words[i] = d_text + offset;
            d_word_lens[i] = static_cast<int>(len);
            offset += len;
        }

        // Timing
        cudaEvent_t start, stop;
        check_cuda(cudaEventCreate(&start), "creating start event");
        check_cuda(cudaEventCreate(&stop), "creating stop event");
        check_cuda(cudaEventRecord(start), "recording start event");

        // Launch kernel: one thread per word, grid rounded up to cover all words.
        const int threadsPerBlock = 256;
        const int blocks = (num_words + threadsPerBlock - 1) / threadsPerBlock;
        pattern_match<<<blocks, threadsPerBlock>>>(d_words, d_word_lens, d_pattern, pat_len, d_match_counts, num_words);
        check_cuda(cudaGetLastError(), "launching pattern_match");

        check_cuda(cudaEventRecord(stop), "recording stop event");
        // Waits for the kernel and the stop event, surfaces any asynchronous
        // execution error, and makes it safe for the host to read managed memory.
        check_cuda(cudaDeviceSynchronize(), "running pattern_match");

        check_cuda(cudaEventElapsedTime(&milliseconds, start, stop), "reading elapsed time");

        // Count matches
        for (int i = 0; i < num_words; ++i)
            total_matches += d_match_counts[i];

        // Cleanup
        check_cuda(cudaEventDestroy(start), "destroying start event");
        check_cuda(cudaEventDestroy(stop), "destroying stop event");
        check_cuda(cudaFree(d_text), "freeing word text");
        check_cuda(cudaFree(d_words), "freeing word pointers");
        check_cuda(cudaFree(d_word_lens), "freeing word lengths");
        check_cuda(cudaFree(d_match_counts), "freeing match counts");
        check_cuda(cudaFree(d_pattern), "freeing pattern");
    }

    cout << "Total matches for pattern \"" << pattern_raw << "\": " << total_matches << endl;
    cout << "Execution time: " << milliseconds / 1000.0 << " seconds" << endl;

    return 0;
}
