// In [None]:  (Jupyter notebook cell marker left over from export; kept as a comment so the file compiles)
#include <cuda_runtime.h>

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

/**
 * Reports whether `pattern` occurs as a contiguous substring of `word`.
 *
 * Neither buffer needs to be NUL-terminated; only the first `word_len` bytes of
 * `word` and the first `pat_len` bytes of `pattern` are read.
 *
 * Callable from both host and device code, so the same routine can serve as a
 * CPU reference when validating kernel output.
 *
 * @param word      Bytes of the word to search in.
 * @param pattern   Bytes of the pattern to search for.
 * @param word_len  Number of valid bytes in `word`.
 * @param pat_len   Number of valid bytes in `pattern`.
 * @return true when the pattern is found. An empty pattern (pat_len == 0)
 *         matches any word with word_len >= 0. Negative lengths, or a pattern
 *         longer than the word, never match.
 */
__host__ __device__ bool contains_pattern(const char* word, const char* pattern, int word_len, int pat_len) {
    // A negative pat_len would skip the comparison loop entirely and be
    // reported as a match, so reject it (and the "pattern too long" case) here.
    if (pat_len < 0 || word_len < pat_len) return false;

    const int last_start = word_len - pat_len;  // last offset where the pattern still fits
    for (int start = 0; start <= last_start; ++start) {
        int matched = 0;
        while (matched < pat_len && word[start + matched] == pattern[matched]) {
            ++matched;
        }
        if (matched == pat_len) return true;
    }
    return false;
}

/**
 * Kernel: for every word, records whether it contains `pattern` as a substring.
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread walks the word
 * list with a stride equal to the total number of launched threads, so every
 * entry of `match_counts[0 .. num_words)` is written even when the grid has
 * fewer threads than there are words. No shared memory is used.
 *
 * @param words         Array of `num_words` pointers to word bytes; the pointers
 *                      and the bytes they reference must be readable from the device.
 * @param word_lens     Length in bytes of each word (`num_words` entries).
 * @param pattern       Pattern bytes (`pat_len` of them, not NUL-terminated).
 * @param pat_len       Number of bytes in `pattern`.
 * @param match_counts  Output: 1 when the word contains the pattern, otherwise 0.
 * @param num_words     Number of words to examine; values <= 0 make the kernel a no-op.
 */
__global__ void pattern_match(const char* const* words, const int* word_lens, const char* pattern, int pat_len, int* match_counts, int num_words) {
    if (num_words <= 0) return;

    // 64-bit indices: blockIdx.x * blockDim.x can exceed the range of int on large grids.
    const size_t total = static_cast<size_t>(num_words);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < total; idx += stride) {
        const char* word = words[idx];
        const int len = word_lens[idx];
        match_counts[idx] = contains_pattern(word, pattern, len, pat_len) ? 1 : 0;
    }
}

/**
 * Splits `text` into maximal runs of alphanumeric characters.
 *
 * Every non-alphanumeric byte acts as a separator and is discarded; consecutive
 * separators never produce empty words. Classification uses `isalnum`, i.e. the
 * current C locale.
 *
 * @param text  Input text to tokenize.
 * @return The words in their order of appearance (empty vector when there are none).
 */
vector<string> split_words(const string& text) {
    vector<string> words;
    string current;

    // Emits the word collected so far, if any, and starts a fresh one.
    auto flush = [&words, &current]() {
        if (!current.empty()) {
            words.push_back(current);
            current.clear();
        }
    };

    for (char c : text) {
        // isalnum() is undefined for negative arguments other than EOF, and plain
        // char may be signed, so bytes >= 0x80 must go through unsigned char first.
        if (isalnum(static_cast<unsigned char>(c))) {
            current += c;
        } else {
            flush();
        }
    }
    flush();  // the text may end in the middle of a word
    return words;
}

// Terminates the program with a diagnostic when a CUDA runtime call has failed.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Counts how many words of a built-in sample text contain a given substring,
 * evaluating the words on the GPU.
 *
 * Usage: ./pattern <num_threads> <%pattern%>
 *   num_threads  Positive integer; upper bound on the number of words handed to
 *                a single kernel launch. Larger texts are processed in several
 *                launches so that every word is counted.
 *   %pattern%    Substring to look for, wrapped in '%' characters. "%%" denotes
 *                the empty pattern, which matches every word.
 *
 * Returns 0 on success and 1 on invalid arguments; exits with EXIT_FAILURE when
 * a CUDA call fails.
 */
int main(int argc, char** argv) {
    if (argc != 3) {
        cerr << "Usage: ./pattern <num_threads> <%pattern%>" << endl;
        return 1;
    }

    // stoi throws on non-numeric or out-of-range input; report that as a usage error.
    int num_threads = 0;
    try {
        num_threads = stoi(argv[1]);
    } catch (const exception&) {
        cerr << "Error: <num_threads> must be an integer, got \"" << argv[1] << "\"" << endl;
        return 1;
    }
    if (num_threads <= 0) {
        cerr << "Error: <num_threads> must be positive, got " << num_threads << endl;
        return 1;
    }

    const string pattern_raw = argv[2];
    if (pattern_raw.length() < 2 || pattern_raw.front() != '%' || pattern_raw.back() != '%') {
        cerr << "Error: pattern must be wrapped in '%' characters, e.g. %ex%" << endl;
        return 1;
    }
    const string pattern = pattern_raw.substr(1, pattern_raw.length() - 2); // remove %
    const int pat_len = static_cast<int>(pattern.length());

    // Dummy long input text
    const string input_text = "example text with many words like exampleword and hex and x-ray and axis etc xenomorph explanation";

    const vector<string> words = split_words(input_text);
    const int num_words = static_cast<int>(words.size());

    int total_matches = 0;
    if (num_words > 0) {
        size_t total_chars = 0;
        for (const string& w : words) total_chars += w.length();

        char** d_words = nullptr;
        int* d_word_lens = nullptr;
        int* d_match_counts = nullptr;
        char* d_pattern = nullptr;
        char* d_text = nullptr;  // all word bytes packed back to back

        check_cuda(cudaMallocManaged(&d_words, static_cast<size_t>(num_words) * sizeof(char*)), "allocating word pointers");
        check_cuda(cudaMallocManaged(&d_word_lens, static_cast<size_t>(num_words) * sizeof(int)), "allocating word lengths");
        check_cuda(cudaMallocManaged(&d_match_counts, static_cast<size_t>(num_words) * sizeof(int)), "allocating match flags");
        // Zero-byte managed allocations are rejected, so always request at least one byte.
        check_cuda(cudaMallocManaged(&d_pattern, pattern.empty() ? 1 : pattern.length()), "allocating pattern");
        check_cuda(cudaMallocManaged(&d_text, total_chars == 0 ? 1 : total_chars), "allocating word text");

        memcpy(d_pattern, pattern.data(), pattern.length());

        // One packed buffer instead of one managed allocation per word; the
        // kernel still receives an array of per-word pointers.
        size_t offset = 0;
        for (int i = 0; i < num_words; ++i) {
            const size_t len = words[i].length();
            memcpy(d_text + offset, words[i].data(), len);
            d_words[i] = d_text + offset;
            d_word_lens[i] = static_cast<int>(len);
            offset += len;
        }

        // Hand at most num_threads words to each launch so no word is dropped
        // when the requested thread count is smaller than the word count.
        const int threadsPerBlock = 256;
        const int batch = num_threads < num_words ? num_threads : num_words;
        for (int first = 0; first < num_words; first += batch) {
            const int remaining = num_words - first;
            const int count = remaining < batch ? remaining : batch;
            const int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;

            pattern_match<<<blocks, threadsPerBlock>>>(d_words + first, d_word_lens + first, d_pattern, pat_len, d_match_counts + first, count);
            check_cuda(cudaGetLastError(), "launching pattern_match");
        }
        // Kernel faults only surface at a synchronizing call; the host must also
        // wait here before reading the managed result buffer.
        check_cuda(cudaDeviceSynchronize(), "running pattern_match");

        for (int i = 0; i < num_words; ++i)
            total_matches += d_match_counts[i];

        check_cuda(cudaFree(d_text), "freeing word text");
        check_cuda(cudaFree(d_pattern), "freeing pattern");
        check_cuda(cudaFree(d_match_counts), "freeing match flags");
        check_cuda(cudaFree(d_word_lens), "freeing word lengths");
        check_cuda(cudaFree(d_words), "freeing word pointers");
    }

    cout << "Total matches for pattern \"" << pattern_raw << "\": " << total_matches << endl;

    return 0;
}
