# Sentiment Analysis on GPU using CUDA

## Step 0: Import necessary libraries

In [14]:
import pycuda.autoinit
import pycuda.driver as cuda
import pandas as pd
import re
from collections import Counter
from typing import List, Dict, Set
import numpy as np
from pycuda.compiler import SourceModule

In [3]:
# Show the name reported by CUDA device 0
first_device = cuda.Device(0)
print(first_device.name())

NVIDIA GeForce GTX 1660 Ti


## Step 1: Dataset Preprocessing on CPU
1. load the csv dataset
2. clean the text
3. tokenize the text
4. build the vocabulary in the format {word: index}

In [4]:
# Load the CSV into a pandas DataFrame, dropping rows without a review
def load_csv(filename: str) -> pd.DataFrame:
    """
    Read the review dataset from a CSV file.

    The file is expected to provide at least these columns:
    - review: text of the review
    - sentiment: positive/negative

    Rows whose "review" value is missing are dropped; the remaining rows
    keep their original index labels.

    Returns a pandas DataFrame.
    """
    return pd.read_csv(filename).dropna(subset=["review"])

In [5]:
# Clean text: lowercase, remove HTML tags, numbers, punctuation
def clean_text(text: str) -> str:
    """
    Normalise a raw review into lowercase words separated by single spaces.

    Steps, in order: lowercase, replace HTML tags, digit runs and
    punctuation with spaces, then collapse whitespace and trim the ends.

    text: raw review text
    Returns: the cleaned text (may be an empty string)
    """
    text = text.lower()                       # Convert to lowercase
    text = re.sub(r"<[^>]+>", " ", text)      # Remove HTML tags
    text = re.sub(r"\d+", " ", text)          # Remove numbers
    # Remove punctuation. "_" is listed explicitly because \w matches it,
    # so [^\w\s] alone would leave underscores inside the tokens.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

In [6]:
# Tokenize text: split by space
def tokenize(text: str) -> List[str]:
    """
    Split text into tokens on runs of whitespace.

    Leading and trailing whitespace produce no empty tokens; an empty or
    whitespace-only string yields an empty list.
    """
    words = text.split()
    return words

In [7]:
# Build vocabulary: all unique words of at least 2 characters across all reviews, minus stopwords
def build_vocabulary(reviews: List[str], stopwords: Set[str]) -> Dict[str, int]:
    """
    Collect the distinct words used across the reviews and number them.

    reviews: list of cleaned review texts
    stopwords: set of words to ignore
    Words shorter than two characters are ignored as well.
    Returns: {word: index}, with indices following the sorted word order
    """
    kept = set()
    for text in reviews:
        kept.update(
            token
            for token in tokenize(text)
            if len(token) > 1 and token not in stopwords
        )

    # Sorting first makes the word -> index mapping deterministic
    ordered = sorted(kept)
    return {word: position for position, word in enumerate(ordered)}

## Step 2: Transform Text into Numeric Data on CPU
1. preprocess the dataset using the Step 1 functions
2. convert text into numeric arrays

In [8]:
# Convert each cleaned review to a numeric array of vocabulary indices
def reviews_to_token_indices(
    reviews: List[str],
    vocab: Dict[str, int]
) -> List[np.ndarray]:
    """
    Encode every cleaned review as an int32 array of vocabulary indices.

    Tokens that are not present in vocab are skipped, so a review made up
    only of unknown words becomes an empty array. The output list has one
    array per input review, in the same order.
    """
    return [
        np.array(
            [vocab[token] for token in tokenize(text) if token in vocab],
            dtype=np.int32,
        )
        for text in reviews
    ]

In [9]:
# Read the dataset and attach a cleaned copy of every review
df = load_csv('./Minimal_IMDB_Dataset.csv')
df["cleaned"] = df["review"].apply(clean_text)

# Words that are excluded from the vocabulary
stopwords = {
    # pronouns
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom",
    "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during",
    "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and question words
    "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how",
    # quantifiers and modifiers
    "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # remaining entries
    "can", "will", "just", "don", "should", "now",
}

# Index every remaining word of the cleaned reviews
vocab = build_vocabulary(reviews=df["cleaned"].tolist(), stopwords=stopwords)

In [10]:
# Encode the sentiment labels: "positive" -> 1, anything else -> 0
y = df["sentiment"].apply(lambda label: int(label == "positive")).to_numpy()

# Turn every cleaned review into its array of vocabulary indices
X_token_indices = reviews_to_token_indices(reviews=df["cleaned"].tolist(), vocab=vocab)

In [13]:
# type(X_token_indices)
# Notebook inspection cell: display the Python type of the label array y
type(y)

numpy.ndarray

## Step 3: Compute TF (Term Frequencies) on GPU
1. CUDA kernel to count token frequencies.
2. Allocate GPU memory
3. Transfer token arrays to GPU
4. Launch the kernel
5. Copy the results back to the CPU

In [16]:
# Write a CUDA kernel to count token frequencies
#
# compute_tf(tokens, counts, num_tokens, vocab_size):
# - each thread computes idx = blockIdx.x * blockDim.x + threadIdx.x and
#   returns immediately when idx >= num_tokens, so the launch may contain
#   more threads than tokens;
# - the thread reads tokens[idx] and, when the value lies in
#   [0, vocab_size), increments counts[token] with atomicAdd; values
#   outside that range are ignored;
# - the kernel only adds to counts, so the caller must zero the buffer
#   before the launch.
mod = SourceModule("""
__global__ void compute_tf(int *tokens, int *counts, int num_tokens, int vocab_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_tokens)
        return;

    int token = tokens[idx];
    if (token >= 0 && token < vocab_size) {
        atomicAdd(&counts[token], 1);
    }
}
""")

In [17]:
def compute_tf(tokens: np.ndarray, vocab_size: int) -> np.ndarray:
    """
    Compute term frequencies for one review.

    tokens: array of token indices (converted to contiguous int32 if needed)
    vocab_size: size of vocab
    Returns: frequency array (int32) of length vocab_size, where entry i is
    the number of times index i occurs in tokens. Indices outside
    [0, vocab_size) are ignored by the kernel.

    An empty tokens array (e.g. a review with no in-vocabulary words) or a
    vocab_size of 0 returns the zero vector without touching the GPU.
    """
    # The kernel reads the buffer as 32-bit ints, so enforce dtype and layout
    tokens = np.ascontiguousarray(tokens, dtype=np.int32)

    # Prepare output array
    counts_host = np.zeros(vocab_size, dtype=np.int32)

    # Nothing to count: skip the GPU entirely, because a zero-byte
    # allocation or a zero-block grid is not a valid CUDA request
    if tokens.size == 0 or vocab_size == 0:
        return counts_host

    # Allocate GPU memory
    tokens_gpu = cuda.mem_alloc(tokens.nbytes)
    counts_gpu = None
    try:
        counts_gpu = cuda.mem_alloc(counts_host.nbytes)

        # Copy tokens and the zeroed counters to GPU
        cuda.memcpy_htod(tokens_gpu, tokens)
        cuda.memcpy_htod(counts_gpu, counts_host)

        # Kernel function
        kernel = mod.get_function("compute_tf")

        # Launch kernel: one thread per token, rounded up to whole blocks
        block_size = 256
        grid_size = (tokens.size + block_size - 1) // block_size

        kernel(
            tokens_gpu,
            counts_gpu,
            np.int32(tokens.size),
            np.int32(vocab_size),
            block=(block_size, 1, 1),
            grid=(grid_size, 1)
        )

        # Copy results back
        cuda.memcpy_dtoh(counts_host, counts_gpu)
    finally:
        # Release the device buffers now, even if the launch or a copy fails,
        # instead of waiting for garbage collection
        tokens_gpu.free()
        if counts_gpu is not None:
            counts_gpu.free()

    return counts_host


In [18]:
# One term-frequency vector per review, in dataset order
vocab_size = len(vocab)
tf_vectors = [
    compute_tf(review_tokens, vocab_size=vocab_size)
    for review_tokens in X_token_indices
]

## Step 4: Compute IDF (Inverse Document Frequencies) on GPU

## Step 5: Compute TF-IDF Vectors on GPU

## Step 6: Train Logistic Regression Model on GPU

## Step 7: Evaluate Accuracy

## Step 8: Interactive Prediction