# Semantic Analysis on GPU using cuda

## Step 0: Import necessary libraries

In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.1.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.2.2-py3-none-any.whl.metadata (2.9 kB)
Collecting siphash24>=1.6 (from pytools>=2011.2->pycuda)
  Downloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Downloading pytools-2025.2.2-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.1/98.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64

In [38]:
import pycuda.autoinit
import pycuda.driver as cuda
import pandas as pd
import re
from collections import Counter
from typing import List, Dict, Set
import numpy as np
from pycuda.compiler import SourceModule

In [39]:
# Print the name of the GPU
print(cuda.Device(0).name())

Tesla T4


## Step 1: Dataset Preprocessing on CPU
1. load the csv dataset
2. clean the text
3. tokenize the text
4. buliding the vocabulary with this format {word:index}

In [40]:
# Load CSV and return list of dicts or DataFrame
def load_csv(filename: str) -> pd.DataFrame:
    """
    Load a CSV file with columns:
    - review: text of the review
    - sentiment: positive/negative
    Returns a pandas DataFrame.
    """
    df = pd.read_csv(filename)
    # Remove empty reviews if any
    df = df.dropna(subset=["review"])
    return df

In [41]:
# Clean text: lowercase, remove HTML tags, numbers, punctuation
def clean_text(text: str) -> str:
    text = text.lower()                       # Convert to lowercase
    text = re.sub(r"<[^>]+>", " ", text)      # Remove HTML tags
    text = re.sub(r"\d+", " ", text)          # Remove numbers
    text = re.sub(r"[^\w\s]", " ", text)      # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

In [42]:
# Tokenize text: split by space
def tokenize(text: str) -> List[str]:
    return text.split()

In [43]:
# Build vocabulary: all unique words across all reviews, minus stopwords
def build_vocabulary(reviews: List[str], stopwords: Set[str]) -> Dict[str, int]:
    """
    reviews: list of cleaned review texts
    stopwords: set of words to ignore
    Returns: {word: index}
    """
    vocab_set = set()
    for review in reviews:
        tokens = tokenize(review)
        for word in tokens:
            if len(word) >= 2 and word not in stopwords:
                vocab_set.add(word)
    # Assign index to each word
    vocab = {word: idx for idx, word in enumerate(sorted(vocab_set))}
    return vocab

## Step 2: Transform Text into Numeric Data on CPU
1. preprocess dataset using step 1 funtions
2. convert text into numeric arrays

In [44]:
# Convert a review to a numeric array
def reviews_to_token_indices(
    reviews: List[str],
    vocab: Dict[str, int]
) -> List[np.ndarray]:
    """
    Convert each cleaned review to an array of token indices.
    """
    token_arrays = []
    for text in reviews:
        tokens = tokenize(text)
        indices = [vocab[word] for word in tokens if word in vocab]
        token_arrays.append(np.array(indices, dtype=np.int32))
    return token_arrays

In [45]:
# Load the CSV dataset
df = load_csv('./Minimal_IMDB_Dataset.csv')
# df = load_csv('./Original_IMDB_Dataset.csv')
# df = load_csv('/content/drive/MyDrive/Original_IMDB_Dataset.csv')


# Clean all reviews
df["cleaned"] = df["review"].apply(clean_text)

# Prepare your stopwords set
stopwords = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very", "can", "will", "just", "don", "should", "now"
}

# Build the vocab
vocab = build_vocabulary(df["cleaned"].tolist(), stopwords)

In [46]:
print(df.columns)

print(df["review"])

Index(['review', 'sentiment', 'cleaned'], dtype='object')
0     One of the other reviewers has mentioned that ...
1     A wonderful little production. <br /><br />The...
2     I thought this was a wonderful way to spend ti...
3     Basically there's a family where a little boy ...
4     Petter Mattei's "Love in the Time of Money" is...
                            ...                        
94    I watched this series out of curiosity,wanting...
95    Daniel Day-Lewis is the most versatile actor a...
96    My guess would be this was originally going to...
97    Well, I like to watch bad horror B-Movies, cau...
98    This IS the worst movie I have ever seen, as w...
Name: review, Length: 99, dtype: object


In [47]:
# Convert labels to 0/1
y = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0).to_numpy()

# Convert cleaned reviews to token index arrays
X_token_indices = reviews_to_token_indices(df["cleaned"].tolist(), vocab)

In [48]:
print(X_token_indices)
print(y)

[array([2763, 3322, 2479, 4366, 2815, 1286, 2317, 1869, 3341, 1323, 1757,
       1476, 4009, 3825, 2815,  509, 4205, 3447, 4307, 3522, 3341, 4453,
       1648, 4150, 3572, 1389, 1793, 4052, 3572, 3098, 3101, 3222, 1161,
       3529, 4307, 1764,  683, 4251, 4453,  550, 2815, 2673, 1629, 2788,
       2444, 3483, 3766, 2880, 1504, 2384, 1232,  676, 1351, 3482, 3045,
        609, 1636, 1560, 1376, 2065, 3047, 1825,   84, 1229,  676, 1856,
       2410,  229, 2603, 1588, 2225,  661, 2079, 2067, 3472,  968, 3758,
       1120,  965, 3535,   92, 2661, 1407,  286, 4474, 3433, 2382,  190,
       3572, 1170, 1379, 1653, 3578, 4475,  942, 1521, 3034, 2927, 2827,
       2385,  266, 1521,  635, 1521, 3369, 2815, 1121, 2483,  216, 1476,
       1286, 1310, 3432, 3825, 2626, 3893,  831, 3433, 3180, 4365, 1037,
       3947, 2815, 1668,   32, 1825, 2272, 1687, 4307, 4307, 2009,  890,
       1714, 2317, 3671, 2672, 2011, 2317, 2170, 2779, 1615,  286, 4390,
       2408, 2493,  682, 2011, 4159, 3045,  392, 1

## Step 3: Compute TF (Term Frequencies) on GPU
1. CUDA kernel to count token frequencies.
2. Allocate GPU memory
3. Transfer token arrays to GPU
4. Launche the kernel
5. Copies results back to CPU

In [49]:
# Write a CUDA kernel to count token frequencies
mod = SourceModule("""
__global__ void compute_tf(int *tokens, int *counts, int num_tokens, int vocab_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_tokens)
        return;

    int token = tokens[idx];
    if (token >= 0 && token < vocab_size) {
        atomicAdd(&counts[token], 1);
    }
}
""")

In [50]:
def compute_tf(tokens: np.ndarray, vocab_size: int) -> np.ndarray:
    """
    Compute term frequencies for one review.
    tokens: array of token indices (int32)
    vocab_size: size of vocab
    Returns: frequency array (int32)
    """
    # Prepare output array
    counts_host = np.zeros(vocab_size, dtype=np.int32)

    # Allocate GPU memory
    tokens_gpu = cuda.mem_alloc(tokens.nbytes)
    counts_gpu = cuda.mem_alloc(counts_host.nbytes)

    # Copy tokens to GPU
    cuda.memcpy_htod(tokens_gpu, tokens)
    cuda.memcpy_htod(counts_gpu, counts_host)

    # Kernel function
    kernel = mod.get_function("compute_tf")

    # Launch kernel
    block_size = 256
    grid_size = int((tokens.size + block_size - 1) / block_size)

    kernel(
        tokens_gpu,
        counts_gpu,
        np.int32(tokens.size),
        np.int32(vocab_size),
        block=(block_size,1,1),
        grid=(grid_size,1)
    )

    # Copy results back
    cuda.memcpy_dtoh(counts_host, counts_gpu)

    return counts_host


In [54]:
import time

start_time = time.time()  # شروع زمان‌گیری

tf_vectors = []
print("X_token_indices size:",X_token_indices.count)

for tokens in X_token_indices:
    tf = compute_tf(tokens, vocab_size=len(vocab))
    tf_vectors.append(tf)

end_time = time.time()  # پایان زمان‌گیری

print(f"Total processing time: {end_time - start_time:.2f} seconds")

X_token_indices size: <built-in method count of list object at 0x7c4660cddac0>
Total processing time: 0.01 seconds


In [55]:
print(tf_vectors)

[array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([0,

## Step 4: Compute IDF (Inverse Document Frequencies) on GPU
1. Prepare the stacked TF matrix
2. Allocate GPU memory for input and output
3. CUDA kernel to count and calculate idf

In [56]:
all_tf = np.stack(tf_vectors)
num_docs, vocab_size = all_tf.shape
flat_tf = all_tf.flatten().astype(np.int32)

In [57]:
mod = SourceModule("""
__global__ void compute_idf(int *tf_matrix, float *idf, int num_docs, int vocab_size) {
    int word_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (word_idx >= vocab_size)
        return;

    int doc_count = 0;
    for (int doc = 0; doc < num_docs; ++doc) {
        int tf = tf_matrix[doc * vocab_size + word_idx];
        if (tf > 0) {
            doc_count++;
        }
    }

    idf[word_idx] = logf((float)num_docs / (1.0f + doc_count));
}
""")

In [58]:
def compute_idf(tf_matrix: np.ndarray) -> np.ndarray:
    """
    tf_matrix: 2D array shape (num_docs, vocab_size), int32
    Returns: IDF vector shape (vocab_size,), float32
    """
    num_docs, vocab_size = tf_matrix.shape
    flat_tf = tf_matrix.flatten()

    # Allocate memory
    tf_gpu = cuda.mem_alloc(flat_tf.nbytes)
    idf_gpu = cuda.mem_alloc(vocab_size * np.float32().nbytes)

    # Copy input
    cuda.memcpy_htod(tf_gpu, flat_tf)

    # Prepare output array
    idf_host = np.zeros(vocab_size, dtype=np.float32)

    # Kernel
    kernel = mod.get_function("compute_idf")

    # Launch kernel
    block_size = 256
    grid_size = int((vocab_size + block_size - 1) / block_size)

    kernel(
        tf_gpu,
        idf_gpu,
        np.int32(num_docs),
        np.int32(vocab_size),
        block=(block_size, 1, 1),
        grid=(grid_size, 1)
    )

    # Copy back result
    cuda.memcpy_dtoh(idf_host, idf_gpu)

    return idf_host

  globals().clear()


In [59]:
idf_vector = compute_idf(all_tf)

print("IDF shape:", idf_vector.shape)
print("IDF fist 10 samples:", idf_vector[:10])

IDF shape: (4526,)
IDF fist 10 samples: [3.9019728 3.9019728 3.4965076 3.4965076 3.9019728 3.4965076 3.9019728
 3.9019728 2.5156784 3.9019728]


## Step 5: Compute TF-IDF Vectors on GPU

In [60]:
mod = SourceModule("""
__global__ void compute_tfidf(
    int *tf_matrix,
    float *idf,
    float *tfidf,
    int num_docs,
    int vocab_size
) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_size = num_docs * vocab_size;

    if (global_idx >= total_size)
        return;

    int word_idx = global_idx % vocab_size;

    tfidf[global_idx] = tf_matrix[global_idx] * idf[word_idx];
}
""")

In [61]:
def compute_tfidf(tf_matrix: np.ndarray, idf: np.ndarray) -> np.ndarray:
    """
    tf_matrix: shape (num_docs, vocab_size), int32
    idf: shape (vocab_size,), float32
    Returns:
      tfidf_matrix: shape (num_docs, vocab_size), float32
    """
    num_docs, vocab_size = tf_matrix.shape
    flat_tf = tf_matrix.flatten()

    # Allocate input memory
    tf_gpu = cuda.mem_alloc(flat_tf.nbytes)
    idf_gpu = cuda.mem_alloc(idf.nbytes)

    # Prepare output memory
    tfidf_host = np.zeros(flat_tf.size, dtype=np.float32)
    tfidf_gpu = cuda.mem_alloc(tfidf_host.nbytes)

    # Copy inputs
    cuda.memcpy_htod(tf_gpu, flat_tf)
    cuda.memcpy_htod(idf_gpu, idf)

    # Get kernel
    kernel = mod.get_function("compute_tfidf")

    # Launch
    block_size = 256
    total_size = flat_tf.size
    grid_size = (total_size + block_size - 1) // block_size

    kernel(
        tf_gpu,
        idf_gpu,
        tfidf_gpu,
        np.int32(num_docs),
        np.int32(vocab_size),
        block=(block_size,1,1),
        grid=(grid_size,1)
    )

    # Copy back result
    cuda.memcpy_dtoh(tfidf_host, tfidf_gpu)

    # Reshape to (num_docs, vocab_size)
    tfidf_matrix = tfidf_host.reshape((num_docs, vocab_size))

    return tfidf_matrix

In [62]:
tfidf_matrix = compute_tfidf(all_tf, idf_vector)

print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Sample row:", tfidf_matrix[0, :10])

TF-IDF matrix shape: (99, 4526)
Sample row: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Step 6: Train Logistic Regression Model on GPU

In [63]:
from pycuda.compiler import SourceModule

mod = SourceModule("""

__device__ float sigmoid(float z) {
    return 1.0f / (1.0f + expf(-z));
}

__global__ void train_logistic_regression(
    float *X,     // TF-IDF matrix, flattened
    int *y,       // labels
    float *weights,
    int num_docs,
    int vocab_size,
    float lr
) {
    int doc_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (doc_idx >= num_docs)
        return;

    // Compute dot product w·x
    float z = 0.0f;
    for (int j = 0; j < vocab_size; ++j) {
        z += weights[j] * X[doc_idx * vocab_size + j];
    }

    float pred = sigmoid(z);
    float error = (float)(y[doc_idx]) - pred;

    // Update weights
    for (int j = 0; j < vocab_size; ++j) {
        float grad = error * X[doc_idx * vocab_size + j];
        atomicAdd(&weights[j], lr * grad);
    }
}
""")

In [64]:
def train_logistic_regression(
    X: np.ndarray,
    y: np.ndarray,
    epochs=10,
    lr=0.01
) -> np.ndarray:
    """
    X: (num_docs, vocab_size), float32
    y: (num_docs,), int32
    Returns: weights vector (vocab_size,)
    """
    num_docs, vocab_size = X.shape
    flat_X = X.flatten()

    # Allocate device memory
    X_gpu = cuda.mem_alloc(flat_X.nbytes)
    y_gpu = cuda.mem_alloc(y.nbytes)
    weights_gpu = cuda.mem_alloc(vocab_size * np.float32().nbytes)

    # Initialize weights to zero
    weights_host = np.zeros(vocab_size, dtype=np.float32)

    # Copy inputs
    cuda.memcpy_htod(X_gpu, flat_X)
    cuda.memcpy_htod(y_gpu, y)
    cuda.memcpy_htod(weights_gpu, weights_host)

    # Get kernel
    kernel = mod.get_function("train_logistic_regression")

    # Launch configuration
    block_size = 128
    grid_size = (num_docs + block_size - 1) // block_size

    # Run epochs
    for epoch in range(epochs):
        kernel(
            X_gpu,
            y_gpu,
            weights_gpu,
            np.int32(num_docs),
            np.int32(vocab_size),
            np.float32(lr),
            block=(block_size,1,1),
            grid=(grid_size,1)
        )

    # Copy back weights
    cuda.memcpy_dtoh(weights_host, weights_gpu)

    return weights_host

  globals().clear()


In [65]:
import time

start_time = time.time()  # شروع زمان‌گیری

weights = train_logistic_regression(
    X=tfidf_matrix.astype(np.float32),
    y=y.astype(np.int32),
    epochs=2000,
    lr=0.01
)

end_time = time.time()  # پایان زمان‌گیری

print("Trained weights shape:", weights.shape)
print("Sample weights:", weights[:10])

print(f"Total processing time: {end_time - start_time:.2f} seconds")

Trained weights shape: (4526,)
Sample weights: [ 0.01951642 -0.03250081  0.03496508  0.03496508  0.01950986 -0.07310514
 -0.06583448 -0.02119818  0.09101294  0.01950986]
Total processing time: 2.12 seconds


## Step 7: Evaluate Accuracy


In [66]:
def sigmoid_np(z):
    return 1 / (1 + np.exp(-z))

def predict_probs_np(X: np.ndarray, weights: np.ndarray) -> np.ndarray:
    z = X @ weights  # matrix multiplication
    return sigmoid_np(z)

def predict_labels_np(X: np.ndarray, weights: np.ndarray) -> np.ndarray:
    probs = predict_probs_np(X, weights)
    return (probs >= 0.5).astype(int)

def calculate_accuracy_np(X: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
    preds = predict_labels_np(X, weights)
    accuracy = np.mean(preds == y) * 100.0
    return accuracy

# استفاده:
acc = calculate_accuracy_np(tfidf_matrix.astype(np.float32), y.astype(np.int32), weights)
print(f"Training Accuracy: {acc:.2f}%")


Training Accuracy: 100.00%


## Step 8: Interactive Prediction

In [None]:
def interactive_prediction_np(reviews: List[str], X: np.ndarray, y: np.ndarray, weights: np.ndarray):
    """
    reviews: لیست جملات خام (متن‌های بررسی‌شده)
    X: ماتریس TF-IDF به صورت NumPy (num_docs, vocab_size)
    y: لیبل‌ها به صورت NumPy (num_docs,)
    weights: وزن‌های آموزش‌دیده (vocab_size,)
    """
    while True:
        try:
            idx = int(input(f"Enter review index (0 to {len(reviews) - 1}): "))
            if not (0 <= idx < len(reviews)):
                print("❌ Index out of range.")
                continue

            z = np.dot(X[idx], weights)
            prob = 1 / (1 + np.exp(-z))
            prediction = int(prob >= 0.5)

            print("\nReview:")
            print(reviews[idx])
            print("\nPredicted sentiment:", "positive" if prediction == 1 else "negative")
            print("Actual sentiment   :", "positive" if y[idx] == 1 else "negative")

            if prediction != y[idx]:
                print("⚠️ Wrong prediction!")

            print("-" * 50)

        except Exception as e:
            print("Invalid input:", e)

interactive_prediction_np(df["review"], tfidf_matrix.astype(np.float32), y.astype(np.int32), weights)


Enter review index (0 to 98): 0

Review:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the 