Sequential Text Semantics Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the preprocessed IMDB reviews; later cells use its 'review' and
# 'sentiment' columns.
data = pd.read_csv(filepath_or_buffer='IMDB Dataset Preprocessed - 100K.csv')

In [None]:
# Define the SVM gradient function
def svm_grad(w, X, y, reg):
    """Gradient used for the linear-SVM gradient-descent update.

    Returns ``2 * reg * w`` minus the mean of ``y_i * x_i`` taken over the
    samples whose margin ``y_i * (x_i . w)`` is below 1. When no sample
    violates the margin only the regularisation term is returned (averaging
    an empty selection would otherwise yield NaN and poison the weights).

    Args:
        w: 1-D weight vector with one entry per column of ``X``.
        X: 2-D feature matrix; rows are samples.
        y: Labels, one per row of ``X`` (array-like; a pandas Series is
            accepted and converted with ``np.asarray``).
        reg: L2 regularisation strength.

    Returns:
        A float NumPy array with the same length as ``w``.
    """
    # Convert up front so boolean-mask indexing below is purely positional
    # and does not depend on a pandas index.
    X = np.asarray(X)
    y = np.asarray(y)
    w = np.asarray(w, dtype=float)

    margin = y * np.dot(X, w)  # signed distance of every sample from the hyperplane
    misclassified = margin < 1  # inside the margin or on the wrong side

    # L2 penalty term: discourages large weights.
    grad = reg * 2 * w

    # Hinge term: only margin-violating samples contribute.
    if misclassified.any():
        grad -= np.mean(
            X[misclassified] * y[misclassified, np.newaxis].astype(float),
            axis=0,
        )
    return grad

In [None]:
from sklearn.model_selection import train_test_split
print(data.shape)
# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
reviews = data['review']
sentiments = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Force every review to a Python string before vectorising.
# NOTE(review): presumably guards against non-string entries (e.g. missing
# reviews read as NaN) -- confirm against the CSV.
X_train, X_test = (reviews.astype(str) for reviews in (X_train, X_test))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the reviews into bag-of-words count vectors. English stop words
# ("a", "the", "and", ...) are dropped and only the 1000 most frequent
# remaining terms are KEPT as features.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
train_counts = vectorizer.fit_transform(X_train)  # vocabulary is learned on the training split only
test_counts = vectorizer.transform(X_test)        # test split reuses that vocabulary
# Densify; this cell overwrites X_train/X_test, so it cannot be re-run
# without re-running the split cell first.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

In [None]:
# Encode the sentiment labels as integers: 'positive' -> +1, anything else -> -1.
# This overwrites y_train/y_test, so re-running the cell without re-running
# the split would map every label to -1.
y_train = y_train.eq('positive').astype(int).mul(2).sub(1)
y_test = y_test.eq('positive').astype(int).mul(2).sub(1)

In [None]:
# Start from an all-zero weight vector, one weight per feature column.
n_features = X_train.shape[1]
w = np.zeros(n_features)

In [None]:
# Hyperparameters for the gradient-descent training loop
num_iterations = 1000  # number of full-batch gradient steps
learning_rate = 0.01   # step size applied to each gradient update
reg = 1e-5             # L2 regularisation strength; smaller values penalise large weights less

In [None]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed intervals (time.time() can jump if the system clock is adjusted).
start_time = time.perf_counter()

# Train the SVM with full-batch gradient descent.
# NOTE: `w` is updated in place, so re-running this cell continues training
# from the current weights; re-run the weight-initialisation cell to restart.
for i in range(num_iterations):
    gradient = svm_grad(w, X_train, y_train, reg)
    w -= learning_rate * gradient  # step against the gradient, scaled by the learning rate

end_time = time.perf_counter()

total_time = end_time - start_time

# Average over the iterations actually run rather than a hard-coded 1000,
# so the figure stays correct when num_iterations is changed.
print('Average execution time: {}'.format(total_time / max(num_iterations, 1)))
print('Total time: {}'.format(total_time))

Average execution time: 0.43814740681648257
Total time: 438.14740681648254


In [None]:
# Predict on the test set.
# np.sign would return 0 for a score of exactly zero (e.g. a review with
# none of the vocabulary words), which is neither class label and is then
# always scored as wrong. Ties are assigned to the positive class so every
# prediction is a valid label in {-1, +1}.
scores = np.dot(X_test, w)
y_pred = np.where(scores >= 0, 1, -1)

In [None]:
# Evaluation metrics, treating +1 ('positive') as the positive class.
predicted_positive = y_pred == 1
actual_positive = y_test == 1
true_positive_count = np.sum(predicted_positive & actual_positive)

accuracy = np.mean(y_pred == y_test)                            # fraction of exact label matches
precision = true_positive_count / np.sum(predicted_positive)   # TP / predicted positives
recall = true_positive_count / np.sum(actual_positive)         # TP / actual positives
f1_score = 2 * precision * recall / (precision + recall)       # harmonic mean of the two

In [None]:
# Report each metric on its own line.
for metric_label, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_value)

Accuracy: 0.8364968102701426
Precision: 0.8202594659828029
Recall: 0.8600917431192661
F1-score: 0.8397034977993978


CUDA Text Semantics Analysis

In [None]:
from numba import cuda
import numba
from numba import types
import pandas as pd
import numpy as np

In [None]:
# Read the preprocessed IMDB reviews for the CUDA run.
# NOTE(review): the sequential section reads
# 'IMDB Dataset Preprocessed - 100K.csv' while this cell reads a different
# file, so timings and accuracy of the two sections may not be directly
# comparable -- confirm which dataset is intended.
data = pd.read_csv(filepath_or_buffer='IMDB Dataset Preprocessed.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
reviews = data['review']
sentiments = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Force every review to a Python string before vectorising.
# NOTE(review): presumably guards against non-string entries (e.g. missing
# reviews read as NaN) -- confirm against the CSV.
X_train, X_test = (reviews.astype(str) for reviews in (X_train, X_test))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the reviews into bag-of-words count vectors: English stop words are
# dropped and only the 1000 most frequent remaining terms are kept.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
train_counts = vectorizer.fit_transform(X_train)  # vocabulary is learned on the training split only
test_counts = vectorizer.transform(X_test)        # test split reuses that vocabulary
# Densify and cast to float so the matrices handed to the CUDA code are
# floating point rather than integer counts.
X_train = train_counts.toarray().astype(float)
X_test = test_counts.toarray().astype(float)

In [None]:
# Encode the sentiment labels as integers: 'positive' -> +1, anything else -> -1.
# This overwrites y_train/y_test, so re-running the cell without re-running
# the split would map every label to -1.
y_train = y_train.eq('positive').astype(int).mul(2).sub(1)
y_test = y_test.eq('positive').astype(int).mul(2).sub(1)

In [None]:
# Hyperparameters for the CUDA training run (same values as the sequential run)
num_iterations2 = 1000  # number of full-batch gradient steps
learning_rate2 = 0.01   # step size applied to each gradient update
reg2 = 1e-5             # L2 regularisation strength

# Initial weights: all zeros, one per feature column.
n_features2 = X_train.shape[1]
w2 = np.zeros(n_features2)

In [None]:
import numba
from numba import cuda
@numba.cuda.jit
def svm_grad2_kernel(w, X, y, result, margin):
    """CUDA kernel: per-sample margin and margin-violation flag.

    For every row ``j`` of ``X`` handled by the calling thread, writes
    ``margin[j] = y[j] * sum_k X[j, k] * w[k]`` and
    ``result[j] = margin[j] < 1``.

    Rows are distributed with a grid-stride loop, so any 1-D launch
    configuration covers all rows. No grid-wide synchronisation is performed:
    each thread only touches its own rows and the host synchronises after the
    launch. (A ``this_grid().sync()`` at the end of the kernel would have no
    effect on the result but would force a cooperative launch, which limits
    the number of blocks that can be launched.)

    Args:
        w: 1-D weight vector.
        X: 2-D feature matrix (rows are samples).
        y: 1-D label vector, one entry per row of ``X``.
        result: 1-D output, set to True where the margin is below 1.
        margin: 1-D output receiving the signed margin of each row.
    """
    start = cuda.grid(1)       # threadIdx.x + blockIdx.x * blockDim.x
    stride = cuda.gridsize(1)  # blockDim.x * gridDim.x

    n_features = X.shape[1]
    for j in range(start, X.shape[0], stride):
        # Accumulate in a local variable instead of read-modify-writing
        # margin[j] in device memory on every inner iteration.
        acc = 0.0
        for k in range(n_features):
            acc += X[j, k] * w[k]
        margin[j] = acc * y[j]
        result[j] = margin[j] < 1

@numba.cuda.jit
def svm_grad2_kernel2(w, reg, grad):
    """CUDA kernel: element-wise ``grad[j] += w[j]``.

    The caller is expected to pass the already-scaled regularisation term as
    ``w``; ``reg`` is accepted for signature compatibility but is not read.
    ``grad`` must be initialised by the caller because this kernel
    accumulates into it.

    Elements are distributed with a grid-stride loop. No grid-wide
    synchronisation is needed (each thread touches only its own elements),
    so the kernel does not require a cooperative launch.

    Args:
        w: 1-D vector added into ``grad``; at least ``len(grad)`` long.
        reg: Unused.
        grad: 1-D vector updated in place.
    """
    start = cuda.grid(1)       # threadIdx.x + blockIdx.x * blockDim.x
    stride = cuda.gridsize(1)  # blockDim.x * gridDim.x

    for j in range(start, len(grad), stride):
        grad[j] += w[j]

# NOTE: per the original author's remark this kernel was added after the
# recorded runtime had expired, so the recorded timings may predate it.
@numba.cuda.jit
def svm_grad2_kernel3(grad, mean):
    """CUDA kernel: element-wise ``grad[j] -= mean[j]``.

    Elements are distributed with a grid-stride loop. No grid-wide
    synchronisation is needed (each thread touches only its own elements),
    so the kernel does not require a cooperative launch.

    Args:
        grad: 1-D vector updated in place.
        mean: 1-D vector subtracted from ``grad``; at least ``len(grad)`` long.
    """
    start = cuda.grid(1)       # threadIdx.x + blockIdx.x * blockDim.x
    stride = cuda.gridsize(1)  # blockDim.x * gridDim.x

    for j in range(start, len(grad), stride):
        grad[j] -= mean[j]

def svm_grad2(w, X, y, reg):
    """GPU-assisted version of the SVM gradient used by the training loop.

    Computes ``2 * reg * w`` minus the mean of ``y_i * x_i`` over the samples
    whose margin ``y_i * (x_i . w)`` is below 1. The per-sample margins and
    the element-wise gradient updates run in the CUDA kernels defined in this
    cell; the mean over the violating rows is computed on the host with NumPy.

    Args:
        w: 1-D weight vector with one entry per column of ``X``.
        X: 2-D feature matrix (rows are samples).
        y: Labels, one per row of ``X`` (array-like; converted with
            ``np.asarray`` so a pandas Series is accepted).
        reg: L2 regularisation strength (scalar).

    Returns:
        A float32 NumPy array with the same length as ``w``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    w = np.asarray(w, dtype=np.float64)
    n_samples = len(X)
    n_features = len(w)
    threadsperblock = 128

    # NOTE: X and y are uploaded on every call; a caller that iterates many
    # times pays this transfer each iteration.
    d_w = cuda.to_device(w)
    d_X = cuda.to_device(X)
    d_y = cuda.to_device(y)
    # Output-only buffers: allocate directly on the device instead of
    # uploading uninitialised host arrays.
    d_result = cuda.device_array((n_samples,), dtype=np.bool_)
    d_margin = cuda.device_array((n_samples,), dtype=np.float32)

    # Step 1: flag the margin-violating samples.
    blockspergrid = (n_samples + threadsperblock - 1) // threadsperblock
    svm_grad2_kernel[blockspergrid, threadsperblock](d_w, d_X, d_y, d_result, d_margin)
    result = d_result.copy_to_host()  # blocks until the kernel has finished

    # Step 2: regularisation term. The gradient buffer MUST start at zero
    # because the kernel accumulates into it with `+=`; an uninitialised
    # (np.empty) buffer would add arbitrary memory contents to the gradient.
    d_grad = cuda.to_device(np.zeros(n_features, dtype=np.float32))
    d_Wnew = cuda.to_device(reg * 2.0 * w)
    blockspergrid2 = (n_features + threadsperblock - 1) // threadsperblock
    svm_grad2_kernel2[blockspergrid2, threadsperblock](d_Wnew, reg, d_grad)

    # Step 3: hinge term. Skip it when nothing violates the margin, since the
    # mean of an empty selection is NaN and would poison the weights.
    if result.any():
        mean = np.mean(X[result] * y[result, np.newaxis].astype(float), axis=0)
        d_mean = cuda.to_device(mean)
        blockspergrid3 = (len(mean) + threadsperblock - 1) // threadsperblock
        svm_grad2_kernel3[blockspergrid3, threadsperblock](d_grad, d_mean)

    # Single copy back at the end; kernels on the default stream run in
    # launch order, so this sees the result of both updates.
    return d_grad.copy_to_host()

In [None]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed intervals (time.time() can jump if the system clock is adjusted).
start_time = time.perf_counter()
# Train the SVM model using gradient descent.
# NOTE: `w2` is updated in place, so re-running this cell continues training
# from the current weights; re-run the cell that initialises w2 to restart.
for i in range(num_iterations2):
    grad1 = svm_grad2(w2, X_train, y_train, reg2)
    w2 -= learning_rate2 * grad1
end_time = time.perf_counter()

total_time = end_time - start_time

# Average over the iterations actually run rather than a hard-coded 1000,
# so the figure stays correct when num_iterations2 is changed.
print('Average execution time: {}'.format(total_time / max(num_iterations2, 1)))
print('Total time: {}'.format(total_time))



Average execution time: 0.28316290283203127
Total time: 283.16290283203125


In [None]:
# Predict on the test set.
# np.sign would return 0 for a score of exactly zero (e.g. a review with
# none of the vocabulary words), which is neither class label and is then
# always scored as wrong. Ties are assigned to the positive class so every
# prediction is a valid label in {-1, +1}.
scores = np.dot(X_test, w2)
y_pred = np.where(scores >= 0, 1, -1)

In [None]:
# Evaluation metrics, treating +1 ('positive') as the positive class.
predicted_positive = y_pred == 1
actual_positive = y_test == 1
true_positive_count = np.sum(predicted_positive & actual_positive)

accuracy = np.mean(y_pred == y_test)                            # fraction of exact label matches
precision = true_positive_count / np.sum(predicted_positive)   # TP / predicted positives
recall = true_positive_count / np.sum(actual_positive)         # TP / actual positives
f1_score = 2 * precision * recall / (precision + recall)       # harmonic mean of the two

In [None]:
# Print the evaluation metrics.
# Precision, recall and F1 are computed in the previous cell; report them
# too so this section can be compared with the sequential run metric by metric.
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1_score)

Accuracy: 0.7964
