# Download dataset

In [1]:
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import imdb

# Load the IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=20000)

# Decode back to text
word_index = imdb.get_word_index()
index_word = {index: word for word, index in word_index.items()}

def decode_review(encoded_review):
    return ' '.join([index_word.get(i - 3, '?') for i in encoded_review])

# Convert integer sequences back to text
X_train_text = [decode_review(review) for review in X_train]
X_test_text = [decode_review(review) for review in X_test]

# Tokenize the training data (split words)
tokenized_train = [review.split() for review in X_train_text]

2025-11-24 12:10:07.931916: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-24 12:10:07.978360: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-24 12:10:08.848029: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Augment dataset

In [None]:
# Augment training data by replacing words with their most similar words
import pickle
from tqdm import tqdm
from tools import Tools
import os

def precompute_alternatives(vectorizer_X, omni_embeddings):
    """Precompute and cache alternatives for all words based on their embeddings and labels."""
    import random
    import numpy as np

    alternatives_cache = {}

    for word, id in tqdm(vectorizer_X.vocabulary_.items(), desc="Precomputing alternatives"):
        embedding = omni_embeddings.get(id, None)
        if embedding is None:
            continue

        # Convert embedding to a NumPy array for faster operations
        embedding = np.array(embedding)

        # Compute alternatives for label 1 (descending order)
        top_indices_label_1 = np.argpartition(-embedding, 400)[:400]  # Get indices of top 400 largest values
        top_indices_label_1 = top_indices_label_1[np.argsort(-embedding[top_indices_label_1])]  # Sort top 400
        top_features_label_1 = [vectorizer_X.get_feature_names_out()[idx] for idx in top_indices_label_1[:5]]
        selected_feature_label_1 = random.choice(top_features_label_1)  # Randomly select one from the top 5

        # Compute alternatives for label 0 (ascending order)
        top_indices_label_0 = np.argpartition(embedding, 400)[:400]  # Get indices of top 400 smallest values
        top_indices_label_0 = top_indices_label_0[np.argsort(embedding[top_indices_label_0])]  # Sort top 400
        top_features_label_0 = [vectorizer_X.get_feature_names_out()[idx] for idx in top_indices_label_0[:5]]
        selected_feature_label_0 = random.choice(top_features_label_0)  # Randomly select one from the top 5

        # Cache the alternatives for both labels
        alternatives_cache[word] = {
            1: selected_feature_label_1,
            0: selected_feature_label_0
        }

    return alternatives_cache

vectorizer_X = Tools.read_pickle_data("vectorizer_X.pickle")
number_of_features = vectorizer_X.get_feature_names_out().shape[0]
feature_names = vectorizer_X.get_feature_names_out()
omni_embeddings = Tools.read_pickle_data("omni_embeddings.pickle")
alternatives_cache_path = 'omni_alternatives_cache.pickle'
if not os.path.exists(alternatives_cache_path):
    print("Precomputing alternatives for all words...")
    alternatives_cache = precompute_alternatives(vectorizer_X, omni_embeddings)
    with open(alternatives_cache_path, 'wb') as f:
        pickle.dump(alternatives_cache, f)
else:
    alternatives_cache = Tools.read_pickle_data(alternatives_cache_path)


def perturb_document(doc, percent, label, alternatives_cache):
    """Augment a document by replacing words with their precomputed alternatives."""
    tokens = doc.split()
    num_words_to_change = len(tokens) * (percent / 100)
    words_changed = 0
    indices_to_change = set(random.sample(range(len(tokens)), int(num_words_to_change)))

    for i in range(len(tokens)):
        if i in indices_to_change:
            original_word = tokens[i]
            if original_word in alternatives_cache and label in alternatives_cache[original_word]:
                augmented_word = alternatives_cache[original_word][label]
                if augmented_word != original_word:
                    tokens[i] = augmented_word
                    words_changed += 1

    return ' '.join(tokens)

percent_to_change = 5
X_train_perturbed = []
for r, doc in enumerate(tqdm(X_train_text)):
    perturbed_doc = perturb_document(doc, percent_to_change, y_train[r], alternatives_cache)
    X_train_perturbed.append(perturbed_doc)
    if r > 5:
        break
with open('perturbed_documents.pickle', 'wb') as f:
    pickle.dump((X_train_perturbed, X_train, y_train, X_test, y_test), f)

100%|██████████| 25000/25000 [00:00<00:00, 39875.59it/s]


In [3]:
with open('perturbed_documents.pickle', 'wb') as f:
    pickle.dump((X_train_perturbed, X_train, y_train, X_test, y_test), f)

# Classify dataset

In [4]:
# Use CountVectorizer to convert text to vectors
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_perturbed)
X_test_vec = vectorizer.transform(X_test_text)

### RandomForestClassifier

In [5]:
# Train the classifier on augmented data
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_vec, y_train)

# Predict and evaluate on original test data
y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8388


### LogisticRegression

In [6]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=42)
classifier.fit(X_train_vec, y_train)

y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (Logistic Regression): {accuracy:.4f}")


Test Accuracy (Logistic Regression): 0.8502


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Naive Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (Naive Bayes): {accuracy:.4f}")

Test Accuracy (Naive Bayes): 0.8169


### SVM

In [8]:
from sklearn.svm import LinearSVC

classifier = LinearSVC(random_state=42)
classifier.fit(X_train_vec, y_train)

y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (SVM): {accuracy:.4f}")

Test Accuracy (SVM): 0.8290


### MLP

In [9]:
from sklearn.neural_network import MLPClassifier

classifier = MLPClassifier(random_state=42)
classifier.fit(X_train_vec, y_train)

y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (MLP): {accuracy:.4f}")

Test Accuracy (MLP): 0.8488


### TM

In [None]:
from tmu.models.classification.vanilla_classifier import TMClassifier
import numpy as np
import pickle

saved = open("perturbed_documents.pickle", "rb")
X_train_perturbed, X_train, y_train, X_test, y_test = pickle.load(saved)
saved.close()

# Convert labels to one-hot encoding for Tsetlin Machine
X_train_tm = np.array(X_train_vec.toarray(), dtype=np.uint32)
Y_train_tm = y_train.astype(np.uint32)

X_test_tm = np.array(X_test_vec.toarray(), dtype=np.uint32)
Y_test_tm = y_test.astype(np.uint32)

# with open('imdb_tm_classifer.pickle', 'wb') as f:
#     pickle.dump((X_train, Y_train, X_test, Y_test), f)
        
# saved = open("imdb_tm_classifer.pickle", "rb")
# X_train, Y_train, X_test, Y_test = pickle.load(saved)
# saved.close()

num_clauses = 1000
T = 8000
s = 2.0
device = "CPU"
weighted_clauses = True
epochs = 10
clause_drop_p = 0.75

print("started")
tm = TMClassifier(num_clauses, T, s, platform=device, weighted_clauses=weighted_clauses,clause_drop_p=clause_drop_p)
for epoch in range(epochs):
    tm.fit(X_train_tm, Y_train_tm)
    result = 100 * (tm.predict(X_test_tm) == Y_test_tm).mean()
    print(f"Accuracy: {result:.2f}")

started
