In [1]:
import re, html
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
def load_unsup(imdb_path = "aclImdb", max_=None):
    texts = []

    path = os.path.join(imdb_path, "train/unsup")
    files = os.listdir(path)
    if max_:
        files = files[:max_]
    for file in files:
        with open(os.path.join(path, file), encoding='utf-8') as f:
            texts.append(f.read())

    return texts

In [3]:
def preprocess(reviews):
    vectorizer = CountVectorizer()

    preprocess_reviews = []
    for review in reviews:
        text = re.sub(r'<[^>]+>', '', review)              # HTML tags
        text = re.sub(r'\b\d+\b', '', html.unescape(text))  # Entities + numbers
        text = re.sub(r'\s+', ' ', re.sub(r'[\W_]+', ' ', text)).strip()  # Symbols + trim
        preprocess_reviews.append(text)

    vectorizer.fit(preprocess_reviews)

    return vectorizer.get_feature_names_out()

In [4]:
def load_data(imdb_path = "aclImdb", split="train", max_per_class=None):
    texts = []
    labels = []
    
    for label in ['pos', 'neg']:
        path = os.path.join(imdb_path, split, label)
        files = os.listdir(path)
        if max_per_class:
            files = files[:max_per_class]
        for file in files:
            with open(os.path.join(path, file), encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(1 if label == 'pos' else 0)
    
    return texts, labels

In [None]:
class SemanticSimilaritiesModel:
    def __init__(self, beta, V, R, b, theta, lambd=0.1, nu=0.1):
        self.beta = beta
        self.V = V
        self.R = R
        self.b = b
        self.theta = theta
        self.lambd = lambd
        self.nu = nu

    def compute_energy(self, w, theta, phi_w, b_w):
        return -np.dot(theta, phi_w) - b_w

    def compute_word_probabilities(self, theta, R, b):
        energy = np.dot(R.T, theta) + b
        energy_max = np.max(energy)
        log_probabilities = energy - (np.log(np.sum(np.exp(energy - energy_max))) + energy_max.squeeze())
        return np.exp(log_probabilities)

    def compute_document_probability(self, document):
        word_indices = [list(self.V).index(word.lower()) for word in document.lower().split() if word.lower() in self.V]
        word_probabilities = [self.compute_word_probabilities(self.theta, self.R[:, i], self.b[i])
                              for i in word_indices]
        return np.prod(word_probabilities)

    def compute_log_likelihood(self, documents):
        log_likelihood = 0.0
        for document in documents:
            log_likelihood += np.sum(np.log(self.compute_document_probability(document)))
        return log_likelihood

    def compute_regularization_term(self):
        return self.nu * np.linalg.norm(self.R) ** 2

    def compute_objective(self, documents):
        log_likelihood = self.compute_log_likelihood(documents)
        regularization_term = self.compute_regularization_term()
        return log_likelihood + regularization_term

    def optimize_parameters(self, documents, batch_size=32, max_iterations=100, learning_rate=0.1):
        num_documents = len(documents)
        num_batches = (num_documents + batch_size - 1) // batch_size

        # Precompute word indices and probabilities for all documents
        word_indices = [[list(self.V).index(word.lower()) for word in document.split() if word.lower() in self.V]
                        for document in documents]
        word_probabilities = [[self.compute_word_probabilities(self.theta, self.R[:, i], self.b[i])
                                for i in indices] for indices in word_indices]

        for iteration in range(max_iterations):
            total_gradient_R = np.zeros((self.beta, len(self.V)))
            total_gradient_b = np.zeros(len(self.V))
            total_gradient_theta = np.zeros(self.beta)

            for batch in range(num_batches):
                start_idx = batch * batch_size
                end_idx = min(start_idx + batch_size, num_documents)
                batch_documents = documents[start_idx:end_idx]
                batch_word_indices = word_indices[start_idx:end_idx]
                batch_word_probabilities = word_probabilities[start_idx:end_idx]

                gradient_R = np.zeros((self.beta, len(self.V)))
                gradient_b = np.zeros(len(self.V))
                gradient_theta = np.zeros(self.beta)

                for doc_indices, doc_probs in zip(batch_word_indices, batch_word_probabilities):
                    unique_doc_indices = np.unique(doc_indices)
                    gradients = self.compute_gradients(unique_doc_indices, doc_probs)
                    mask = np.zeros((self.beta, len(self.V)))
                    mask[:, unique_doc_indices] = 1
                    gradient_R += gradients[0] * mask
                    gradient_b += gradients[1] * mask[0]
                    gradient_theta += gradients[2]


                total_gradient_R += gradient_R
                total_gradient_b += gradient_b
                total_gradient_theta += gradient_theta
                print("batch: ",batch)

            self.R -= learning_rate * (total_gradient_R + 2 * self.nu * self.R)
            self.b -= learning_rate * total_gradient_b
            self.theta -= learning_rate * (total_gradient_theta + 2 * self.lambd * self.theta)

            objective = self.compute_objective(documents)
            print(f"Iteration {iteration+1}: Objective = {objective}")

    def compute_gradients(self, word_indices, word_probabilities):
        gradient_R = np.zeros((self.beta, len(self.V)))
        gradient_b = np.zeros(len(self.V))
        

        gradient_theta = np.zeros(self.beta)

        for indices, prob_w in zip(word_indices, word_probabilities):
            phi_w = self.R[:, indices]
            b_w = self.b[indices]
            gradient_R[:, indices] += (prob_w - 1) * self.theta
            gradient_b[indices] += prob_w - 1
            gradient_theta += prob_w * phi_w
        return gradient_R, gradient_b, gradient_theta


    def compute_document_similarity(self, document1, document2):
        representation1 = self.compute_document_representation(document1)
        representation2 = self.compute_document_representation(document2)
        similarity = np.dot(representation1, representation2) / (np.linalg.norm(representation1) * np.linalg.norm(representation2))
        return similarity
    
  


    def compute_document_representation(self, document):
        word_indices = [list(self.V).index(word.lower()) for word in document.split() if word.lower() in self.V]
        document_representation = np.zeros(self.beta)
        for word_index in word_indices:
            phi_w = self.R[:, word_index]
            b_w = self.b[word_index]
            word_probabilities = self.compute_word_probabilities(self.theta, phi_w, b_w)
            document_representation += word_probabilities * phi_w
        return document_representation

In [6]:
def predict_sentiment(review, model):
    words = review.split()
    probabilities = []
    for word in words:
        try:
            word_index = list(model.V).index(word)
            phi_w = model.R[:, word_index]
            b_w = model.b[word_index]
            probability = model.compute_word_probabilities(model.theta, phi_w, b_w)
            probabilities.append(probability)
        except ValueError:
            # Handle out-of-vocabulary words
            probabilities.append(0.5)  # Assume equal probability for unknown words

    average_probability = np.mean(probabilities)
    if average_probability >= 0.5:
        return 1
    else:
        return 0

In [7]:
unsup = load_unsup(max_=1000)
V = preprocess(unsup)
beta = 100 # Dimension
R = np.random.randn(beta, len(V))
b = np.random.randn(len(V))
theta = np.random.randn(beta)

model = SemanticSimilaritiesModel(beta, V, R, b, theta)
model.optimize_parameters(unsup, batch_size=32, max_iterations=3, learning_rate=0.1)


batch:  0
batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  21
batch:  22
batch:  23
batch:  24
batch:  25
batch:  26
batch:  27
batch:  28
batch:  29
batch:  30
batch:  31
Iteration 1: Objective = 150611.9437886272
batch:  0
batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  21
batch:  22
batch:  23
batch:  24
batch:  25
batch:  26
batch:  27
batch:  28
batch:  29
batch:  30
batch:  31
Iteration 2: Objective = 144647.71081459738
batch:  0
batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  

In [9]:
texts_test, labels_test = load_data(split="test", max_per_class=1000)

predictions = [predict_sentiment(text, model) for text in texts_test]

accuracy = accuracy_score(labels_test, predictions)
print('Accuracy:', accuracy)

confusion = confusion_matrix(labels_test, predictions)
print('Confusion Matrix:')
print(confusion)

classification = classification_report(labels_test, predictions)
print('Classification Report:')
print(classification)

Accuracy: 0.5
Confusion Matrix:
[[   0 1000]
 [   0 1000]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1000
           1       0.50      1.00      0.67      1000

    accuracy                           0.50      2000
   macro avg       0.25      0.50      0.33      2000
weighted avg       0.25      0.50      0.33      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
model.compute_document_similarity("I like dogs", "I like cats")

0.4663831626886387

In [13]:
model.compute_document_similarity("I like dogs", "I love dogs")

0.4771806209824731

In [14]:
model.compute_document_similarity("I like dogs", "I hate dogs")

0.49200910253178176

In [15]:
model.compute_document_similarity("I like dogs", "I 'm kevin")

-0.16513807225002863