In [38]:
from datasets import load_dataset
import numpy as np
from collections import defaultdict
dataset = load_dataset("sst", "default")

In [39]:
#naive bayes
def train_naive_bayes(D, C):
    N_doc = len(D)
    logprior = {}
    loglikelihood = {}
    V = set()
    bigdoc = defaultdict(list)
    word_counts_per_class = defaultdict(lambda: defaultdict(int))

    # Calculate logprior and collect documents for each class
    for c in C:
        D_c = [doc_tokens for doc_tokens, cl in D if cl == c]
        N_c = len(D_c)
        logprior[c] = np.log(N_c / N_doc)
        for doc in D_c:
            bigdoc[c] += doc
            V.update(doc)
            for word in doc:
                word_counts_per_class[word][c] += 1

    V = list(V)

    # Calculate loglikelihood
    for w in V:
        for c in C:
            count_w_c = word_counts_per_class[w][c]
            loglikelihood[(w, c)] = np.log((count_w_c + 1) / (sum(word_counts_per_class[w].values()) + len(V)))

    return logprior, loglikelihood, V

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_scores = {c: logprior[c] for c in C}
    for word in testdoc:
        if word in V:
            for c in C:
                sum_scores[c] += loglikelihood.get((word, c), 0)
    return max(sum_scores, key=sum_scores.get)

In [40]:
classes = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive",
}
def map_dataset(data):
    if data <= 0.2:
        return 0  # very negative
    elif data <= 0.4:
        return 1  # negative
    elif data <= 0.6:
        return 2  # neutral
    elif data <= 0.8:
        return 3  # positive
    else:
        return 4  # very positive
def data_to_tokens(data):
    documents = []
    for entry in data:
        tokens = entry['tokens'].split("|")
        label = map_dataset(entry['label'])
        documents.append((tokens, label))
    return documents

In [41]:
# Prepare training and testing data
train_documents = data_to_tokens(dataset['train'])
test_documents = data_to_tokens(dataset['test'])

In [42]:
# Train Naive Bayes classifier
logprior, loglikelihood, vocab_list = train_naive_bayes(train_documents, [0, 1, 2, 3, 4])

# Test the classifier on the test dataset
correct_predictions = 0
total_predictions = len(test_documents)

for doc_tokens, actual_class in test_documents:
    predicted_class = test_naive_bayes(doc_tokens, logprior, loglikelihood, [0, 1, 2, 3, 4], vocab_list)
    if predicted_class == actual_class:
        correct_predictions += 1

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print("Accuracy:", accuracy)

# Test the classifier on a sample document
random_phrase = "This movie was fantastic! I loved every minute of it."
random_score = np.random.uniform(0, 1)
actual_class = map_dataset(random_score)
predicted_class = test_naive_bayes(random_phrase.split(), logprior, loglikelihood, [0, 1, 2, 3, 4], vocab_list)
print("Random phrase:", random_phrase)
print("Predicted class:", predicted_class)
print("Actual class:", actual_class)

Accuracy: 0.36742081447963804
Random phrase: This movie was fantastic! I loved every minute of it.
Predicted class: 1
Actual class: 3


In [43]:
#comparison with sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Prepare training and testing data
train_texts = [' '.join(tokens) for tokens, _ in train_documents]
test_texts = [' '.join(tokens) for tokens, _ in test_documents]
train_labels = [label for _, label in train_documents]
test_labels = [label for _, label in test_documents]

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB(alpha=1.0))
])

# Train the classifier
pipeline.fit(train_texts, train_labels)

# Predict on the test set
predicted_labels = pipeline.predict(test_texts)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)
print("Accuracy using scikit-learn:", accuracy)


Accuracy using scikit-learn: 0.4090497737556561


In [44]:
#logistic regression
# Feature representation
def generate_bigrams_from_dataset(dataset):
    bigrams = set()
    for example in dataset:
        tokens = example['tokens']
        for i in range(len(tokens) - 1):
            bigram = (tokens[i], tokens[i+1])
            bigrams.add(bigram)
    return sorted(list(bigrams))

def generate_features_from_dataset(dataset, bigrams):
    features = np.zeros((len(dataset), len(bigrams)), dtype=int)
    for i, example in enumerate(dataset):
        tokens = example['tokens']
        for j, bigram in enumerate(bigrams):
            if bigram in zip(tokens, tokens[1:]):
                features[i, j] = 1
    return features


# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


In [45]:
# Model Training
def train_logistic_regression(X, y, learning_rate, num_iterations):
    m, n = X.shape
    theta = np.zeros((n, 1))
    for i in range(num_iterations):
        z = np.dot(X, theta)
        h = sigmoid(z)
        gradient = np.dot(X.T, (h - y)) / m
        theta -= learning_rate * gradient
    return theta

In [46]:
# Prediction
def predict(X, theta):
    z = np.dot(X, theta)
    h = sigmoid(z)
    return (h >= 0.5).astype(int)

# Evaluation
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)


In [47]:
train_data = dataset["train"]
test_data = dataset["test"]
# Generate bigrams
bigrams = generate_bigrams_from_dataset(train_data)
# Generate features
X_train = generate_features_from_dataset(train_data, bigrams)
y_train = np.array(train_data['label']).reshape(-1, 1)
X_test = generate_features_from_dataset(test_data, bigrams)
y_test = np.array(test_data['label']).reshape(-1, 1)
# Train logistic regression
learning_rate = 0.01
num_iterations = 1000
theta = train_logistic_regression(X_train, y_train, learning_rate, num_iterations)

# Evaluate on test set
y_pred = predict(X_test, theta)
accuracy_score = accuracy(y_test, y_pred)
print("Test accuracy:", accuracy_score)

Test accuracy: 0.0027149321266968325


In [49]:
#sklearn comparison
from sklearn.linear_model import LogisticRegression

# Train logistic regression using scikit-learn
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_sklearn = lr.predict(X_test)

# Evaluate scikit-learn model accuracy
accuracy_score_sklearn = accuracy(y_true, y_pred_sklearn)
print("Accuracy of scikit-learn logistic regression:", accuracy_score_sklearn)


  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.