## [***Rule-based Sentiment Classifier***](https://github.com/neubig/anlp-code/tree/main/data)

In [1]:
# Import train, test, dev data
import os
from pathlib import Path
base_dir = Path("/home/amiagarw/01-Programming/Python/Data/sst-sentiment-text-threeclass/")

# Paths to the specific files
dev = base_dir / "dev.txt"
test = base_dir / "test.txt"
train = base_dir / "train.txt"

In [2]:
# Read in the data from the training and dev (or test) sets
def read_xy_data(filename: str) -> tuple[list[str], list[int]]:
    x_data = []
    y_data = []
    with open(filename, 'r') as f:
        for line in f:
            label, text = line.strip().split(' ||| ')
            x_data.append(text)
            y_data.append(int(label))
    return x_data, y_data


In [3]:
x_train, y_train = read_xy_data(train)
x_test, y_test = read_xy_data(dev)

In [4]:
print(f"x_train: {x_train[0]}")
print(f"y_train: {y_train[0]}")

x_train: The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
y_train: 1


In [5]:
print(x_train[0])

The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .


It takes in a text X and return a label of 
- "1" if the sentiment of the text is positive,
- "-1" if the sentiment of the text is negative, and
- "0" if the sentiment of the text is neutral. 

***Extract_features(X)*** extracts a dictionary of (named) feature values from the text.
Create this by hand, and a simple example is shown for you.

***feature_weights***, a dictionary to assign a weight to each extracted feature.

In [6]:
# # Original code
# def extract_features(x: str) -> dict[str, float]:
#     features = {}
#     x_split = x.split(' ')
    
#     # Count the number of "good words" and "bad words" in the text
#     good_words = ['love', 'good', 'nice', 'great', 'enjoy', 'enjoyed']
#     bad_words = ['hate', 'bad', 'terrible', 'disappointing', 'sad', 'lost', 'angry']
#     for x_word in x_split:
#         if x_word in good_words:
#             features['good_word_count'] = features.get('good_word_count', 0) + 1
#         if x_word in bad_words:
#             features['bad_word_count'] = features.get('bad_word_count', 0) + 1
    
#     # The "bias" value is always one, to allow us to assign a "default" score to the text
#     features['bias'] = 1
    
#     return features

# feature_weights = {'good_word_count': 1.0, 'bad_word_count': -1.0, 'bias': 0.5}

In [7]:
import re
from collections import Counter
from typing import List

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = set(nltk.corpus.stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords

good_words = set()
bad_words = set()

# Feature weights
feature_weights = {'good_word_count': 1.0, 'bad_word_count': -1.0, 'bias': 0.5}

import re

def preprocess_text(text: str) -> str:
    """
    Preprocess text by:
    - Correcting contractions dynamically using POS tagging
    - Removing extra spaces and handling misplaced spaces around punctuation and apostrophes
    - Enforcing single spaces between words
    - Converting text to lowercase
    """
    # Tokenize and POS-tag the text
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    # Reconstruct contractions dynamically
    corrected_tokens = []
    i = 0
    while i < len(tagged):
        word, tag = tagged[i]
        if word == "'":
            if i > 0 and i < len(tagged) - 1:
                prev_word, prev_tag = tagged[i - 1]
                next_word, next_tag = tagged[i + 1]

                # Handle contractions like "it 's" -> "it's"
                if next_word in ["s", "ll", "ve", "re", "d", "t"]:
                    corrected_tokens[-1] = prev_word + "'" + next_word
                    i += 1  # Skip the next word
                else:
                    corrected_tokens.append(word)
            else:
                corrected_tokens.append(word)
        else:
            corrected_tokens.append(word)
        i += 1

    # Reconstruct the text
    text = " ".join(corrected_tokens)

    # Handle misplaced spaces around punctuation (e.g., "word ," -> "word,")
    text = re.sub(r"\s+([,.!?])", r"\1", text)   # Remove space before punctuation
    text = re.sub(r"([,.!?])\s+", r"\1 ", text)  # Ensure space after punctuation

    # Replace or handle double dashes (-- or ––)
    text = re.sub(r"--+", "—", text)  # Convert multiple dashes to an em dash

    # Remove extra spaces and enforce single spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Convert to lowercase
    text = text.lower()

    return text

def create_sentiment_dictionaries(x_train: List[str], y_train: List[int]) -> None:
    """
    Dynamically create dictionaries of good words and bad words from x_train and y_train.
    Updates the global variables `good_words` and `bad_words`.
    """
    global good_words, bad_words
    good_words_counter = Counter()
    bad_words_counter = Counter()

    for text, label in zip(x_train, y_train):
        text = preprocess_text(text)
        words = nltk.word_tokenize(text)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        relevant_words = filter_words_by_pos(filtered_words)
        if label == 1:
            good_words_counter.update(relevant_words)
        elif label == -1:
            bad_words_counter.update(relevant_words)
    
    good_words = set(good_words_counter.keys()) - set(bad_words_counter.keys())
    bad_words = set(bad_words_counter.keys()) - set(good_words_counter.keys())

def filter_words_by_pos(words: List[str]) -> List[str]:
    """
    Retain only adjectives, adverbs, verbs, and nouns to focus on sentiment-related words.
    """
    relevant_pos_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NN', 'NNS'}
    tagged_words = nltk.pos_tag(words)
    return [word for word, pos in tagged_words if pos in relevant_pos_tags]

def extract_features(x: str) -> dict[str, float]:
    """
    Extract features for sentiment analysis, including negation handling and sentence-level features.
    """
    features = Counter()
    x = preprocess_text(x)
    words = nltk.word_tokenize(x)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    relevant_words = filter_words_by_pos(filtered_words)

    # Count good and bad words
    features['good_word_count'] = sum(1 for word in relevant_words if word in good_words)
    features['bad_word_count'] = sum(1 for word in relevant_words if word in bad_words)

    # Negation handling
    negations = {"not", "never", "no"}
    negated = False
    for word in relevant_words:
        if word in negations:
            negated = not negated
            continue
        if negated:
            if word in good_words:
                features['bad_word_count'] += 1
            elif word in bad_words:
                features['good_word_count'] += 1
        negated = False if word.endswith((".", ",", "!", "?")) else negated

    # Add sentence-level features
    features['bias'] = 1  # Bias feature
    features['punctuation_count'] = x.count(",") + x.count("...")  # Punctuation
    features['length'] = len(filtered_words) / 100.0  # Normalized sentence length

    return dict(features)

# Create sentiment dictionaries
create_sentiment_dictionaries(x_train, y_train)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amiagarw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/amiagarw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/amiagarw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# Create sentiment dictionaries dynamically from x_train and y_train
# create_sentiment_dictionaries(x_train, y_train)

# Feature weights for scoring
# feature_weights = {'good_word_count': 1.0, 'bad_word_count': -1.0, 'bias': 0.5}

In [9]:
# Run the classifier over the training and dev (test) sets and calculate accuracy

In [10]:
def run_classifier(x: str) -> int:
    score = 0
    for feat_name, feat_value in extract_features(x).items():
        score = score + feat_value * feature_weights.get(feat_name, 0)
    if score > 0:
        return 1
    elif score < 0:
        return -1
    else:
        return 0

In [11]:
def calculate_accuracy(x_data: list[str], y_data: list[int]) -> float:
    total_number = 0
    correct_number = 0
    for x, y in zip(x_data, y_data):
        y_pred = run_classifier(x)
        total_number += 1
        if y == y_pred:
            correct_number += 1
    return correct_number / float(total_number)

In [12]:
label_count = {}
for y in y_test:
    if y not in label_count:
        label_count[y] = 0
    label_count[y] += 1
print(label_count)

{1: 444, 0: 229, -1: 428}


In [13]:
train_accuracy = calculate_accuracy(x_train, y_train)
test_accuracy = calculate_accuracy(x_test, y_test)
print(f'Train accuracy: {train_accuracy}')
print(f'Dev/test accuracy: {test_accuracy}')

Train accuracy: 0.7299859550561798
Dev/test accuracy: 0.5004541326067211


In [14]:
# Error Analysis
# The following two functions allow you to randomly observe some mistaken examples, which may help you improve the classifier.
# Feel free to write more sophisticated methods for error analysis as well.

In [15]:
import random
def find_errors(x_data, y_data):
    error_ids = []
    y_preds = []
    for i, (x, y) in enumerate(zip(x_data, y_data)):
        y_preds.append(run_classifier(x))
        if y != y_preds[-1]:
            error_ids.append(i)
    for _ in range(5):
        my_id = random.choice(error_ids)
        x, y, y_pred = x_data[my_id], y_data[my_id], y_preds[my_id]
        print(f'{x}\ntrue label: {y}\npredicted label: {y_pred}\n')

In [16]:
find_errors(x_train, y_train)

It 's dark and tragic , and lets the business of the greedy talent agents get in the way of saying something meaningful about facing death
true label: 0
predicted label: 1

Snipes relies too much on a scorchingly plotted dramatic scenario for its own good .
true label: 0
predicted label: 1

Anyone who gets chills from movies with giant plot holes will find plenty to shake and shiver about in ` The Ring . '
true label: 0
predicted label: -1

Qutting may be a flawed film , but it is nothing if not sincere .
true label: 0
predicted label: 1

A rote exercise in both animation and storytelling .
true label: -1
predicted label: 1

