In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report

# Forward slashes are used in the paths: they work on Windows as well as
# POSIX, and avoid the invalid "\s" escape sequence that the backslash
# spelling relied on.

# Corpus; later code reads its 'text' and 'emotion' columns.
dataset = pd.read_csv('datasets/story_emotion3.csv')

# Stopword list, kept as a set for O(1) membership tests.
stopword = pd.read_csv('datasets/stopwords_tl.csv')
stopwords_set = set(stopword['stopword'])

# Stemming lookup table: surface form ('word') -> stem ('stem').
stemmer = pd.read_csv('datasets/stem_tl.csv')
word_to_stem = dict(zip(stemmer['word'], stemmer['stem']))

# Contraction-expansion rules as (regex source, replacement) pairs.
# Order matters: data_preprocess applies them in dict insertion order, so the
# word-specific rules ("ngayo'y", "hangga't") come before the generic
# "'y" / "'t" rules.
_CONTRACTION_RULES = (
    (r"\bngayo\'y\b", 'ngayon ay'),
    (r"\bhangga\'t\b", 'hanggang'),
    (r"\b\'?y\b", ' ay'),
    (r"\b\'?t\b", ' at'),
    (r"\b\'?yan\b", 'iyan'),
    (r"\b\'?yo\b", 'iyo'),
    (r"\b\'?yon\b", 'iyon'),
    (r"\b\'?yun\b", 'iyun'),
    (r"\b\'?pagkat\b", 'sapagkat'),
    (r"\b\'?di\b", 'hindi'),
    (r"\b\'?kaw\b", "ikaw"),
    (r"\b\'?to\b", 'ito'),
    (r"\b\'?wag\b", 'huwag'),
    (r"\bgano\'n\b", 'ganoon'),
)

# Maps each compiled pattern to its replacement text.
replace_patterns = {
    re.compile(regex_source): expansion
    for regex_source, expansion in _CONTRACTION_RULES
}

# Human-readable emotion names keyed by integer id, numbered from 1 in the
# order listed here.
class_names = dict(
    enumerate(
        ('fear', 'anger', 'joy', 'sadness', 'disgust', 'surprise'),
        start=1,
    )
)
def data_preprocess(text: str, replace_patterns: dict, word_to_stem: dict, stopwords_set: set) -> str:
    """Normalize one document into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Apply every ``pattern -> replacement`` substitution in
         ``replace_patterns`` (in the mapping's iteration order).
      3. Remove every character that is not an ASCII letter, a digit,
         whitespace, ``?`` or ``!``.
      4. Tokenize with NLTK's ``word_tokenize``.
      5. Drop tokens found in ``stopwords_set`` and map the remaining tokens
         through ``word_to_stem`` (tokens without an entry are kept as-is).

    Args:
        text: Raw document text.
        replace_patterns: Mapping of compiled regex -> replacement string.
        word_to_stem: Mapping of token -> stem.
        stopwords_set: Tokens to discard; checked against the token before
            stemming.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()

    for pattern, replacement in replace_patterns.items():
        text = pattern.sub(replacement, text)

    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence (a warning today, slated to become an error).
    text = re.sub(r"[^a-zA-Z0-9\s?!]", '', text)
    tokens = word_tokenize(text)

    # The text was lowercased above, so tokens need no further case folding.
    kept = [word_to_stem.get(word, word) for word in tokens if word not in stopwords_set]

    return ' '.join(kept)

# Run data_preprocess over every row of the 'text' column; the lookup tables
# are forwarded positionally after the row value.
dataset['text_preprocessed'] = dataset['text'].apply(
    data_preprocess,
    args=(replace_patterns, word_to_stem, stopwords_set),
)

class MulticlassSVM:
    """One-vs-rest linear SVM trained with per-sample hinge-loss updates.

    One binary linear classifier (weights + bias) is trained per distinct
    label; prediction picks the label whose classifier gives the highest
    score ``X @ weights + bias``.

    Inputs are expected to be dense and array-like (NumPy arrays, nested
    lists, ...); they are converted with ``np.asarray``.

    Attributes:
        learning_rate: Step size of each update.
        num_epochs: Number of full passes over the training samples.
        C: Factor on the weight-shrink term ``2 * C * weights`` applied in
            each update (larger values shrink the weights more).
        classes: Sorted unique labels seen by the last ``fit`` (``None``
            before fitting).
        classifiers: ``{label: {"weights": ndarray, "bias": number}}``.
    """

    def __init__(self, learning_rate=0.01, num_epochs=5, C=1.0):
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.C = C
        self.classes = None
        self.classifiers = {}

    def fit(self, X, Y):
        """Train one binary classifier per distinct label in ``Y``.

        Args:
            X: Dense array-like of shape (num_samples, num_features).
            Y: Array-like of num_samples labels.
        """
        # Coerce up front: with a plain list, ``Y == class_label`` would be a
        # single bool instead of an element-wise mask.
        X = np.asarray(X)
        Y = np.asarray(Y)

        self.classes = np.unique(Y)
        # Reset so a refit does not keep classifiers for labels that are
        # absent from the new Y.
        self.classifiers = {}

        for class_label in self.classes:
            # +1 for the current class, -1 for every other class.
            binary_labels = np.where(Y == class_label, 1, -1)
            weights, bias = self._train_one_vs_rest(X, binary_labels)
            self.classifiers[class_label] = {"weights": weights, "bias": bias}

    def predict(self, X):
        """Return the highest-scoring label for each row of ``X``.

        Args:
            X: Dense array-like of shape (num_samples, num_features).

        Returns:
            ndarray of num_samples labels taken from ``self.classes``.
        """
        X = np.asarray(X)
        predictions = np.zeros((X.shape[0], len(self.classes)))

        for i, class_label in enumerate(self.classes):
            classifier = self.classifiers[class_label]
            predictions[:, i] = np.dot(X, classifier["weights"]) + classifier["bias"]

        # argmax returns the first maximum, so ties go to the smaller label.
        return self.classes[np.argmax(predictions, axis=1)]

    def _train_one_vs_rest(self, X, binary_labels):
        """Fit a single binary linear classifier.

        Samples are visited in order for ``num_epochs`` passes. Whenever a
        sample's margin ``y * (x @ weights + bias)`` is below 1, the weights
        move by ``learning_rate * (y * x - 2 * C * weights)`` and the bias by
        ``learning_rate * y``. Samples with margin >= 1 cause no update.

        Args:
            X: Dense array-like of shape (num_samples, num_features).
            binary_labels: Array-like of +1 / -1 targets, one per sample.

        Returns:
            Tuple ``(weights, bias)``.
        """
        X = np.asarray(X)
        binary_labels = np.asarray(binary_labels)

        num_samples, num_features = X.shape
        weights = np.zeros(num_features)
        bias = 0

        for _ in range(self.num_epochs):
            for i in range(num_samples):
                if binary_labels[i] * (np.dot(X[i], weights) + bias) < 1:
                    weights = weights + self.learning_rate * (
                        binary_labels[i] * X[i] - 2 * self.C * weights
                    )
                    bias = bias + self.learning_rate * binary_labels[i]

        return weights, bias

# Feature extraction: raw term counts, then TF-IDF re-weighting.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# NOTE(review): the raw 'text' column is used as input here, while the
# 'text_preprocessed' column built earlier is not used in this section --
# confirm this is intentional.
X, Y = dataset['text'], dataset['emotion']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Both transformers are fitted on the training split only; the test split is
# transformed with the fitted vocabulary and IDF weights.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_vectorized)

X_test_vectorized = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_vectorized)

# MulticlassSVM indexes rows of a dense array, so densify the sparse matrices.
train_features = X_train_tfidf.toarray()
test_features = X_test_tfidf.toarray()

# Train the one-vs-rest SVM and predict labels for the held-out split.
svm_classifier = MulticlassSVM(C=0.1)
svm_classifier.fit(train_features, Y_train.values)
Y_pred = svm_classifier.predict(test_features)

# Overall accuracy.
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1, labelled with the names in class_names.
report = classification_report(Y_test, Y_pred, target_names=class_names.values())
print("\nClassification Report:\n", report)

Accuracy: 0.6290322580645161

Classification Report:
               precision    recall  f1-score   support

        fear       0.73      0.55      0.63        20
       anger       0.59      0.48      0.53        21
         joy       0.74      0.67      0.70        21
     sadness       0.80      0.38      0.52        21
     disgust       0.83      0.75      0.79        20
    surprise       0.44      0.95      0.61        21

    accuracy                           0.63       124
   macro avg       0.69      0.63      0.63       124
weighted avg       0.69      0.63      0.63       124

