In [2]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import joblib

# Resource files live in the relative `datasets` folder.  Forward slashes are
# used because they are accepted on Windows, Linux and macOS alike, whereas a
# backslash inside a normal string literal is both Windows-only and an invalid
# escape sequence ("\s") that newer Python versions warn about.

# Labelled corpus; the columns used below are 'text' and 'emotion'.
dataset = pd.read_csv('datasets/story_emotion4.csv', encoding='ISO-8859-1')

# Stopword list, exposed as a set for O(1) membership tests.
stopword = pd.read_csv('datasets/stopwords_tl.csv')
stopwords_set = set(stopword['stopword'])

# Dictionary stemmer: maps each listed word form to its stem.
stemmer = pd.read_csv('datasets/stem_tl.csv')
word_to_stem = dict(zip(stemmer['word'], stemmer['stem']))

# Contraction-expansion table: compiled regex -> replacement text.
# The rules are applied one after another in the order listed (dicts keep
# insertion order), so the two specific rules at the top must stay ahead of
# the generic "'y" / "'t" rules that would otherwise consume them.
replace_patterns = {
    re.compile(regex): expansion
    for regex, expansion in (
        (r"\bngayo\'y\b", 'ngayon ay'),
        (r"\bhangga\'t\b", 'hanggang'),
        (r"\b\'?y\b", ' ay'),
        (r"\b\'?t\b", ' at'),
        (r"\b\'?yan\b", 'iyan'),
        (r"\b\'?yo\b", 'iyo'),
        (r"\b\'?yon\b", 'iyon'),
        (r"\b\'?yun\b", 'iyun'),
        (r"\b\'?pagkat\b", 'sapagkat'),
        (r"\b\'?di\b", 'hindi'),
        (r"\b\'?kaw\b", "ikaw"),
        (r"\b\'?to\b", 'ito'),
        (r"\b\'?wag\b", 'huwag'),
        (r"\bgano\'n\b", 'ganoon'),
    )
}

def data_preprocess(text, replace_patterns, word_to_stem, stopwords_set):
    """Normalise one piece of text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. apply every ``pattern -> replacement`` pair of ``replace_patterns``
         in the mapping's iteration order;
      3. delete every character that is not an ASCII letter, a digit,
         whitespace, ``?`` or ``!``;
      4. tokenise with NLTK's ``word_tokenize``;
      5. drop tokens found in ``stopwords_set`` (checked on the unstemmed token);
      6. replace each remaining token by ``word_to_stem[token]`` when present.

    Args:
        text: Raw input. ``None`` and NaN (what pandas yields for an empty CSV
            cell) are treated as empty text; other non-strings go through ``str()``.
        replace_patterns: Mapping of compiled regex to replacement string.
        word_to_stem: Mapping of word form to stem.
        stopwords_set: Collection of lower-case stopwords to remove.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    for pattern, replacement in replace_patterns.items():
        text = pattern.sub(replacement, text)

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"[^a-zA-Z0-9\s?!]", '', text)
    tokens = word_tokenize(text)
    text = ' '.join(word_to_stem.get(word, word) for word in tokens if word.lower() not in stopwords_set)

    return text

# Clean every row of the corpus (the raw 'text' column is overwritten).
dataset['text'] = dataset['text'].apply(
    lambda raw_text: data_preprocess(
        raw_text,
        replace_patterns=replace_patterns,
        word_to_stem=word_to_stem,
        stopwords_set=stopwords_set,
    )
)

# Features are the cleaned sentences, targets are the emotion labels.
X, Y = dataset['text'], dataset['emotion']

# 80/20 split, stratified on the label and seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

# Bag-of-words counts: vocabulary is learned from the training split only.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# TF-IDF re-weighting: IDF statistics likewise come from the training split.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_vectorized)
X_test_tfidf = tfidf_transformer.transform(X_test_vectorized)

# One linear SVM per class (one-vs-rest).
svm = SVC(kernel='linear', C=0.1, random_state=42)
classifier = OneVsRestClassifier(svm)
classifier.fit(X_train_tfidf, Y_train)

Y_pred = classifier.predict(X_test_tfidf)

# Persist classifier, vectorizer and TF-IDF transformer together as one tuple.
joblib.dump((classifier, vectorizer, tfidf_transformer), 'OVA_Linear_model.pkl')

['OVA_Linear_model.pkl']

In [3]:
import re

import joblib
import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from nltk.tokenize import word_tokenize

# Load the trained classifier, count vectorizer and TF-IDF transformer.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load a model file that was produced by a trusted source.
model_components = joblib.load('OVA_Linear_model.pkl')
SVM_model, vectorizer, tfidf_transformer = model_components

# Rebuild the same preprocessing resources that were used during training.
# Forward slashes keep the relative paths portable across operating systems
# and avoid the invalid "\s" escape sequence of the backslash form.
stopword = pd.read_csv('datasets/stopwords_tl.csv')
stopwords_set = set(stopword['stopword'])

# Dictionary stemmer: maps each listed word form to its stem.
stemmer = pd.read_csv('datasets/stem_tl.csv')
word_to_stem = dict(zip(stemmer['word'], stemmer['stem']))

# Contraction-expansion table: compiled regex -> replacement text.
# The rules are applied one after another in the order listed (dicts keep
# insertion order), so the two specific rules at the top must stay ahead of
# the generic "'y" / "'t" rules that would otherwise consume them.
replace_patterns = {
    re.compile(regex): expansion
    for regex, expansion in (
        (r"\bngayo\'y\b", 'ngayon ay'),
        (r"\bhangga\'t\b", 'hanggang'),
        (r"\b\'?y\b", ' ay'),
        (r"\b\'?t\b", ' at'),
        (r"\b\'?yan\b", 'iyan'),
        (r"\b\'?yo\b", 'iyo'),
        (r"\b\'?yon\b", 'iyon'),
        (r"\b\'?yun\b", 'iyun'),
        (r"\b\'?pagkat\b", 'sapagkat'),
        (r"\b\'?di\b", 'hindi'),
        (r"\b\'?kaw\b", "ikaw"),
        (r"\b\'?to\b", 'ito'),
        (r"\b\'?wag\b", 'huwag'),
        (r"\bgano\'n\b", 'ganoon'),
    )
}

# Words that make the input unsuitable for children.
# Entries must not carry surrounding whitespace: a trailing space would stop
# the word from being recognised at the end of the text.
foul_words = {
    'gago', 'gaga', 'puta', 'pakyu', 'pakshet', 'buang', 'walanghiya', 'piste', 'lintik',
    'putangina', 'tarantado', 'punyeta', 'bwisit', 'kupal', 'hinyupak', 'tanga', 'tangina',
    'bobo', 'boba', 'putragis', 'syet'
}

# Numeric class label -> emotion name.
# NOTE(review): presumably these keys match the values of the 'emotion'
# column the model was trained on -- confirm against the dataset.
class_names = {
    1: 'fear',
    2: 'anger',
    3: 'joy',
    4: 'sadness',
    5: 'disgust',
    6: 'surprise'
}
def data_preprocess(text, replace_patterns, word_to_stem, stopwords_set):
    """Normalise one piece of text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. apply every ``pattern -> replacement`` pair of ``replace_patterns``
         in the mapping's iteration order;
      3. delete every character that is not an ASCII letter, a digit,
         whitespace, ``?`` or ``!``;
      4. tokenise with NLTK's ``word_tokenize``;
      5. drop tokens found in ``stopwords_set`` (checked on the unstemmed token);
      6. replace each remaining token by ``word_to_stem[token]`` when present.

    Args:
        text: Raw input. ``None`` and NaN are treated as empty text; other
            non-strings go through ``str()``.
        replace_patterns: Mapping of compiled regex to replacement string.
        word_to_stem: Mapping of word form to stem.
        stopwords_set: Collection of lower-case stopwords to remove.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    for pattern, replacement in replace_patterns.items():
        text = pattern.sub(replacement, text)

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"[^a-zA-Z0-9\s?!]", '', text)
    tokens = word_tokenize(text)
    text = ' '.join(word_to_stem.get(word, word) for word in tokens if word.lower() not in stopwords_set)

    return text

user_input = input("Enter a text: ")

# langdetect is non-deterministic unless its seed is pinned, which is most
# noticeable on short inputs; fix it so the language gate is repeatable.
DetectorFactory.seed = 0

try:
    lang = detect(user_input)
except Exception:
    # Detection can fail (e.g. input with no usable letters).  Treat that as
    # "language unknown" so the input is rejected below instead of crashing.
    lang = None

# `lang` is None when detection failed, so it must be checked before .lower().
if lang is None or lang.lower() != 'tl':
    print("Error: The system currently only accepts Tagalog words.")
else:
    user_input_processed = data_preprocess(user_input, replace_patterns, word_to_stem, stopwords_set)

    # Compare whole tokens rather than substrings, so harmless words that merely
    # contain a listed word (e.g. 'gagawin' contains 'gaga') are not flagged.
    # Both the raw and the preprocessed tokens are checked, so stemming or
    # stopword removal cannot hide a word that was typed.
    banned_words = {word.strip() for word in foul_words}
    candidate_tokens = set(re.findall(r"[a-z0-9]+", user_input.lower()))
    candidate_tokens.update(user_input_processed.lower().split())

    if banned_words & candidate_tokens:
        print("Warning: There are words that are not appropriate for children to read.")
    else:
        user_input_vectorized = vectorizer.transform([user_input_processed])
        user_input_tfidf = tfidf_transformer.transform(user_input_vectorized)

        # One margin per class for the single input row.
        decision_values = SVM_model.decision_function(user_input_tfidf)[0]

        # Softmax over the margins (max subtracted for numerical stability).
        # These are relative scores, not calibrated probabilities.
        exp_values = np.exp(decision_values - np.max(decision_values))
        probabilities = exp_values / exp_values.sum(axis=0, keepdims=True)

        # Column j of the decision function belongs to SVM_model.classes_[j].
        # Map through the actual labels when they are class_names keys, so a
        # label missing from the training data cannot shift the names; fall
        # back to the positional 1..N mapping otherwise.
        model_labels = list(SVM_model.classes_)
        if all(label in class_names for label in model_labels):
            emotion_names = [class_names[label] for label in model_labels]
        else:
            emotion_names = [class_names[i + 1] for i in range(len(model_labels))]

        emotion_probabilities_dict = {
            emotion: probability * 100
            for emotion, probability in zip(emotion_names, probabilities)
        }

        # Emotions the model never saw get an explicit 0 %.
        for emotion in class_names.values():
            if emotion not in emotion_probabilities_dict:
                emotion_probabilities_dict[emotion] = 0.0

        print("\nEmotion probabilities:")
        for emotion, percentage in emotion_probabilities_dict.items():
            print(f"{emotion}: {percentage:.2f}%")

        max_emotion = max(emotion_probabilities_dict, key=emotion_probabilities_dict.get)

        print(f"\nThe predicted emotion for the input text is: {max_emotion}")

Enter a text: nakakadire ka tingnan

Emotion probabilities:
fear: 16.78%
anger: 16.09%
joy: 16.69%
sadness: 16.19%
disgust: 18.64%
surprise: 15.60%

The predicted emotion for the input text is: disgust
