In [1]:
import joblib
import re
import string
from nltk.stem import PorterStemmer
import json
import nltk

In [2]:
# Text preprocessing
def wordopt(text):
    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

ps = PorterStemmer()
with open('resources/stopwords-tl.json', 'r') as f:
    stopwords = json.load(f)

def preprocess(text):
    text = wordopt(text)
    tokens = nltk.word_tokenize(text)
    stems = [ps.stem(token) for token in tokens]
    filtered = [stem for stem in stems if stem not in stopwords]
    return filtered

def predict(text):
    processed = preprocess(text)

    # Load vectorizer
    vectorizer = joblib.load('resources/vectorizer_mlp.joblib')

    # Transform the validation data using the same vectorizer
    tfidf = vectorizer.transform(processed)

    model = joblib.load('resources/stack_mlp.joblib')  # Update with your model path
    # Make prediction using the loaded model
    # prediction = model.predict_proba(tfidf)[0]  # Assuming binary prediction (0 or 1)
    prediction = model.predict_proba(tfidf)[0]  # Assuming binary prediction (0 or 1)
    value1 = prediction[0]  # Probability for class 0
    value2 = prediction[1]  # Probability for class 1
    return value1, value2

In [3]:
input_text = 'Sapu kamuwang-muwang at ang napansin nga namin ang malakas na usok ay napupunta sa bata, makikita nantin sa surveilance iyan. Ibig sabihin ay talamak na talaga itong mag-asawan kontra-droga? Source: GMA News'
# input_text = 'Humihiling ngayon ng tulong ang 12-years old brain cancer patient na si Jinn Sam sa mga taong may mabubuting puso at'
value1, value2 = predict(input_text)


# Now you can use value1 and value2 separately
print("Probability for class 0:", value1)
print("Probability for class 1:", value2)
# Class 0 = False
# Class 1 = True

Probability for class 0: 0.9863001389733473
Probability for class 1: 0.013699861026652606


In [4]:
# input_text = 'Sapu kamuwang-muwang at ang napansin nga namin ang malakas na usok ay napupunta sa bata, makikita nantin sa surveilance iyan. Ibig sabihin ay talamak na talaga itong mag-asawan kontra-droga? Source: GMA News'
input_text2 = 'Humihiling ngayon ng tulong ang 12-years old brain cancer patient na si Jinn Sam sa mga taong may mabubuting puso at'
value1, value2 = predict(input_text2)


# Now you can use value1 and value2 separately
print("Probability for class 0:", value1)
print("Probability for class 1:", value2)
# Class 0 = False
# Class 1 = True

Probability for class 0: 0.9925405159401599
Probability for class 1: 0.007459484059840081


Instead of using a threshold, we will be using Weighted Decision.

If value1 == value2 (which means 50:50), it will interpret as Fake News.

Why?
Class 0 (Fake Classe) is more important than the other. In the application, false negatives are more critical. A false negative occurs when the model fails to identify a piece of news as fake when it is indeed fake. This means the fake news goes undetected or is mistakenly classified as genuine.
Reducing false negatives is crucial in fake news detection to ensure the effectiveness of the detection system