# Prepare Data

In [None]:
# Delete the /content/sample_data directory (presumably Colab's bundled sample files);
# nothing later in this notebook reads from it.
!rm -rf /content/sample_data

In [None]:
# Download the dataset from Google Drive by file id; per the recorded output
# it is saved as /content/news-NLP.csv.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 38.7MB/s]


# Import Libraries

In [None]:
!pip install scikit-learn
import pandas as pd
from gensim.models import FastText
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler



# Prepare Training Data

In [None]:
# Fetch the NLTK resources used below: the stop-word lists (read via
# stopwords.words) and WordNet (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the downloaded CSV and discard its first column, whatever it is named.
df = pd.read_csv('news-NLP.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

In [None]:
# Encode the target as integers: 1 for "FAKE", 0 for any other value.
# The dtype guard makes this cell safe to re-run: once the column is already
# numeric, comparing it with "FAKE" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(lambda x: 1 if x == "FAKE" else 0)

# Join headline and body into a single text field. Missing values are replaced
# by empty strings first; otherwise NaN would propagate into 'content' and the
# regex-based preprocessing would fail on a non-string value.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
# Shared text-cleaning resources used by preprocess_text:
# a WordNet lemmatizer and the English stop-word list held as a set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of lower-cased, lemmatized tokens.

    Every non-word character is replaced by a space, the result is
    lower-cased and split on whitespace. Tokens found in the module-level
    ``stop_words`` are dropped; the rest go through ``lemmatizer.lemmatize``.

    Args:
        text: The string to tokenize.

    Returns:
        list: The surviving tokens, in their original order.
    """
    tokens = re.sub(r'\W', ' ', text).lower().split()
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

In [None]:
# Tokenize every article; each cell of the new column holds a list of tokens.
df['processed_content'] = df['content'].map(preprocess_text)

# FastText Model

In [None]:
# Train FastText embeddings on the tokenized articles.
fasttext_model = FastText(
    sentences=df['processed_content'],
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=5,
    workers=4,
    sg=0,  # per the gensim docs, 0 selects CBOW rather than skip-gram
    epochs=10,
)

In [None]:
def document_vector(doc, model):
    """Average the word vectors of a tokenized document.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (supporting ``key_to_index`` membership
            tests and indexing by a list of tokens) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv.key_to_index``, or a zero vector of length
        ``model.vector_size`` when no token is known.
    """
    known_tokens = []
    for token in doc:
        if token in model.wv.key_to_index:
            known_tokens.append(token)

    # No in-vocabulary token: fall back to an all-zero embedding.
    if not known_tokens:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[known_tokens], axis=0)

In [None]:
# Embed each tokenized article as the mean of its FastText word vectors.
df['doc_vector'] = df['processed_content'].map(
    lambda tokens: document_vector(tokens, fasttext_model)
)

In [None]:
# Target vector and feature matrix: one row per article, built by stacking
# the per-document embedding vectors.
y = df['label'].to_numpy()
X = np.vstack(df['doc_vector'].to_numpy())

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist the trained FastText model; the inference cell at the end of the
# notebook reloads it from this same path.
fasttext_model.save("fasttext_model.bin")

# Training with Naive Bayes

In [None]:
# MultinomialNB is fitted on non-negative features, so every embedding
# dimension is rescaled to [0, 1]. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
# clip=True keeps held-out and inference-time samples inside [0, 1] as well:
# without it, values outside the training min/max would be mapped below 0 or
# above 1 and fed to the classifier as out-of-range "counts".
scaler = MinMaxScaler(clip=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the scaled training features.
# The fit call stays the last expression so the cell still displays the estimator.
model = MultinomialNB()
model.fit(X=X_train_scaled, y=y_train)

# Evaluate

In [None]:
# Predicted labels for the held-out split (1 = FAKE, 0 = otherwise).
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Evaluate the held-out predictions with each metric in turn; all four are
# called with the same (y_true, y_pred) arguments and default settings.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f"Accuracy : {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy : 0.8042620363062352
Precision: 0.7785923753665689
Recall: 0.8455414012738853
F1 Score: 0.8106870229007633


In [None]:
import joblib

# Save the fitted MinMaxScaler so inference can apply the same scaling as training
joblib.dump(scaler, 'minmax_scaler_fT.pkl')

# Save the trained MultinomialNB model (this call's return value is the cell output)
joblib.dump(model, 'model_fasttext_nb.pkl')

['model_fasttext_nb.pkl']

In [None]:
# prompt: check whether a sentence is real or fake

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText
import joblib

# Load the saved model and scaler
# Make sure the NLTK resources needed by preprocess_text are available, then
# rebuild the lemmatizer and the stop-word set.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Restore the artifacts written earlier in this notebook.
# NOTE: joblib.load unpickles its input; only load files you produced or trust.
fasttext_model = FastText.load("fasttext_model.bin")
scaler = joblib.load('minmax_scaler_fT.pkl')
model = joblib.load('model_fasttext_nb.pkl')

def preprocess_text(text):
    """Tokenize ``text`` into lower-cased, lemmatized, stop-word-free tokens.

    Same logic as the definition in the training section of this notebook;
    repeated here, presumably so this cell can run on its own.

    Args:
        text: The string to tokenize.

    Returns:
        list: Lemmatized tokens in original order, with every token that
        appears in ``stop_words`` removed.
    """
    # Non-word characters become spaces before lower-casing and splitting.
    lowered = re.sub(r'\W', ' ', text).lower()
    return [
        lemmatizer.lemmatize(token)
        for token in filter(lambda w: w not in stop_words, lowered.split())
    ]

def document_vector(doc, model):
    """Return the mean word vector of the in-vocabulary tokens of ``doc``.

    Same logic as the definition in the FastText section of this notebook;
    repeated here, presumably so this cell can run on its own.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (with ``key_to_index`` and indexing by a
            list of tokens) and ``vector_size``.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` if none of the tokens is known.
    """
    in_vocab = list(filter(lambda tok: tok in model.wv.key_to_index, doc))
    if in_vocab:
        return np.mean(model.wv[in_vocab], axis=0)
    # Nothing recognised: fall back to an all-zero embedding.
    return np.zeros(model.vector_size)

def predict_fake_news(text):
    """Classify a piece of text as "FAKE" or "REAL".

    The text is tokenized with ``preprocess_text``, embedded with
    ``document_vector`` using the loaded ``fasttext_model``, scaled with the
    loaded ``scaler`` and passed to the loaded classifier ``model``.

    Args:
        text: The raw text to classify.

    Returns:
        str: "FAKE" when the classifier predicts label 1, otherwise "REAL".
    """
    tokens = preprocess_text(text)
    # The scaler and classifier expect a 2-D array, hence the single-row reshape.
    features = document_vector(tokens, fasttext_model).reshape(1, -1)
    predicted_label = model.predict(scaler.transform(features))[0]
    if predicted_label == 1:
        return "FAKE"
    return "REAL"

# Example usage: classify one sentence and print the verdict.
sentence = "This is an example sentence."
prediction = predict_fake_news(text=sentence)
print(f"The sentence is predicted as: {prediction}")