In [None]:
# Delete the /content/sample_data directory; no visible cell in this notebook reads from it.
!rm -rf /content/sample_data

In [None]:
# Fetch the dataset from Google Drive by file id. The recorded output shows it is
# written to /content/news-NLP.csv, the file name the read_csv cell loads.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:01<00:00, 30.6MB/s]


In [None]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import BertModel, BertTokenizer
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [None]:
# Download the NLTK 'stopwords' and 'wordnet' packages.
# NOTE(review): no visible cell uses stopwords or WordNetLemmatizer after this —
# confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the news dataset and discard its first column (selected by position, not by name).
df = pd.read_csv('news-NLP.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

In [None]:
# Encode the target as integers: 1 for "FAKE", 0 for every other value.
# The dtype guard makes the cell safe to re-run: without it, a second run would
# compare the already encoded integers with "FAKE" and overwrite every label with 0.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = (df['label'] == "FAKE").astype(int)

# The model input is the headline followed by the article body. A missing title
# or body is treated as an empty string; otherwise NaN would propagate into
# 'content' and reach the tokenizer as a float instead of a string.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
# Tokenizer and encoder must come from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_bert_embeddings_batch(text_list, tokenizer, bert_model, batch_size=32):
    """Embed texts with BERT and return one first-token ([CLS]) vector per text.

    The texts are tokenized and passed through ``bert_model`` in consecutive
    chunks of ``batch_size``. For every text the hidden state at sequence
    position 0 of ``last_hidden_state`` is kept.

    Args:
        text_list: Iterable of strings to embed.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``padding``, ``truncation``, ``return_tensors`` and
            ``max_length`` and returns a mapping of tensors.
        bert_model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. For empty input
        an array with zero rows is returned instead of raising.
    """
    texts = list(text_list)
    if not texts:
        # np.vstack raises on an empty list, so hand back a well-formed empty matrix.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Hugging Face models expose `.device`; the inputs must live on the same
    # device as the weights or the forward pass fails once the model is on GPU.
    model_device = getattr(bert_model, "device", None)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        if model_device is not None:
            inputs = {
                name: (value.to(model_device) if hasattr(value, "to") else value)
                for name, value in inputs.items()
            }
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # .cpu() is required before .numpy() whenever the model ran on an accelerator.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        embeddings.append(cls_embeddings)
    return np.vstack(embeddings)


In [None]:
# Targets are the integer labels; features are the BERT embeddings of every
# article, computed 16 texts per forward pass.
y = df['label'].to_numpy()
X = get_bert_embeddings_batch(
    df['content'].tolist(),
    tokenizer,
    bert_model,
    batch_size=16,
)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Features become float32 tensors; labels become int64 (long) class indices,
# the target format used with CrossEntropyLoss.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)

In [None]:
# Pair features with labels, then batch them. The training loader reshuffles on
# every pass; the test loader keeps the original order so predictions line up with y_test.
train_dataset, test_dataset = (
    TensorDataset(X_train_tensor, y_train_tensor),
    TensorDataset(X_test_tensor, y_test_tensor),
)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size); the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, num_classes) computed from the LSTM
            output at the final time step.
        """
        # new_zeros inherits dtype and device from x, so the initial states
        # also match when the model runs in float64/float16 or on an accelerator.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the classifier head.
        return self.fc(out[:, -1, :])

In [None]:
# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, and the DataLoader shuffle order drawn later from the same
# generator, repeat under Restart & Run All. 42 mirrors the split's random_state.
# (GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(42)

input_size = X_train.shape[1]  # width of one embedding vector
hidden_size = 128
num_layers = 2
num_classes = 2  # label 0 and label 1 (1 == "FAKE")
model = LSTMModel(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
num_epochs = 10

for epoch in range(num_epochs):
    for i, (data, labels) in enumerate(train_loader):
        # Each sample is a single embedding vector; insert a length-1
        # sequence axis so the batch has the 3-D shape the LSTM consumes.
        data = data.to(device).unsqueeze(1)
        labels = labels.to(device)

        # Clear stale gradients, then run forward pass, backward pass and update.
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [100/159], Loss: 0.5427
Epoch [2/10], Step [100/159], Loss: 0.3678
Epoch [3/10], Step [100/159], Loss: 0.3427
Epoch [4/10], Step [100/159], Loss: 0.2518
Epoch [5/10], Step [100/159], Loss: 0.2175
Epoch [6/10], Step [100/159], Loss: 0.0746
Epoch [7/10], Step [100/159], Loss: 0.1563
Epoch [8/10], Step [100/159], Loss: 0.0225
Epoch [9/10], Step [100/159], Loss: 0.0346
Epoch [10/10], Step [100/159], Loss: 0.0219


In [None]:
# prompt: save the model trained above using joblib or pickle

import joblib

# Serialise the whole trained module object (not only its weights) to disk.
joblib.dump(value=model, filename='trained_model.joblib')

In [None]:
# Collect the predicted class index of every test sample, batch by batch,
# with gradient tracking switched off.
y_pred_list = []
with torch.no_grad():
    for data, labels in test_loader:
        data = data.to(device).unsqueeze(1)  # add the length-1 sequence axis
        labels = labels.to(device)
        outputs = model(data)
        predicted = outputs.argmax(dim=1)  # index of the largest logit per row
        y_pred_list.extend(predicted.cpu().numpy())

y_pred = np.array(y_pred_list)


In [None]:
# Score the held-out predictions. With sklearn's defaults the positive class is
# label 1, which is "FAKE" in this notebook's encoding.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.8966
Precision: 0.9510
Recall: 0.8344
F1 Score: 0.8889


In [None]:
# prompt: load the model from trained_model.joblib, then check whether a sentence is fake or real

import joblib
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Restore the classifier object written by the save cell.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files you created yourself.
loaded_model = joblib.load(filename='trained_model.joblib')

# The classifier consumes BERT embeddings, so the tokenizer and encoder are
# (re)loaded here to turn raw text into the same kind of features.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(text, tokenizer, bert_model):
    """Return the first-token ([CLS]) BERT embedding of ``text`` as a NumPy array.

    Args:
        text: A string (or list of strings) to embed.
        tokenizer: Callable that accepts ``text`` plus the keyword arguments
            ``padding``, ``truncation``, ``return_tensors`` and ``max_length``
            and returns a mapping of tensors.
        bert_model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``.

    Returns:
        ``numpy.ndarray`` holding, for each input text, the hidden state at
        sequence position 0; a single string gives one row.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Hugging Face models expose `.device`; keep the inputs on the same device
    # as the weights so the forward pass also works when the model is on GPU.
    model_device = getattr(bert_model, "device", None)
    if model_device is not None:
        inputs = {
            name: (value.to(model_device) if hasattr(value, "to") else value)
            for name, value in inputs.items()
        }

    with torch.no_grad():
        outputs = bert_model(**inputs)
    # .cpu() is required before .numpy() whenever the model ran on an accelerator.
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

def predict_fake_or_real(text, model, tokenizer, bert_model):
    """Classify a single news text as "FAKE" or "REAL".

    Args:
        text: The news text to classify (one string; the result is read with
            ``.item()``, which requires exactly one prediction).
        model: Trained classifier. It is called with a float32 tensor of
            shape (1, 1, embedding_dim) and must return per-class logits.
        tokenizer: Tokenizer forwarded to ``get_bert_embeddings``.
        bert_model: Encoder forwarded to ``get_bert_embeddings``.

    Returns:
        "FAKE" when the highest-scoring class index is 1, otherwise "REAL".
    """
    embeddings = get_bert_embeddings(text, tokenizer, bert_model)
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

    # Run on the device the classifier's weights actually live on. Picking
    # "cuda if available" regardless of the model caused a device mismatch for
    # a CPU-resident model on a GPU machine. The old rule remains the fallback
    # for models that expose no parameters.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    embeddings_tensor = embeddings_tensor.to(device).unsqueeze(1)  # add the sequence dimension
    with torch.no_grad():
        output = model(embeddings_tensor)
    predicted = output.argmax(dim=1)

    return "FAKE" if predicted.item() == 1 else "REAL"

# Example usage: classify one hand-written sentence with the reloaded model.
text_to_check = "This is a sample news text. Is this fake or real?"
prediction = predict_fake_or_real(
    text=text_to_check,
    model=loaded_model,
    tokenizer=tokenizer,
    bert_model=bert_model,
)
print(f"The news is predicted to be: {prediction}")