# Prepare Data


In [1]:
!rm -rf /content/sample_data

In [2]:
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:01<00:00, 20.0MB/s]


# Import Lib


In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare Training Data
### Lemmatizer + NLTK

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
df = pd.read_csv('news-NLP.csv')
df = df.drop(df.columns[0], axis=1)

In [6]:
df['label'] = df['label'].apply(lambda x: 1 if x == "FAKE" else 0)
df['content'] = df['title'] + ' ' + df['text']

In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words

In [9]:
df['processed_content'] = df['content'].apply(preprocess_text)

# GloVe Model


In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-12-04 11:27:50--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-04 11:27:50--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-04 11:27:51--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
embedding_dim = 100
word_index = {word: index for index, word in enumerate(set([word for sublist in df['processed_content'] for word in sublist]))}
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

max_length = 100
sequences = [[word_index[word] for word in text if word in word_index] for text in df['processed_content']]
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label'], test_size=0.2, random_state=42)

# Training With LSTM

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization

model = Sequential()

model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                    input_length=max_length, trainable=False))

model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.summary()

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    callbacks=[early_stopping]
)



Epoch 1/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 203ms/step - accuracy: 0.5938 - loss: 0.7237 - val_accuracy: 0.6963 - val_loss: 0.5832
Epoch 2/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 209ms/step - accuracy: 0.7203 - loss: 0.5675 - val_accuracy: 0.7485 - val_loss: 0.5296
Epoch 3/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 213ms/step - accuracy: 0.7494 - loss: 0.5212 - val_accuracy: 0.6568 - val_loss: 0.5932
Epoch 4/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 238ms/step - accuracy: 0.7126 - loss: 0.5665 - val_accuracy: 0.7446 - val_loss: 0.5253
Epoch 5/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 224ms/step - accuracy: 0.7583 - loss: 0.4923 - val_accuracy: 0.8225 - val_loss: 0.4790
Epoch 6/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 228ms/step - accuracy: 0.7469 - loss: 0.5042 - val_accuracy: 0.7712 - val_loss: 0.4854
Epoch 7/20

In [25]:
model.save('model_glove_lstm.h5')



# Evaluate
### Accuracy, Precision, Recall, F1-Score

In [21]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy * 100))
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print('Precision: %.2f' % (precision * 100))
print('Recall: %.2f' % (recall * 100))
print('F1-Score: %.2f' % (f1 * 100))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.8731 - loss: 0.3118
Accuracy: 86.42
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step
Precision: 85.96
Recall: 86.78
F1-Score: 86.37


# Test Sentence Real or Fake

In [None]:
sentence = "This is a sample sentence to check if it is real or fake."
processed_sentence = preprocess_text(sentence)

sequence = [[word_index[word] for word in processed_sentence if word in word_index]]
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

prediction = model.predict(padded_sequence)
print(prediction[0][0])

if prediction[0][0] < 0.5:
  print("The sentence is classified as FAKE.")
else:
  print("The sentence is classified as REAL.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
0.42140833
The sentence is classified as FAKE.
