# Prepare Data


In [None]:
# Recursively delete /content/sample_data (presumably the default Colab
# sample datasets, which this notebook does not use).
!rm -rf /content/sample_data

In [None]:
# Fetch the dataset from Google Drive by file id; per the captured output it
# is saved as news-NLP.csv in the working directory.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 57.2MB/s]


# Import Libraries


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare Training Data
### Lemmatizer + NLTK

In [None]:
# Fetch the NLTK corpora needed later: the stop-word lists and WordNet
# (used by WordNetLemmatizer).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the downloaded CSV and discard its first column
# (presumably a saved row index - TODO confirm against the file).
df = pd.read_csv('news-NLP.csv')
first_column = df.columns[0]
df = df.drop(columns=[first_column])

In [None]:
# Encode the target as 1 for "FAKE" and 0 for everything else.
# Accepting an already-encoded 1 as well keeps this cell idempotent: with the
# plain `x == "FAKE"` test, re-running the cell would turn every label into 0.
df['label'] = df['label'].isin(["FAKE", 1]).astype(int)

# Join title and body into one text field. Missing values are replaced with
# an empty string first; otherwise a NaN title or text would make the whole
# `content` value NaN and the text preprocessing would fail on a float.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
# Shared NLP resources used by preprocess_text.
# WordNet-based lemmatizer instance, created once and reused for every token.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Clean a raw string and return it as a list of lemmatized tokens.

    Every non-word character is replaced by a space, the text is lower-cased
    and split on whitespace, tokens found in the module-level ``stop_words``
    set are dropped, and the remaining tokens are passed through the
    module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    cleaned = re.sub(r'\W', ' ', text).lower()
    tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

In [None]:
# Run the cleaning pipeline on every article; each entry of the new column
# holds the token list returned by preprocess_text.
content_column = df['content']
df['processed_content'] = content_column.apply(preprocess_text)

# GloVe Model


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-11-22 16:31:12--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-22 16:31:12--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-22 16:31:12--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Parse glove.6B.100d.txt into a {token: float32 vector} lookup table.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        # The first whitespace-separated field is the token; the remaining
        # fields are the vector components.
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Create embedding matrix
embedding_dim = 100

# Build the vocabulary in sorted order so the word -> index mapping is the
# same on every run (iterating a raw set of strings is not reproducible
# across kernel restarts).
vocabulary = sorted({word for sublist in df['processed_content'] for word in sublist})

# Indices start at 1: index 0 is the value pad_sequences uses for padding, so
# it must not also stand for a real word. The matrix therefore has
# len(word_index) + 1 rows, with row 0 left as the all-zero padding vector.
word_index = {word: index for index, word in enumerate(vocabulary, start=1)}
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Convert text to sequences of indices
max_length = 100
sequences = [[word_index[word] for word in text if word in word_index] for text in df['processed_content']]
# Pad short sequences / cut long ones at the end so that every row has max_length entries.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Split data: 80% train / 20% test, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label'], test_size=0.2, random_state=42)

# Training With LSTM

In [None]:
# Two-layer LSTM binary classifier on top of frozen GloVe embeddings.
model = Sequential()

# Declare the input shape explicitly. The Embedding `input_length` argument
# is deprecated in Keras 3, and without a declared input shape the model
# would stay unbuilt and model.summary() could not report shapes/parameters.
model.add(tf.keras.Input(shape=(max_length,)))

# Embedding layer initialised from the pre-trained matrix and kept frozen.
model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                    trainable=False))

# First LSTM returns the full sequence so it can feed the second LSTM.
model.add(LSTM(64, return_sequences=True, recurrent_dropout=0.2))
model.add(Dropout(0.3))

# Second LSTM returns only its final output.
model.add(LSTM(32, recurrent_dropout=0.2))
model.add(Dropout(0.3))

# Single sigmoid unit: the output is a probability for label 1.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [None]:
# Fit for 10 epochs in mini-batches of 64, reporting loss/accuracy on the
# test split after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 504ms/step - accuracy: 0.6212 - loss: 0.6489 - val_accuracy: 0.7151 - val_loss: 0.5449
Epoch 2/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 388ms/step - accuracy: 0.7188 - loss: 0.5577 - val_accuracy: 0.7822 - val_loss: 0.4772
Epoch 3/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 497ms/step - accuracy: 0.7448 - loss: 0.5241 - val_accuracy: 0.7719 - val_loss: 0.4810
Epoch 4/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 355ms/step - accuracy: 0.7698 - loss: 0.4967 - val_accuracy: 0.7901 - val_loss: 0.4746
Epoch 5/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 376ms/step - accuracy: 0.6514 - loss: 0.6537 - val_accuracy: 0.7096 - val_loss: 0.6065
Epoch 6/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 378ms/step - accuracy: 0.6771 - loss: 0.6017 - val_accuracy: 0.6590 - val_loss: 0.6065
Epoch 7/10
[1m80/80[

<keras.src.callbacks.history.History at 0x7ad7665b99f0>

# Evaluate
### Accuracy, Precision, Recall, F1-Score

In [None]:
# evaluate() returns the loss followed by the compiled metric; only the
# accuracy is reported here.
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy * 100))

# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Score the hard predictions against the true labels.
precision, recall, f1 = (
    metric(y_test, y_pred) for metric in (precision_score, recall_score, f1_score)
)
print('Precision: %.2f' % (precision * 100))
print('Recall: %.2f' % (recall * 100))
print('F1-Score: %.2f' % (f1 * 100))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 162ms/step - accuracy: 0.8246 - loss: 0.4501
Accuracy: 81.22
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 135ms/step
Precision: 76.35
Recall: 89.97
F1-Score: 82.60


# Classify a Test Sentence as Real or Fake

In [None]:
test_sentence = "U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism."
# Apply the same preprocessing and index encoding as the training data;
# words that are not in the training vocabulary are skipped.
processed_sentence = preprocess_text(test_sentence)
sentence_vector = pad_sequences([[word_index[word] for word in processed_sentence if word in word_index]],
                                maxlen=max_length, padding='post', truncating='post')
predicted_label = model.predict(sentence_vector)
# The model outputs a sigmoid probability, not a hard 0/1 label, so comparing
# it to exactly 1 would practically never be true. Use the same 0.5 threshold
# as the evaluation cell; label 1 means fake.
if predicted_label[0][0] > 0.5:
    print("The sentence is classified as: Fake news")
else:
    print("The sentence is classified as: Real news")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
The sentence is classified as: Real news
