# Prepare Data


In [None]:
# Shell command: recursively delete /content/sample_data (presumably Colab's
# bundled sample datasets; nothing else in this notebook reads from that path).
!rm -rf /content/sample_data

In [None]:
# Shell command: fetch the dataset from Google Drive by file ID with gdown.
# Per the recorded output below, the file is saved as news-NLP.csv in /content.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 43.4MB/s]


# Import Libraries


In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential

# Prepare Training Data
### Lemmatizer + NLTK

In [None]:
# Fetch the NLTK corpora used later in this notebook: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the downloaded CSV via a relative path (assumes the working directory is
# where gdown saved the file). Later cells read the 'title', 'text' and 'label'
# columns from this frame.
df = pd.read_csv('news-NLP.csv')

In [None]:
# Merge headline and body into one text field. Missing titles/bodies are
# replaced with '' first: str + NaN evaluates to NaN, and a NaN 'content' value
# is not a string and cannot be tokenized downstream.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Encode the target as 1 for "FAKE" and 0 for anything else. The dtype guard
# keeps this cell idempotent: once the column already holds 0/1, comparing it
# with "FAKE" again would silently turn every label into 0 on a re-run.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].eq('FAKE').astype(int)

In [None]:
# English stop words held in a set so the per-token membership test in
# preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` into lowercase, stop-word-free, lemmatized words.

    Args:
        text: Raw document text. Any non-string value (e.g. ``None`` or the
            float NaN pandas uses for a missing CSV cell) is treated as an
            empty document.

    Returns:
        list[str]: Lemmatized tokens in their original order; ``[]`` when the
        input is not a string or contains no usable words.
    """
    # re.sub raises TypeError on non-strings, so bail out early instead of
    # crashing the whole DataFrame.apply on a single missing cell.
    if not isinstance(text, str):
        return []
    # Replace every non-word character (\W) with a space so punctuation acts
    # as a token separator, then normalise case before the stop-word lookup.
    text = re.sub(r'\W', ' ', text).lower()
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

In [None]:
# Run preprocess_text on every combined title+text string; each cell of
# 'processed_content' then holds that function's return value (a token list).
df['processed_content'] = df['content'].apply(preprocess_text)

# Word2Vec Model

In [None]:
# Train word embeddings on the tokenized articles: 100-dimensional vectors,
# a context window of 5, a min_count threshold of 5, and 4 worker threads.
word2vec_model = Word2Vec(
    sentences=df['processed_content'].tolist(),
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
def document_vector(doc):
    """Represent a tokenized document as the mean of its word embeddings.

    Args:
        doc: Iterable of tokens (as produced by ``preprocess_text``).

    Returns:
        numpy.ndarray: 1-D vector of length ``word2vec_model.wv.vector_size``.
        Tokens missing from the Word2Vec vocabulary are ignored; if none
        remain, a zero vector of the same length and dtype as the trained
        embeddings is returned.
    """
    wv = word2vec_model.wv
    # Single pass: filter to in-vocabulary tokens and fetch their vectors.
    vectors = [wv[word] for word in doc if word in wv]
    if not vectors:
        # Derive size/dtype from the model instead of hard-coding 100 so the
        # fallback stays consistent if vector_size is ever changed.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return np.mean(vectors, axis=0)

In [None]:
# Feature matrix: one averaged embedding per document, stacked row-wise.
X = np.array(list(map(document_vector, df['processed_content'])))
# Target vector taken from the 'label' column.
y = df['label'].to_numpy()

# Training With LSTM

In [None]:
# Hold out 20% of the documents. This test split is reserved for the final
# evaluation cell and is deliberately NOT passed to fit() below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM layers expect 3-D input (samples, timesteps, features). Each document is
# a single averaged vector, so it is fed as a sequence of length 1.
X_train = X_train[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

# Declare the input shape with an explicit Input layer; passing input_shape=
# to the first layer is deprecated in current Keras and emits a UserWarning.
model = Sequential([
    Input(shape=X_train.shape[1:]),
    LSTM(128, return_sequences=True),  # keep the sequence for the stacked LSTM
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),    # probability of label 1 ("FAKE")
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Monitor generalisation on 10% of the *training* data (validation_split)
# rather than on the test set, so the test metrics reported later come from
# data the training loop never saw.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

  super().__init__(**kwargs)


Epoch 1/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7442 - loss: 0.5363 - val_accuracy: 0.8792 - val_loss: 0.2781
Epoch 2/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8790 - loss: 0.2860 - val_accuracy: 0.8879 - val_loss: 0.2701
Epoch 3/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8851 - loss: 0.2724 - val_accuracy: 0.8950 - val_loss: 0.2593
Epoch 4/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8955 - loss: 0.2610 - val_accuracy: 0.8990 - val_loss: 0.2518
Epoch 5/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8951 - loss: 0.2634 - val_accuracy: 0.9013 - val_loss: 0.2483
Epoch 6/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8859 - loss: 0.2548 - val_accuracy: 0.8966 - val_loss: 0.2454
Epoch 7/10
[1m159/159[0m 

# Evaluate
### Accuracy, Precision, Recall, F1-Score

In [None]:
# evaluate() returns [loss, accuracy] for the metrics given to compile().
test_loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy * 100))

# y_pred was never defined in this notebook (it only existed as leftover kernel
# state). Derive hard 0/1 labels by thresholding the sigmoid probabilities at
# 0.5 and flatten (n, 1) -> (n,) to match y_test.
y_pred = (model.predict(X_test, verbose=0) > 0.5).astype(int).ravel()

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print('Precision: %.2f' % (precision * 100))
print('Recall: %.2f' % (recall * 100))
print('F1-Score: %.2f' % (f1 * 100))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9091 - loss: 0.2269
Accuracy: 90.61
Precision: 88.74
Recall: 92.83
F1-Score: 90.74


# Classify a Test Sentence as Real or Fake

In [None]:
test_sentence = "This is a sample sentence to check if it is real or fake."
processed_sentence = preprocess_text(test_sentence)
# The model was trained on (samples, timesteps=1, features) input, so the single
# document vector must be shaped (1, 1, features), not (1, features).
sentence_vector = document_vector(processed_sentence).reshape(1, 1, -1)
# predict() returns the sigmoid probability of label 1 ("FAKE") with shape (1, 1).
# Comparing that float with == 1 would practically never match, so threshold it.
fake_probability = float(model.predict(sentence_vector, verbose=0)[0, 0])
predicted_label = int(fake_probability > 0.5)
if predicted_label == 1:
    print("The sentence is classified as: Fake news")
else:
    print("The sentence is classified as: Real news")

In [None]:
test_sentence = "U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism."
processed_sentence = preprocess_text(test_sentence)
# The model was trained on (samples, timesteps=1, features) input, so the single
# document vector must be shaped (1, 1, features), not (1, features).
sentence_vector = document_vector(processed_sentence).reshape(1, 1, -1)
# predict() returns the sigmoid probability of label 1 ("FAKE") with shape (1, 1).
# Comparing that float with == 1 would practically never match, so threshold it.
fake_probability = float(model.predict(sentence_vector, verbose=0)[0, 0])
predicted_label = int(fake_probability > 0.5)
if predicted_label == 1:
    print("The sentence is classified as: Fake news")
else:
    print("The sentence is classified as: Real news")