# Prepare Data


In [None]:
# Delete the /content/sample_data directory before fetching the dataset
# (presumably the example files a fresh Colab runtime ships with - confirm).
!rm -rf /content/sample_data

In [None]:
# Download the dataset from Google Drive by file id; according to the cell
# output below it is saved as /content/news-NLP.csv.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 39.3MB/s]


# Import Libraries


In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Prepare Training Data
### Lemmatizer + NLTK

In [None]:
# Fetch the NLTK resources used below: the stop-word lists read by
# stopwords.words('english') and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Read the news dataset, then discard whatever its first column is.
news_csv_path = 'news-NLP.csv'
df = pd.read_csv(news_csv_path)
first_column = df.columns[0]
df = df.drop(labels=first_column, axis='columns')

In [None]:
# Encode the target as 1 for "FAKE" and 0 for anything else.
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it with "FAKE" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = (df['label'] == "FAKE").astype(int)

# Title and body are classified together as one text field.
# A missing title or text would make the concatenation NaN, and preprocess_text
# would then fail because re.sub needs a string, so treat missing parts as ''.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
# Shared NLP helpers used by preprocess_text.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks while filtering tokens.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of cleaned tokens.

    Every non-word character is replaced by a space, the result is
    lower-cased and split on whitespace, tokens found in `stop_words`
    are dropped, and the remaining tokens are passed through
    `lemmatizer.lemmatize`.

    Args:
        text: The string to tokenise.

    Returns:
        A list of lemmatised tokens, in their original order.
    """
    cleaned = re.sub(r'\W', ' ', text).lower()
    tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

In [None]:
# Tokenise every document once; later cells read the token lists from this column.
processed_content = df['content'].apply(preprocess_text)
df['processed_content'] = processed_content

# GloVe Model


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-11-22 16:58:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-22 16:58:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-22 16:58:44--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Map each GloVe token to its vector. Every line of the file is
# "<token> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        token = fields[0]
        embeddings_index[token] = np.asarray(fields[1:], dtype='float32')

In [None]:
embedding_dim = 100  # must match the dimensionality of the loaded GloVe vectors
max_length = 100     # every document is truncated / zero-padded to this many token ids

# Build the vocabulary from all processed documents.
# Ids start at 1 so that id 0 is reserved for padding; with ids starting at 0
# one real word would share its id with the padding value.
# Sorting makes the ids identical on every run (set iteration order for strings
# is not stable across interpreter sessions).
vocabulary = sorted({word for tokens in df['processed_content'] for word in tokens})
word_index = {word: index for index, word in enumerate(vocabulary, start=1)}

# Row i holds the GloVe vector of the word with id i. Row 0 (padding) and the
# rows of words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Every token comes from the same column the vocabulary was built from,
# so each lookup is guaranteed to succeed.
sequences = [[word_index[word] for word in text] for text in df['processed_content']]

# Pre-allocate an integer array so that an empty document cannot turn the
# whole array into floats (np.pad on an empty list yields float64), which
# would make it unusable as an index into embedding_matrix.
padded_sequences = np.zeros((len(sequences), max_length), dtype=np.int64)
for row, seq in enumerate(sequences):
    kept = seq[:max_length]
    if kept:
        padded_sequences[row, :len(kept)] = kept

# Training With Naive Bayes

In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label'], test_size=0.2, random_state=42)


def mean_token_embedding(seq):
    """Average the embedding rows of the real tokens in one padded sequence.

    Positions equal to 0 are the fill value written when the sequence was
    padded. They are skipped so that short documents are not scaled towards
    the padding vector, and so that training features are computed the same
    way as at prediction time (mean over the actual tokens only).
    NOTE: a real token that happens to have id 0 is skipped as well.

    Args:
        seq: 1-D array of token ids, padded with 0.

    Returns:
        A 1-D array with one value per embedding dimension; all zeros when
        the sequence contains no real tokens.
    """
    token_ids = np.asarray(seq, dtype=np.intp)
    token_ids = token_ids[token_ids != 0]
    if token_ids.size == 0:
        # Taking the mean of zero rows would produce NaN features.
        return np.zeros(embedding_matrix.shape[1])
    return embedding_matrix[token_ids].mean(axis=0)


# One fixed-size feature vector per document.
X_train_avg = np.array([mean_token_embedding(seq) for seq in X_train])
X_test_avg = np.array([mean_token_embedding(seq) for seq in X_test])

gnb = GaussianNB()
gnb.fit(X_train_avg, y_train)

y_pred = gnb.predict(X_test_avg)

# Evaluate
### Accuracy, Precision, Recall, F1-Score

In [None]:
# Score the held-out predictions with each metric, in reporting order.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every score as a percentage with two decimals.
report_lines = [
    'Accuracy: %.2f' % (accuracy * 100),
    'Precision: %.2f' % (precision * 100),
    'Recall: %.2f' % (recall * 100),
    'F1-Score: %.2f' % (f1 * 100),
]
print('\n'.join(report_lines))

Accuracy: 72.38
Precision: 71.86
Recall: 72.77
F1-Score: 72.31


# Classify a Test Sentence as Real or Fake

In [None]:
test_sentence = "This is a sample sentence to check if it is real or fake."
processed_sentence = preprocess_text(test_sentence)

# Only words seen while building the vocabulary have a row in embedding_matrix.
known_ids = [word_index[word] for word in processed_sentence if word in word_index]

if not known_ids:
    # The mean over zero rows is NaN, and gnb.predict rejects NaN input,
    # so report the situation instead of crashing.
    print("The sentence contains no words from the training vocabulary; it cannot be classified.")
else:
    # Average the word vectors into a single (1, embedding_dim) feature row.
    sentence_vector = np.mean(embedding_matrix[known_ids], axis=0).reshape(1, -1)
    predicted_label = gnb.predict(sentence_vector)
    if predicted_label[0] == 1:
        print("The sentence is classified as: Fake news")
    else:
        print("The sentence is classified as: Real news")

The sentence is classified as: Fake news


In [None]:
test_sentence = "U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism."
processed_sentence = preprocess_text(test_sentence)

# Only words seen while building the vocabulary have a row in embedding_matrix.
known_ids = [word_index[word] for word in processed_sentence if word in word_index]

if not known_ids:
    # The mean over zero rows is NaN, and gnb.predict rejects NaN input,
    # so report the situation instead of crashing.
    print("The sentence contains no words from the training vocabulary; it cannot be classified.")
else:
    # Average the word vectors into a single (1, embedding_dim) feature row.
    sentence_vector = np.mean(embedding_matrix[known_ids], axis=0).reshape(1, -1)
    predicted_label = gnb.predict(sentence_vector)
    if predicted_label[0] == 1:
        print("The sentence is classified as: Fake news")
    else:
        print("The sentence is classified as: Real news")

The sentence is classified as: Real news
