In [None]:
# Delete the /content/sample_data directory (presumably Colab's bundled demo
# data -- it is not used anywhere in this notebook). `-f` keeps the command
# quiet when the directory does not exist.
!rm -rf /content/sample_data

In [None]:
# Fetch the dataset from Google Drive by file id. Per the recorded output the
# file is written to /content/news-NLP.csv, which is the CSV read further down.
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 87.0MB/s]


In [None]:
import re

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import FastText
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Download the NLTK corpora used below: the English stop-word list
# (stopwords.words('english')) and WordNet (backing WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Read the downloaded CSV and discard its first column in one expression.
# NOTE(review): the first column is presumably a saved row index -- confirm.
df = pd.read_csv('news-NLP.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

In [None]:
# Encode the target as 1 for "FAKE" and 0 for everything else.
# The dtype guard keeps this cell idempotent: once the column is numeric,
# comparing it to "FAKE" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = (df['label'] == "FAKE").astype('int64')

# Model input is the headline followed by the article body. Missing values are
# replaced with '' first: NaN + str yields NaN, which would later make
# re.sub() in preprocess_text raise a TypeError.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
# Shared text-cleaning resources read as globals by preprocess_text().
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks while filtering tokens.
stop_words = {*stopwords.words('english')}

In [None]:
def preprocess_text(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Every non-word character is replaced by a space, the text is lower-cased
    and split on whitespace, English stop words are dropped, and each
    remaining token is lemmatized.

    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects.

    Args:
        text: The raw text to clean (must be a ``str``).

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    cleaned = re.sub(r'\W', ' ', text).lower()
    tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

In [None]:
# Tokenize every article: each cell of the new column holds a list of tokens.
df['processed_content'] = df['content'].map(preprocess_text)

In [None]:
# Train FastText embeddings on the tokenized articles.
fasttext_model = FastText(
    sentences=df['processed_content'],
    vector_size=100,  # embedding width; the embedding matrix and model reuse it
    window=5,
    min_count=5,      # tokens seen fewer than 5 times are left out of the vocabulary
    workers=4,
    sg=0,             # 0 selects CBOW, 1 would select skip-gram
    epochs=10,
)

In [None]:
# Map every vocabulary word to a positive integer id; id 0 is reserved for
# padding, so ids start at 1.
word_index = {word: idx + 1 for idx, word in enumerate(fasttext_model.wv.key_to_index)}

# Take the embedding width from the trained vectors instead of repeating the
# literal 100, so changing vector_size in the FastText cell cannot cause a
# shape mismatch when the rows are filled in.
embedding_dim = fasttext_model.wv.vector_size

# Row 0 stays all-zero (padding); row `idx` holds the vector of the word with that id.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))  # +1 for padding
for word, idx in word_index.items():
    embedding_matrix[idx] = fasttext_model.wv[word]

In [None]:
def text_to_sequence(text, word_index):
    """Convert a token list into the list of matching vocabulary ids.

    Args:
        text: Iterable of tokens.
        word_index: Mapping from token to integer id.

    Returns:
        list: Ids of the tokens found in ``word_index``, in input order.
        Tokens missing from the mapping are skipped.
    """
    sequence = []
    for token in text:
        if token in word_index:
            sequence.append(word_index[token])
    return sequence

# Encode each token list as vocabulary ids; word_index is passed through
# apply() as the function's second positional argument.
df['sequence'] = df['processed_content'].apply(text_to_sequence, args=(word_index,))

In [None]:
# Fixed input length for the LSTM.
max_seq_len = 200

# Shorter sequences are right-padded with 0 (the reserved padding id).
# NOTE(review): `truncating` is left at the Keras default ('pre'), so longer
# articles keep their LAST 200 tokens -- confirm this is intended.
X = pad_sequences(
    df['sequence'].tolist(),
    padding='post',
    maxlen=max_seq_len,
)
y = df['label'].to_numpy()

In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Binary classifier: frozen FastText embeddings -> LSTM -> dense head.
model = Sequential([
    Embedding(
        # Both dimensions come from the matrix itself so the layer can never
        # disagree with the pretrained weights it is initialised from.
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_seq_len,
        trainable=False  # Freeze embedding layer
    ),
    LSTM(128, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # probability of the positive class (label 1 = FAKE)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The recorded run starts overfitting after a few epochs (validation loss rises
# while training loss keeps falling). Stop once val_loss has not improved for
# 3 epochs and roll back to the best weights, rather than always keeping the
# weights from epoch 20.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    epochs=20,  # upper bound; early stopping usually ends training sooner
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/20




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.6910 - loss: 0.5875 - val_accuracy: 0.7978 - val_loss: 0.4211
Epoch 2/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8517 - loss: 0.3838 - val_accuracy: 0.8393 - val_loss: 0.3844
Epoch 3/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8709 - loss: 0.3242 - val_accuracy: 0.8432 - val_loss: 0.3725
Epoch 4/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8990 - loss: 0.2822 - val_accuracy: 0.8560 - val_loss: 0.3590
Epoch 5/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9214 - loss: 0.2200 - val_accuracy: 0.8531 - val_loss: 0.3785
Epoch 6/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9329 - loss: 0.1978 - val_accuracy: 0.7396 - val_loss: 0.6280
Epoch 7/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
# model.predict returns shape (n_samples, 1); flatten it to (n_samples,) so it
# matches y_test and element-wise comparisons between the two cannot broadcast
# into an (n, n) matrix.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [None]:
# Score the held-out predictions with the four standard binary metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.846093133385951
Precision: 0.8431061806656102
Recall: 0.8471337579617835
F1 Score: 0.8451151707704527


In [None]:
# Persist both artifacts needed for inference.
# The classifier alone is not enough: the inference cell rebuilds word_index
# from the FastText vocabulary via FastText.load("fasttext_model_lstm.bin"),
# and that file did not exist because the FastText model was never saved.
model.save("model_fasttext_lstm.h5")
fasttext_model.save("fasttext_model_lstm.bin")



In [None]:
import pandas as pd
import gensim
from gensim.models import FastText
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
# Restore the trained artifacts from disk.
fasttext_model = FastText.load("fasttext_model_lstm.bin")
model = load_model("model_fasttext_lstm.h5")

# Rebuild the token -> id table from the FastText vocabulary. Ids start at 1
# (0 is the padding id) and must match the mapping used during training.
word_index = {
    word: position
    for position, word in enumerate(fasttext_model.wv.key_to_index, start=1)
}

# Text-cleaning resources, identical to the ones used for training.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Clean and tokenize ``text`` the same way as the training-time definition.

    Non-word characters become spaces, the text is lower-cased and split on
    whitespace, stop words are removed, and the remaining tokens are
    lemmatized. Reads the module-level ``stop_words`` and ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    lowered = re.sub(r'\W', ' ', text).lower()
    kept_tokens = filter(lambda token: token not in stop_words, lowered.split())
    return [lemmatizer.lemmatize(token) for token in kept_tokens]

def text_to_sequence(text, word_index):
    """Look up the id of every token in ``text`` that ``word_index`` knows.

    Args:
        text: Iterable of tokens.
        word_index: Mapping from token to integer id.

    Returns:
        list: Ids in input order; tokens absent from ``word_index`` are dropped.
    """
    known_tokens = (token for token in text if token in word_index)
    return [word_index[token] for token in known_tokens]

def predict_fake_news(news_text):
    """Classify one piece of news text as "FAKE" or "REAL".

    The text is tokenized with ``preprocess_text``, mapped to ids with
    ``text_to_sequence`` and the module-level ``word_index``, post-padded to
    200 ids and scored by the module-level ``model``. The raw score is
    printed before the label is returned.

    Args:
        news_text: Raw news text (a ``str``).

    Returns:
        str: "FAKE" when the model score is above 0.5, otherwise "REAL".
    """
    max_seq_len = 200  # Same max sequence length as in training

    tokens = preprocess_text(news_text)
    token_ids = text_to_sequence(tokens, word_index)
    model_input = pad_sequences([token_ids], maxlen=max_seq_len, padding='post')

    # predict() returns one row per input; take the single score of the single row.
    prediction = model.predict(model_input)[0][0]
    print(prediction)
    return "FAKE" if prediction > 0.5 else "REAL"


# Smoke test: run one hand-written sentence through the full inference path
# (predict_fake_news also prints the raw model score) and report the label.
news_text = "This is a sample sentence to check if it is real or fake."
prediction = predict_fake_news(news_text)
print(f"The news is predicted to be: {prediction}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
0.7883114
The news is predicted to be: FAKE
