In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout



In [None]:
# Load the two labelled news corpora: genuine articles (True.csv) and fabricated ones (Fake.csv).
# Later cells read the 'title' and 'text' columns of both frames.

truenews, fakenews = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/True.csv', '/content/Fake.csv')
)

In [None]:
# Preprocessing
# Report corpus sizes before and after removing articles whose body text repeats.
print(len(truenews))
print(len(fakenews))
# drop_duplicates keeps the first occurrence of each text (the same rows the
# `duplicated('text') == False` mask selected); .copy() gives independent frames so
# the label assignment below no longer triggers pandas' SettingWithCopyWarning.
truenews = truenews.drop_duplicates(subset='text').copy()
fakenews = fakenews.drop_duplicates(subset='text').copy()
print(len(truenews))
print(len(fakenews))


truenews['True/Fake'] = 1  # 'True'
fakenews['True/Fake'] = 0  # 'Fake'
# ignore_index avoids duplicate row labels coming from the two source files.
news = pd.concat([truenews, fakenews], ignore_index=True)
# Join title and body with a space, matching how the Reddit data is prepared for
# inference; without it the last title word fuses with the first body word.
news["Article"] = news["title"] + " " + news["text"]
# sample() returns a new frame, so the result must be assigned for the shuffle to
# take effect. The seed keeps the order reproducible across runs.
news = news.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle 100%
news.head()




21417
23481
21192
17455


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  truenews['True/Fake']=1 #'True'


Unnamed: 0,title,text,subject,date,True/Fake,Article
20081,Hammond says UK 'very close' to deal on EU cit...,BUDAPEST (Reuters) - Britain is very close to ...,worldnews,"September 14, 2017",1,Hammond says UK 'very close' to deal on EU cit...
5933,Peru and Colombia vow to stand with Mexico aft...,LIMA (Reuters) - Peru and Colombia vowed to st...,politicsNews,"January 27, 2017",1,Peru and Colombia vow to stand with Mexico aft...
17095,Why Are We Taking A Huge Number Of Muslim Refu...,John Kerry just announced that America will be...,Government News,"Sep 20, 2015",0,Why Are We Taking A Huge Number Of Muslim Refu...
15284,Singapore detains two for 'terrorism-related' ...,SINGAPORE (Reuters) - Singapore said on Thursd...,worldnews,"November 9, 2017",1,Singapore detains two for 'terrorism-related' ...
16669,EU's Juncker says EU will reach a fair Brexit ...,STRASBOURG (Reuters) - The European Commission...,worldnews,"October 24, 2017",1,EU's Juncker says EU will reach a fair Brexit ...
...,...,...,...,...,...,...
21328,WATCH #BlackLivesMatter Students Panic When As...,"After watching this video, it becomes very cle...",left-news,"Nov 16, 2015",0,WATCH #BlackLivesMatter Students Panic When As...
9570,Trump touts 'great' day in D.C. meeting with R...,WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"May 12, 2016",1,Trump touts 'great' day in D.C. meeting with R...
4880,Republican lawmakers: health plan tax credit c...,WASHINGTON (Reuters) - U.S. Republicans’ propo...,politicsNews,"March 17, 2017",1,Republican lawmakers: health plan tax credit c...
2099,"Amid Trump backlash, his U.N. envoy says stand...",WASHINGTON (Reuters) - U.S. Ambassador to the ...,politicsNews,"August 19, 2017",1,"Amid Trump backlash, his U.N. envoy says stand..."


In [None]:
import nltk

# Fetch the NLTK 'words' corpus that backs the dictionary lookup below
nltk.download('words')

from nltk.corpus import words

# Reference vocabulary: every entry of the NLTK word list, held in a set
english_word_set = {entry for entry in words.words()}

# Combined title/body strings of the news articles to clean
article_array = news["Article"]

# Function to filter words based on the English dictionary
def filter_english_words(article, vocabulary=None):
    """Keep only the tokens of `article` that appear in a vocabulary.

    The text is split on whitespace, each token is lower-cased, and tokens that
    are not members of the vocabulary are dropped.

    Args:
        article: Raw article text. Non-string values (for example NaN produced
            by a missing title or body) are treated as empty text.
        vocabulary: Optional container of allowed words. When omitted, the
            notebook-level `english_word_set` is used.

    Returns:
        The surviving lower-cased tokens joined by single spaces, or an empty
        string when `article` is not a string.
    """
    if not isinstance(article, str):
        return ''
    # Compare against None explicitly so an empty vocabulary is honoured.
    allowed = english_word_set if vocabulary is None else vocabulary
    lowered_tokens = (token.lower() for token in article.split())
    return ' '.join(token for token in lowered_tokens if token in allowed)

# Clean every article in order, producing one filtered string per row
filtered_articles = list(map(filter_english_words, article_array))

# Store the cleaned text next to the original columns
news['Clean Text'] = filtered_articles

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# Tokenization
# Fit the vocabulary on the cleaned articles; num_words caps encoding at the top 10,000 words
tokenizer = Tokenizer(num_words=10000)
clean_texts = news['Clean Text']
tokenizer.fit_on_texts(clean_texts)
# Encode every article as integer ids and bring each sequence to a fixed length of 500
X = pad_sequences(tokenizer.texts_to_sequences(clean_texts), maxlen=500)



In [None]:
# Target labels: 1 for true news, 0 for fake news
y = news.loc[:, 'True/Fake']

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)



In [None]:
# Building the model

class Model(tf.keras.Model):
    """Binary news classifier: Embedding -> bidirectional LSTM -> Dropout -> sigmoid Dense.

    The layer stack is held in `self.model` (a `Sequential`). The forward pass
    returns the sigmoid output, one value in [0, 1] per input sequence, in both
    training and inference mode; callers threshold it themselves.
    """

    def __init__(self):
        super().__init__()
        self.model = Sequential([
            # 10000 and 500 mirror the tokenizer's num_words and the padded
            # sequence length used when the text is encoded.
            Embedding(input_dim=10000, output_dim=64, input_length=500),
            Bidirectional(LSTM(64)),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
            ])

    def call(self, inputs, training=None):
        """Run the wrapped network on a batch of padded token-id sequences.

        Args:
            inputs: Batch of integer sequences as produced by `pad_sequences`.
            training: Keras training flag, forwarded to the wrapped network so
                that Dropout follows the mode Keras requests.

        Returns:
            Sigmoid outputs of shape (batch, 1).
        """
        # Always return raw probabilities. Rounding in inference mode made
        # evaluate()/validation compute binary cross-entropy on hard 0/1 values
        # and made predict() return labels instead of probabilities. The
        # debugging print() that used to live here also ran a second forward pass.
        return self.model(inputs, training=training)


model = Model()



In [None]:
# Compiling the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Training the model for 5 epochs in mini-batches of 64
# NOTE(review): the test split is also passed as validation data, so the per-epoch
# validation metrics are measured on the same samples as the final test score.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)
# Alternative to training: restore previously saved weights
#model.model.load_weights("NN_fake_news_classification.h5")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b8ff9081030>

In [None]:
# Persist the wrapped Sequential network (the `model` attribute of the subclass instance)
trained_network = model.model
trained_network.save("NN_fake_news_classification.keras")



In [None]:
from tensorflow.keras.models import load_model

# Load the model
#model = load_model("NN_fake_news_classification.h5")

In [None]:
# Evaluating the model on the held-out split; evaluate() yields the loss followed by the accuracy metric
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print("Test Accuracy:", accuracy)



Test Accuracy: 0.9507114887237549


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicting on test data and thresholding the model output at 0.5 into boolean predictions
y_pred = model.predict(X_test) > 0.5

# Per-class precision / recall / F1
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix of true labels against predictions
print("Confusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3486
           1       0.93      0.99      0.96      4244

    accuracy                           0.95      7730
   macro avg       0.96      0.95      0.95      7730
weighted avg       0.95      0.95      0.95      7730

Confusion Matrix:
[[3163  323]
 [  58 4186]]


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Load the "before" Reddit export, drop rows lacking a title or a body, and build the
# combined 'Article' text (title, one space, then the body).
reddit_before = (
    pd.read_csv('/content/before_top_100_final.csv', encoding='ISO-8859-1')
    .dropna(subset=['title', 'text'])
    .assign(Article=lambda posts: posts["title"] + " " + posts["text"])
)




In [None]:
# Make sure the NLTK word list is present before it is read
nltk.download('words')

from nltk.corpus import words

# Dictionary used for filtering: the NLTK word list as a set
english_word_set = {entry for entry in words.words()}

# Combined title/body strings of the "before" Reddit posts
article_array = reddit_before["Article"]

# Keep only dictionary words from a piece of text
def filter_english_words(article):
    """Return the lower-cased tokens of `article` that are in `english_word_set`.

    Tokens are obtained by splitting on whitespace and are re-joined with
    single spaces. Any value that is not a `str` yields an empty string.
    """
    if not isinstance(article, str):
        return ''  # Non-string values contribute no text
    kept_tokens = []
    for token in article.split():
        lowered = token.lower()
        if lowered in english_word_set:
            kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Clean every post in order, producing one filtered string per row
filtered_articles = list(map(filter_english_words, article_array))

# Store the cleaned text next to the original columns
reddit_before['Clean Text'] = filtered_articles

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Tokenization: reuse the tokenizer fitted on the news corpus and the same padded length
X_new = tokenizer.texts_to_sequences(filtered_articles)
X_new = pad_sequences(X_new, maxlen=500)

# Predict labels for the new data using the trained model
predictions = model.predict(X_new)

# Threshold at 0.5 and flatten the (n, 1) model output to one boolean per post, so
# a plain 1-D array is stored in the column (True = classified as true news).
predicted_labels = np.asarray(predictions).reshape(-1) > 0.5

# Name kept for compatibility; the values are booleans, not strings
# (the former np.where(..., True, False) call was a no-op).
predicted_labels_str = predicted_labels

# Add the predicted labels to the reddit_before DataFrame
reddit_before["Predicted_Labels"] = predicted_labels

Tensor("model/Round:0", shape=(32, 1), dtype=float32)


In [None]:

# Count how many posts received each predicted label
value_counts = reddit_before['Predicted_Labels'].value_counts()

# .get() falls back to 0 when a label never occurs instead of raising KeyError
count_true = value_counts.get(True, 0)
count_false = value_counts.get(False, 0)

# Print the counts
print("Count of 'True':", count_true)
print("Count of 'False':", count_false)

# Calculate percentages; an empty frame reports 0.0 rather than dividing by zero
total_posts = len(reddit_before)
percent_true = (count_true / total_posts) * 100 if total_posts else 0.0
percent_false = (count_false / total_posts) * 100 if total_posts else 0.0

# Print percentages
print("% True:", percent_true)
print("% False:", percent_false)

Count of 'True': 80
Count of 'False': 16
% True: 83.33333333333334
% False: 16.666666666666664


In [None]:
# Load the "after" Reddit export, drop rows lacking a title or a body, and build the
# combined 'Article' text (title, one space, then the body).
reddit_after = (
    pd.read_csv('/content/after_top_100_final.csv', encoding='ISO-8859-1')
    .dropna(subset=['title', 'text'])
    .assign(Article=lambda posts: posts["title"] + " " + posts["text"])
)


In [None]:
# Make sure the NLTK word list is present before it is read
nltk.download('words')

from nltk.corpus import words

# Dictionary used for filtering: the NLTK word list as a set
english_word_set = {entry for entry in words.words()}

# Combined title/body strings of the "after" Reddit posts
article_array = reddit_after["Article"]

# Keep only dictionary words from a piece of text
def filter_english_words(article):
    """Lower-case `article` and drop every token that is not in `english_word_set`.

    The text is split on whitespace; the remaining tokens are joined with
    single spaces. Values that are not strings produce an empty string.
    """
    if not isinstance(article, str):
        return ''  # Nothing to filter for non-string values
    lowered_tokens = (token.lower() for token in article.split())
    return ' '.join(token for token in lowered_tokens if token in english_word_set)

# Clean every post in order, producing one filtered string per row
filtered_articles = list(map(filter_english_words, article_array))

# Store the cleaned text next to the original columns
reddit_after['Clean Text'] = filtered_articles

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Tokenization: reuse the tokenizer fitted on the news corpus and the same padded length
X_new = tokenizer.texts_to_sequences(filtered_articles)
X_new = pad_sequences(X_new, maxlen=500)

# Predict labels for the new data using the trained model
predictions = model.predict(X_new)

# Threshold at 0.5 and flatten the (n, 1) model output to one boolean per post, so
# a plain 1-D array is stored in the column (True = classified as true news).
predicted_labels = np.asarray(predictions).reshape(-1) > 0.5

# Name kept for compatibility; the values are booleans, not strings
# (the former np.where(..., True, False) call was a no-op).
predicted_labels_str = predicted_labels

# Add the predicted labels to the reddit_after DataFrame
reddit_after["Predicted_Labels"] = predicted_labels

Tensor("model/Round:0", shape=(None, 1), dtype=float32)


In [None]:
# Count how many posts received each predicted label
value_counts = reddit_after['Predicted_Labels'].value_counts()

# .get() falls back to 0 when a label never occurs instead of raising KeyError
count_true = value_counts.get(True, 0)
count_false = value_counts.get(False, 0)

# Print the counts
print("Count of 'True':", count_true)
print("Count of 'False':", count_false)

# Calculate percentages; an empty frame reports 0.0 rather than dividing by zero
total_posts = len(reddit_after)
percent_true = (count_true / total_posts) * 100 if total_posts else 0.0
percent_false = (count_false / total_posts) * 100 if total_posts else 0.0

# Print percentages
print("% True:", percent_true)
print("% False:", percent_false)



Count of 'True': 68
Count of 'False': 20
% True: 77.27272727272727
% False: 22.727272727272727
