In [40]:
#  Importing Libraries

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer        # ignore warnings
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [41]:
# Importing the Dataset

data = pd.read_csv("news.csv")


In [42]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [43]:
# Preprocessing 

''' 
dataset contain 1 un-named col remove that
'''
data = data.drop(["Unnamed: 0"], axis=1)
data.head(10)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [44]:
#  Data Encoding

''' 
It converts the categorical column (label in our case) into numerical values.

le.fit(data['label']): Fits the encoder on the 'label' column to learn the unique categories.
data['label'] = le.transform(data['label']):
Transforms the categorical labels into numerical format (0 for REAL, 1 for FAKE).

'''


le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])

In [45]:
# Variables Setup

''' 
These are some variables needed to be setup for the model training.
'''

embedding_dim = 50
max_length = 54
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
training_size = 3000
test_portion = 0.1


''' 

| **Variable**            | **Purpose**                                                                   |
| ----------------------- | ----------------------------------------------------------------------------- |
| `embedding_dim = 50`    | Sets the size of word vectors (from GloVe). Each word becomes a 50D vector.   |
| `max_length = 54`       | Maximum number of tokens per text (e.g., title). Used for padding/truncating. |
| `padding_type = 'post'` | Adds zeros at the end of sequences shorter than `max_length`.                 |
| `trunc_type = 'post'`   | Cuts off extra tokens at the end of long sequences.                           |
| `oov_tok = "<OOV>"`     | Special token for words not seen during training (Out-Of-Vocabulary).         |
| `training_size = 3000`  | Uses only the first 3000 samples for training/testing (for speed/memory).     |
| `test_portion = 0.1`    | 10% of `training_size` is used as a validation/test set (300 samples).        |

'''

' \n\n| **Variable**            | **Purpose**                                                                   |\n| ----------------------- | ----------------------------------------------------------------------------- |\n| `embedding_dim = 50`    | Sets the size of word vectors (from GloVe). Each word becomes a 50D vector.   |\n| `max_length = 54`       | Maximum number of tokens per text (e.g., title). Used for padding/truncating. |\n| `padding_type = \'post\'` | Adds zeros at the end of sequences shorter than `max_length`.                 |\n| `trunc_type = \'post\'`   | Cuts off extra tokens at the end of long sequences.                           |\n| `oov_tok = "<OOV>"`     | Special token for words not seen during training (Out-Of-Vocabulary).         |\n| `training_size = 3000`  | Uses only the first 3000 samples for training/testing (for speed/memory).     |\n| `test_portion = 0.1`    | 10% of `training_size` is used as a validation/test set (300 samples).        |\n\n'

In [46]:
# Tokenization

''' 
This process divides a large piece of continuous text into distinct units or tokens. 
Here we use columns separately for a temporal basis as a pipeline just for good accuracy.

tokenizer1.fit_on_texts(title): Fits the tokenizer on the 'title' column to create a vocabulary.
pad_sequences(sequences1): Pads the sequences to ensure they all have the same length.

'''

title = []
text = []
labels = []

for x in range(training_size):
    title.append(data['title'][x])
    text.append(data['text'][x])
    labels.append(data['label'][x])

tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index
vocab_size1 = len(word_index1)
sequences1 = tokenizer1.texts_to_sequences(title)
padded1 = pad_sequences(sequences1, padding=padding_type, truncating=trunc_type)


In [47]:


# Splitting Data for Training and Testing

''' 

training_sequences1, test_sequences1: Splits the tokenized and padded data into training and testing sets.
training_labels, test_labels: Splits the corresponding labels into training and testing labels.

'''

split = int(test_portion * training_size)
training_sequences1 = padded1[split: training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

''' 

| Line of Code                                | Purpose                                 | Calculation / Explanation                    |
| ------------------------------------------- | --------------------------------------- | -------------------------------------------- |
| `split = int(test_portion * training_size)` | Calculates how many samples for testing | `split = int(0.1 * 3000) = 300`              |
| `training_sequences1 = padded1[split:]`     | Gets training input sequences           | `padded1[300:3000]` → 2700 samples           |
| `test_sequences1 = padded1[:split]`         | Gets test input sequences               | `padded1[0:300]` → first 300 samples         |
| `test_labels = labels[:split]`              | Gets labels for the test set            | `labels[0:300]` → first 300 labels           |
| `training_labels = labels[split:]`          | Gets labels for the training set        | `labels[300:3000]` → labels for 2700 samples |


'''

' \n\n| Line of Code                                | Purpose                                 | Calculation / Explanation                    |\n| ------------------------------------------- | --------------------------------------- | -------------------------------------------- |\n| `split = int(test_portion * training_size)` | Calculates how many samples for testing | `split = int(0.1 * 3000) = 300`              |\n| `training_sequences1 = padded1[split:]`     | Gets training input sequences           | `padded1[300:3000]` → 2700 samples           |\n| `test_sequences1 = padded1[:split]`         | Gets test input sequences               | `padded1[0:300]` → first 300 samples         |\n| `test_labels = labels[:split]`              | Gets labels for the test set            | `labels[0:300]` → first 300 labels           |\n| `training_labels = labels[split:]`          | Gets labels for the training set        | `labels[300:3000]` → labels for 2700 samples |\n\n\n'

In [48]:
# Reshaping Data for LSTM

''' 
We will be using LSTM(Long Short Term Memory) model for prediction and for that we 
need to reshape padded sequence. We are converting it into np.array() as we need training
and test sequences into NumPy arrays which are required by TensorFlow models.
'''

training_sequences1 = np.array(training_sequences1)
test_sequences1 = np.array(test_sequences1)

In [49]:
# Generating Word Embedding

''' 
Embeddings allows words with similar meanings to have a similar representation.
Here each individual word is represented as real-valued vectors in a predefined 
vector space. For that we will be using glove.6B.50d.txt.

!wget: Downloads the pre-trained GloVe embeddings from the following link.
!unzip: Unzips the downloaded file containing the GloVe embeddings

'''

import os
import zipfile
import urllib.request


# Constants
glove_zip = "glove.6B.zip"
glove_url = "https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip"
glove_file = "glove.6B.50d.txt"  # Make sure this matches your embedding_dim
embedding_dim = 50  # Set according to the GloVe file (50d, 100d, etc.)



if not os.path.exists(glove_zip):
    print("Downloading GloVe vectors....")
    urllib.request.urlretrieve(glove_url, glove_zip)
    print("Downloaded.")

if not os.path.exists(glove_file):
    print("Extracting GloVe vectors...")
    with zipfile.ZipFile(glove_zip,'r') as zip_ref:
        zip_ref.extractall()
    print("Extracted.")


In [51]:

# Step 3: Load embeddings into a dictionary

embedding_index = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs



# Step 4: Create the embedding matrix

embedding_matrix = np.zeros((vocab_size1 + 1 , embedding_dim))
for word , i in word_index1.items():
    if i < vocab_size1:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

print("Embedding matrix created sucessfully...")



Embedding matrix created sucessfully...


In [52]:
#  Model Architecture

'''  

Here we use the TensorFlow embedding technique with Keras Embedding Layer where we map 
original input data into some set of real-valued dimensions.

Embedding: The embedding layer uses pre-trained GloVe embeddings.
Conv1D: A 1D convolutional layer to detect patterns in the text.
LSTM(64): An LSTM layer to capture long-term dependencies in the data.

'''

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1 + 1, embedding_dim, input_length=max_length,
                              weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64,5,activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1,activation='sigmoid')

])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 50)            377600    
                                                                 
 dropout (Dropout)           (None, 54, 50)            0         
                                                                 
 conv1d (Conv1D)             (None, 50, 64)            16064     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 12, 64)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                        

In [54]:

''' 
model architecture is ready we can use this to train our model
'''

#  Training the Model

history = model.fit(
    training_sequences1,
    np.array(training_labels),
    epochs=50,
    validation_data=(test_sequences1, np.array(test_labels)),
    verbose=2
)

Epoch 1/50
85/85 - 9s - loss: 0.6460 - accuracy: 0.6126 - val_loss: 0.5955 - val_accuracy: 0.6633 - 9s/epoch - 108ms/step
Epoch 2/50
85/85 - 1s - loss: 0.5948 - accuracy: 0.6774 - val_loss: 0.5493 - val_accuracy: 0.7067 - 1s/epoch - 13ms/step
Epoch 3/50
85/85 - 1s - loss: 0.5542 - accuracy: 0.7189 - val_loss: 0.5307 - val_accuracy: 0.6867 - 1s/epoch - 13ms/step
Epoch 4/50
85/85 - 1s - loss: 0.5127 - accuracy: 0.7474 - val_loss: 0.4914 - val_accuracy: 0.7200 - 1s/epoch - 13ms/step
Epoch 5/50
85/85 - 1s - loss: 0.4754 - accuracy: 0.7759 - val_loss: 0.4818 - val_accuracy: 0.7367 - 1s/epoch - 12ms/step
Epoch 6/50
85/85 - 1s - loss: 0.4141 - accuracy: 0.8167 - val_loss: 0.4915 - val_accuracy: 0.7533 - 1s/epoch - 12ms/step
Epoch 7/50
85/85 - 1s - loss: 0.3859 - accuracy: 0.8293 - val_loss: 0.4781 - val_accuracy: 0.7700 - 1s/epoch - 12ms/step
Epoch 8/50
85/85 - 1s - loss: 0.3419 - accuracy: 0.8496 - val_loss: 0.4851 - val_accuracy: 0.7700 - 1s/epoch - 12ms/step
Epoch 9/50
85/85 - 1s - loss: 0

In [63]:

# Sample Prediction

X = "trump was president in 2017"

sequences = tokenizer1.texts_to_sequences([X])
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")





This news is True


In [68]:
# Sample Prediction

X = "trump is elected for pressident for africa  "

sequences = tokenizer1.texts_to_sequences([X])
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")

This news is False


In [69]:
import pickle

# Save the model
model.save("fake_news_model.h5")

# Save tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer1, handle, protocol=pickle.HIGHEST_PROTOCOL)
