In [None]:
# Install required libraries
# NOTE(review): no versions are pinned here, so each fresh run installs
# whatever releases are current at that moment; consider pinning versions
# if the results need to be reproducible.
!pip install transformers bertviz tensorflow pandas matplotlib scikit-learn

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from bertviz import head_view, model_view
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Mount Google Drive to access datasets
# The CSV files read later, and the model files written later, all live under
# the '/content/drive' mount point created here.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the two source CSVs from the mounted Drive folder
fake_csv_path = '/content/drive/MyDrive/Fake_news_english/Fake.csv'
real_csv_path = '/content/drive/MyDrive/Fake_news_english/True.csv'
fake = pd.read_csv(fake_csv_path)
real = pd.read_csv(real_csv_path)

# Preview the first rows of each frame, each under its own heading
print("Fake News Dataset Head:", fake.head(), sep="\n")

print("\nReal News Dataset Head:", real.head(), sep="\n")


In [None]:
# Visualize how many articles fall under each subject, one chart per dataset
subject_plots = (
    (fake, 'red', 'Distribution of Subjects in Fake News'),
    (real, 'green', 'Distribution of Subjects in Real News'),
)
for news_frame, bar_color, chart_title in subject_plots:
    plt.figure(figsize=(6,4))
    news_frame['subject'].value_counts().plot(kind='barh', color=bar_color)
    plt.title(chart_title)
    plt.xlabel('Count')
    plt.ylabel('Subject')
    plt.show()


In [None]:
# Attach the binary target to each source frame: 0 marks fake, 1 marks real
for news_frame, label_value in ((fake, 0), (real, 1)):
    news_frame['label'] = label_value

# Stack both labelled frames into one dataset with a fresh 0..n-1 index
labeled_frames = [fake, real]
data = pd.concat(labeled_frames, ignore_index=True)
print("Combined Dataset Shape:", data.shape)


In [None]:
# Build the model input: a single 'text' column holding "<title> <body>".
# Missing titles/bodies are replaced with '' first; otherwise NaN would
# propagate through the string concatenation and leave a NaN row that the
# tokenizer cannot encode.
data['text'] = data['title'].fillna('') + ' ' + data['text'].fillna('')

# Keep only the necessary columns. Selecting these two also discards 'title',
# 'subject' and 'date', so no separate drop step is needed.
data = data[['text', 'label']]

# Shuffle the rows (fixed seed for reproducibility) and renumber the index
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print("Processed Data Sample:")
print(data.head())


In [None]:
# Count null entries per column to confirm nothing is missing
missing_per_column = data.isna().sum()
print("Missing values in dataset:", missing_per_column)


In [None]:
# Initialize BERT tokenizer
# Loads the tokenizer for the 'bert-base-uncased' checkpoint; the BERT model
# loaded later in this notebook uses the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Define maximum sequence length
# This value is passed to the tokenizer as max_length together with
# padding='max_length' and truncation=True. The model's Input layers must be
# built with the same sequence length.
MAX_LEN = 256  # You can adjust this based on your computational resources

In [None]:
# Tokenization function
def tokenize_data(texts, labels, max_len=None):
    """Encode raw texts into fixed-length BERT inputs.

    All texts are encoded in a single batched tokenizer call, instead of one
    call per text followed by a concatenation of many single-row tensors.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of labels, one per text, in the same order.
        max_len: Length every sequence is padded/truncated to. Defaults to
            the notebook-level MAX_LEN when omitted.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``: the two tensors
        produced by the tokenizer (one row per text) and ``labels`` converted
        with ``tf.convert_to_tensor``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # The batched tokenizer call needs a real list, not an arbitrary iterable.
    texts = list(texts)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )
    if max_len is None:
        max_len = MAX_LEN

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',
        return_token_type_ids=False
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = tf.convert_to_tensor(labels)

    return input_ids, attention_masks, labels


In [None]:
# Hold out 20% of the rows for testing, stratified on the label column
features, targets = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42, stratify=targets
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


In [None]:
# Tokenize training data
# .tolist() hands tokenize_data plain Python lists rather than pandas Series.
train_input_ids, train_attention_masks, train_labels = tokenize_data(X_train.tolist(), y_train.tolist())

# Tokenize testing data (same call, applied to the held-out split)
test_input_ids, test_attention_masks, test_labels = tokenize_data(X_test.tolist(), y_test.tolist())


In [None]:
# Create TensorFlow datasets
def _as_feature_label_dataset(ids, masks, label_tensor):
    """Slice the tensors into (features dict, label) examples as a tf.data.Dataset."""
    named_inputs = {'input_ids': ids, 'attention_mask': masks}
    return tf.data.Dataset.from_tensor_slices((named_inputs, label_tensor))


# Training examples go through a 10000-element shuffle buffer, then batches of 16
train_dataset = _as_feature_label_dataset(
    train_input_ids, train_attention_masks, train_labels
).shuffle(buffer_size=10000).batch(16)

# Test examples are not shuffled, only batched
test_dataset = _as_feature_label_dataset(
    test_input_ids, test_attention_masks, test_labels
).batch(16)


In [None]:
import tensorflow as tf
from transformers import TFBertModel

# Define the input layers. Their length comes from MAX_LEN, the same constant
# the tokenizer pads/truncates to, so the model input shape and the encoded
# data cannot drift apart when MAX_LEN is changed.
input_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

# Load the pre-trained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Create a custom layer to encapsulate the BERT model call
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a BERT model so it can be called inside a Keras graph.

    Args:
        bert_model: Model object that is invoked as
            ``bert_model(input_ids, attention_mask=..., training=...)``.
        **kwargs: Standard Keras ``Layer`` constructor options (e.g. ``name``),
            forwarded to the base class.
    """

    def __init__(self, bert_model, **kwargs):
        super().__init__(**kwargs)
        self.bert_model = bert_model

    def call(self, inputs, training=None):
        """Run the wrapped model on one batch.

        Args:
            inputs: Two-element sequence ``(input_ids, attention_mask)``.
            training: Keras training flag. It is forwarded to the wrapped
                model so that mode-dependent layers inside it (e.g. dropout)
                can follow the outer model's train/inference mode.

        Returns:
            Whatever the wrapped model returns, unchanged.
        """
        input_ids, attention_mask = inputs
        return self.bert_model(
            input_ids,
            attention_mask=attention_mask,
            training=training,
        )

# Wrap the loaded BERT model so it can be applied to the symbolic inputs
bert_layer = BertLayer(bert_model)

# Run both inputs through the wrapped BERT model
bert_output = bert_layer([input_ids, attention_mask])

# Classification head: pooled output -> dropout -> single sigmoid unit
pooled_output = bert_output.pooler_output
dropout_layer = tf.keras.layers.Dropout(rate=0.3)(pooled_output)
dense_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')(dropout_layer)

# Assemble the end-to-end model and print its layer table
model = tf.keras.Model(
    inputs=[input_ids, attention_mask],
    outputs=dense_layer,
)
model.summary()

# Configure training: Adam at a 2e-5 learning rate, binary cross-entropy loss
model.compile(
    loss='binary_crossentropy',  # Use 'sparse_categorical_crossentropy' for multi-class
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)

In [None]:
# Save the model using the standard Keras method
# NOTE(review): this cell sits before the model.fit(...) cell, so on a
# top-to-bottom run it writes the model as it is before any fine-tuning.
# If the intent is to keep the trained model, save again after training.
model.save("/content/drive/MyDrive/Fake_news_english/tfbert")

# You can also save the weights separately if needed
model.save_weights("/content/drive/MyDrive/Fake_news_english/tfbert_weights.h5")

In [None]:
# Import the necessary module
from tensorflow.keras.callbacks import EarlyStopping

# Define early stopping
# The callback monitors 'val_loss' with patience=2 and
# restore_best_weights=True.
# NOTE(review): a callback only takes effect when it is passed to
# model.fit(..., callbacks=[early_stopping]).
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

In [None]:
# Train for at most 10 epochs of 100 batches each, rather than full passes
# over train_dataset. Validation runs on the whole of test_dataset after
# every epoch.
# NOTE(review): test_dataset doubles as the validation set here, so the
# early-stopping decision is made on the held-out test data; consider carving
# out a separate validation split.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,  # upper bound; early stopping may end training sooner
    steps_per_epoch=100,  # batches drawn from train_dataset per epoch
    callbacks=[early_stopping],  # stop once val_loss stops improving
    verbose=1,
)
