In [20]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import display

# Set CUDA_LAUNCH_BLOCKING for better error messages
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

def load_datasets(true_path, fake_path):
    true_df = pd.read_csv(true_path)
    fake_df = pd.read_csv(fake_path)
    true_texts = true_df['text'].tolist()
    fake_texts = fake_df['text'].tolist()
    texts = true_texts + fake_texts
    labels = [1] * len(true_texts) + [0] * len(fake_texts)
    return texts, labels

def create_dataloader(texts, tokenizer, max_length, batch_size):
    dataset = TextDataset(texts, tokenizer, max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def generate_text_gpt2(model, tokenizer, prompt, max_new_tokens=50):
    try:
        inputs = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True, max_length=1024, truncation=True)
        attention_mask = torch.ones(inputs.shape)
        
        # Check tensor dimensions and values before moving to device
        if inputs.shape[1] > 1024:
            raise ValueError(f"Input sequence length exceeds the maximum length of 1024 tokens: {inputs.shape[1]}")
        
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        
        outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=max_new_tokens, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error generating text for input: {prompt[:50]}...: {e}")
        return None

def generate_and_add_texts_to_dataset(model, tokenizer, texts, labels, label, max_new_tokens=50, subset_size=100):
    new_texts = []
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(generate_text_gpt2, model, tokenizer, text, max_new_tokens)
            for text in texts[:subset_size]
        ]
        for idx, future in enumerate(as_completed(futures)):
            try:
                generated_text = future.result()
                if generated_text:
                    new_texts.append(generated_text)
                    labels.append(label)
                else:
                    print(f"Skipped text at index {idx} due to generation error.")
            except Exception as e:
                print(f"Error processing text at index {idx}: {e}")
    texts.extend(new_texts)
    return texts, labels

if __name__ == "__main__":
    try:
        # Paths to the datasets
        true_path = '/kaggle/input/fake-news-dataset/True.csv'
        fake_path = '/kaggle/input/fake-news-dataset/Fake.csv'

        # Load datasets
        texts, labels = load_datasets(true_path, fake_path)

        # Initialize tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        max_length = 128

        # Create DataLoader
        batch_size = 16
        dataloader = create_dataloader(texts, tokenizer, max_length, batch_size)

        # Ensure using CPU for debugging
        device = torch.device('cpu')
        print(f"Using device: {device}")

        # Initialize the model
        model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
        model.eval()

        # Process a controlled subset of data
        subset_size =  500 # Process only 100 records for faster execution

        # Generate new texts based on original texts and add them to the dataset
        texts, labels = generate_and_add_texts_to_dataset(model, tokenizer, texts, labels, label=1, max_new_tokens=50, subset_size=subset_size)
        texts, labels = generate_and_add_texts_to_dataset(model, tokenizer, texts, labels, label=0, max_new_tokens=50, subset_size=subset_size)

        # Verify the new dataset size
        print(f"Total texts: {len(texts)}")
        print(f"Total labels: {len(labels)}")

        # Create DataLoader with updated dataset
        dataloader = create_dataloader(texts, tokenizer, max_length, batch_size)

        # Convert to DataFrame to display
        df = pd.DataFrame({'text': texts, 'label': labels})
        display(df.head())  # Display the first few rows of the updated dataset
    except Exception as e:
        print(f"An error occurred: {e}")


Using device: cpu
Error generating text for input: SEATTLE/WASHINGTON (Reuters) - President Donald Tr...: index out of range in self
Skipped text at index 0 due to generation error.
Error generating text for input: NEW YORK (Reuters) - The U.S. Justice Department h...: index out of range in self
Skipped text at index 18 due to generation error.
Error generating text for input: LIMA (Reuters) - Peru’s President Pedro Pablo Kucz...: index out of range in self
Skipped text at index 25 due to generation error.
Error generating text for input:  KING OF PRUSSIA, Pennsylvania/WASHINGTON (Reuters...: index out of range in self
Skipped text at index 31 due to generation error.
Error generating text for input: (Reuters) - The U.S. State Department has told ref...: index out of range in self
Skipped text at index 34 due to generation error.
Error generating text for input: WASHINGTON (Reuters) - The Republican-controlled U...: index out of range in self
Skipped text at index 47 due to generation 

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [27]:
# newly generated data
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Drop blank rows if any
df.dropna(inplace=True)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text']).toarray()

# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# adaboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# Initialize and train AdaBoost classifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)

# Predictions and evaluation
y_pred = ada.predict(X_test)
print("AdaBoost Classification Report:\n", classification_report(y_test, y_pred))


AdaBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      4798
           1       0.98      0.99      0.98      4361

    accuracy                           0.98      9159
   macro avg       0.98      0.98      0.98      9159
weighted avg       0.98      0.98      0.98      9159



In [35]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train GBM classifier
gbm = GradientBoostingClassifier(n_estimators=100)
gbm.fit(X_train, y_train)

# Predictions and evaluation
y_pred = gbm.predict(X_test)
print("GBM Classification Report:\n", classification_report(y_test, y_pred))


GBM Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      4798
           1       0.97      0.99      0.98      4361

    accuracy                           0.98      9159
   macro avg       0.98      0.98      0.98      9159
weighted avg       0.98      0.98      0.98      9159



In [36]:
# XGBM
import xgboost as xgb

# Initialize and train XGBoost classifier
xgb_model = xgb.XGBClassifier(n_estimators=100)
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = xgb_model.predict(X_test)
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred))


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4798
           1       0.98      0.98      0.98      4361

    accuracy                           0.98      9159
   macro avg       0.98      0.98      0.98      9159
weighted avg       0.98      0.98      0.98      9159



In [38]:
# CNN 4 layers
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Tokenizing and padding sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])
X_seq = tokenizer.texts_to_sequences(df['text'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Train-test split for CNN
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Build 4-layer CNN model
model_4 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_4.summary()

# Train the model
model_4.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model_4.evaluate(X_test, y_test)
print(f'4-Layer CNN Accuracy: {accuracy}')


2024-05-30 12:10:33.042788: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 12:10:33.042891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 12:10:33.190491: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/5
[1m 56/916[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 3ms/step - accuracy: 0.5875 - loss: 0.6319

I0000 00:00:1717071079.303440     801 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8985 - loss: 0.2080 - val_accuracy: 0.9707 - val_loss: 0.0896
Epoch 2/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9824 - loss: 0.0576 - val_accuracy: 0.9752 - val_loss: 0.0854
Epoch 3/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9852 - loss: 0.0409 - val_accuracy: 0.9741 - val_loss: 0.0915
Epoch 4/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9897 - loss: 0.0205 - val_accuracy: 0.9728 - val_loss: 0.1294
Epoch 5/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9896 - loss: 0.0198 - val_accuracy: 0.9683 - val_loss: 0.1473
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9666 - loss: 0.1581
4-Layer CNN Accuracy: 0.9665902256965637


In [39]:
# CNN 6 layers
# Build 6-layer CNN model
model_6 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_6.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_6.summary()

# Train the model
model_6.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model_6.evaluate(X_test, y_test)
print(f'6-Layer CNN Accuracy: {accuracy}')


Epoch 1/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8622 - loss: 0.2652 - val_accuracy: 0.9763 - val_loss: 0.0814
Epoch 2/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9818 - loss: 0.0624 - val_accuracy: 0.9737 - val_loss: 0.0901
Epoch 3/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9876 - loss: 0.0443 - val_accuracy: 0.9739 - val_loss: 0.0891
Epoch 4/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9882 - loss: 0.0326 - val_accuracy: 0.9731 - val_loss: 0.0995
Epoch 5/5
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9893 - loss: 0.0251 - val_accuracy: 0.9741 - val_loss: 0.1192
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9689 - loss: 0.1178
6-Layer CNN Accuracy: 0.9694289565086365
