In [10]:
# Mount Google Drive 
from google.colab import drive
drive.mount('/content/drive')

# Set Working Directory
%cd /content/drive/MyDrive/dataset/fakenewsnet/

!pip install transformers
!pip install sentencepiece

import warnings
warnings.filterwarnings("ignore")

# Importing Libraries

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Text Processing 
import re
import string
import nltk
from nltk.tokenize import word_tokenize
import regex
from wordcloud import WordCloud
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load an pre-process the data

df_gossipcop_fake = pd.read_csv("gossipcop_fake.csv")
df_gossipcop_real = pd.read_csv("gossipcop_real.csv")
df_politifact_fake = pd.read_csv("politifact_fake.csv")
df_politifact_real = pd.read_csv("politifact_real.csv")

# Generate labels True/Fake under new Target Column 

df_gossipcop_fake['label'] = ['Fake']*len(df_gossipcop_fake)
df_gossipcop_real['label'] = ['Real']*len(df_gossipcop_real)
df_politifact_fake['label'] = ['Fake']*len(df_politifact_fake)
df_politifact_real['label'] = ['Real']*len(df_politifact_real)

# Merge the four separate dataframes, by random mixing into a single dataframe called 'data'

data = pd.concat([df_gossipcop_fake, df_gossipcop_real], axis=0).sample(frac=1).reset_index(drop=True)

# Target column is made of string values True/Fake, let's change it to numbers 0/1 (Fake=0, Real=1)

data['label'] = pd.get_dummies(data.label)['Real']

# Delete columns not needed

data = data.drop(["id", "news_url", "tweet_ids"], axis=1)

# Define the preprocessing function

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Replace 'not' for 'n't'
    text = re.sub(r"(\w+)n\'t", '\g<1> not', text)

    # Remove '@name'
    text = re.sub(r'@\w+', '', text)

    # Remove punctuation except '?'
    text = re.sub(r'[^\w\s\?]', '', text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s\?]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words except 'not' and 'can'
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')
    stop_words.discard('can')
    tokens = [token for token in tokens if token not in stop_words]

    # Remove trailing whitespaces
    tokens = [token.strip() for token in tokens]

    return ' '.join(tokens)


    # Identify the text column(s) to preprocess
text_columns = ['title']

# Preprocess each text column and store the results in new columns
for column in text_columns:
    # Create a new column name for the preprocessed text
    new_column = column + '_preprocessed'

# Apply the preprocessing function to the text column and store the results in the new column
data[new_column] = data['title'].apply(preprocess_text)


import torch
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer
from torch.utils.data import DataLoader, TensorDataset

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert the text data to a list of strings
train_texts = train_data['title_preprocessed'].tolist()
train_labels = train_data['label'].tolist()
val_texts = val_data['title_preprocessed'].tolist()
val_labels = val_data['label'].tolist()


# Load the XLNet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Convert your data to PyTorch tensors
train_texts_tokens = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
train_labels = torch.tensor(train_labels)
val_texts_tokens = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_labels = torch.tensor(val_labels)

# Create PyTorch DataLoader objects for your datasets
train_dataset = TensorDataset(train_texts_tokens['input_ids'], train_texts_tokens['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = TensorDataset(val_texts_tokens['input_ids'], val_texts_tokens['attention_mask'], val_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



from transformers import XLNetForSequenceClassification, AdamW

# Initialize the model
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 5

# Set up the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Set up the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train the model
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Evaluate the model
    from sklearn.metrics import precision_score, recall_score, f1_score
    num_correct = 0
    num_total = 0
    y_true = []
    y_pred = []
    for batch in test_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        num_correct += (predictions == labels).sum().item()
        num_total += labels.size(0)
    
        y_true.extend(labels.tolist())
        y_pred.extend(predictions.tolist())

    accuracy = num_correct / num_total
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 score: {:.4f}'.format(epoch+1, loss.item(), accuracy, precision, recall, f1))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/dataset/fakenewsnet
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and a

Epoch: 1, Loss: 0.5719, Accuracy: 0.8478, Precision: 0.7953, Recall: 0.7683, F1 score: 0.7802
Epoch: 2, Loss: 0.4247, Accuracy: 0.8439, Precision: 0.7843, Recall: 0.7874, F1 score: 0.7858
Epoch: 3, Loss: 0.0362, Accuracy: 0.8455, Precision: 0.8088, Recall: 0.7331, F1 score: 0.7591
Epoch: 4, Loss: 0.3088, Accuracy: 0.8469, Precision: 0.8287, Recall: 0.7173, F1 score: 0.7496
Epoch: 5, Loss: 0.0085, Accuracy: 0.8471, Precision: 0.7973, Recall: 0.7594, F1 score: 0.7751
