In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os



  from .autonotebook import tqdm as notebook_tqdm


### ADD Dropout layer

In [2]:
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 3

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            # Add dropout layer
            nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [11]:
# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # adjust num_labels
model = BertClassifier()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load and preprocess data
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename)) as f:
                file_data = json.load(f)
                for article in file_data['articles']:
                    data.append((article['title'] + ' ' + article['content'], file_data['label_text']))
    return data

In [17]:
train_data = load_data('../final_project/datasets/dataset_fake_news_task4/train_json')  
test_data = load_data('../final_project/datasets/dataset_fake_news_task4/dev_json')  

# Convert labels to integers
label_mapping = {'low': 0, 'mixed': 1, 'high': 2}
train_data = [(text, label_mapping[label]) for text, label in train_data]
test_data = [(text, label_mapping[label]) for text, label in test_data]
print(train_data[0])

('Joe Biden’s Lying Anti-Trump Ad is STILL on Twitter: Gets Highest Fake News Rating of “Four Pinocchios” Presidential candidate Joe Biden has a well-known history of lying and plagiarism .\nIt looks like his campaign is getting in on the act with their latest ad that just got the worst fake news rating possible .\nThe Washington Post gave Biden ’ s new ad “ Four Pinocchios ” for “ manipulating video ” to make it appear as though President Trump called the coronavirus a hoax .\nThe Biden campaign cut out over 120 words in between the word “ coronavirus ” and then “ This is their new hoax.\n” ( see transcript below ) In  saying “ coronavirus , ” followed immediately by “ This is their new hoax.\n” What the president was saying is that the Democratic politicization of the coronavirus is a “ hoax ” and NOT the virus itself .\nThe ad goes on to show images and words that are disconnected and made to make it seem like the president said “ The American Dream ” … ” is dead.\n” This is gutter 

#### CSV

In [5]:
# Load training and test data
train_dataset = pd.read_csv('../final_project/datasets/train_dataset.csv')
dev_dataset = pd.read_csv('../final_project/datasets/dev_dataset.csv')

# Preprocess the data
train_data = [(text, label) for text, label in zip(train_dataset['text'], train_dataset['label'])]
test_data = [(text, label) for text, label in zip(dev_dataset['text'], dev_dataset['label'])]


In [6]:
# Tokenize data and convert to tensors
train_encodings = tokenizer([text for text, label in train_data], truncation=True, padding=True, max_length=512)
train_labels = torch.tensor([label for text, label in train_data])
test_encodings = tokenizer([text for text, label in test_data], truncation=True, padding=True, max_length=512)
test_labels = torch.tensor([label for text, label in test_data])


In [7]:
# Create data loaders
train_encodings['input_ids'] = torch.tensor(train_encodings['input_ids'])
train_encodings['attention_mask'] = torch.tensor(train_encodings['attention_mask'])
test_encodings['input_ids'] = torch.tensor(test_encodings['input_ids'])
test_encodings['attention_mask'] = torch.tensor(test_encodings['attention_mask'])
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [12]:
# Calculate class weights
class_counts = np.bincount(train_labels)
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
class_weights = class_weights.to(device)
print(class_weights)

tensor([0.0009, 0.0005, 0.0002], device='cuda:0')


In [13]:
# Define loss function and optimizer
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [14]:
# Bert with dropout and CSV format
epochs = 10
# Train model
for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Epoch {}".format(epoch+1))
    for i, batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': running_loss/(i+1), 'accuracy': 100. * correct / total})

    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.2f}, Accuracy: {epoch_acc:.2f}%")
    # Evaluate model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_acc = 100 * correct / total
    print(f"Test Accuracy: {test_acc:.2f}%")

Epoch 1: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=1.1, accuracy=37.1] 


Epoch 1/10, Loss: 1.10, Accuracy: 37.07%


Epoch 2: 100%|██████████| 534/534 [05:31<00:00,  1.61it/s, loss=1.08, accuracy=44.7]


Epoch 2/10, Loss: 1.08, Accuracy: 44.70%


Epoch 3: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.978, accuracy=51.9]


Epoch 3/10, Loss: 0.98, Accuracy: 51.85%


Epoch 4: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.772, accuracy=64.1]


Epoch 4/10, Loss: 0.77, Accuracy: 64.06%


Epoch 5: 100%|██████████| 534/534 [05:33<00:00,  1.60it/s, loss=0.532, accuracy=73.9]


Epoch 5/10, Loss: 0.53, Accuracy: 73.92%


Epoch 6: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.32, accuracy=85.7] 


Epoch 6/10, Loss: 0.32, Accuracy: 85.74%


Epoch 7: 100%|██████████| 534/534 [05:32<00:00,  1.60it/s, loss=0.169, accuracy=93.9]


Epoch 7/10, Loss: 0.17, Accuracy: 93.92%


Epoch 8: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.115, accuracy=96.5] 


Epoch 8/10, Loss: 0.11, Accuracy: 96.53%


Epoch 9: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.0745, accuracy=98.1]


Epoch 9/10, Loss: 0.07, Accuracy: 98.15%


Epoch 10: 100%|██████████| 534/534 [05:32<00:00,  1.61it/s, loss=0.081, accuracy=97.6] 


Epoch 10/10, Loss: 0.08, Accuracy: 97.56%
Test Accuracy: 45.20%


### Data experiment

In [8]:
import nltk
from nltk.corpus import wordnet
import random

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def get_synonyms(word):
    """
    获取一个词的同义词
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def synonym_replacement(sentence, num_replacement=1):
    """
    在句子中随机选择 num_replacement 个词，并用它们的同义词替换
    """
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in nltk.corpus.stopwords.words('english')]))
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= num_replacement: 
            break

    sentence = ' '.join(new_words)
    return sentence

In [10]:
augmented_data = []
train_json_folder = '../final_project/datasets/dataset_fake_news_task4/train_json'

# Loop through each JSON file in the directory
for filename in os.listdir(train_json_folder):
    if filename.endswith('.json'):
        with open(os.path.join(train_json_folder, filename)) as f:
            file_data = json.load(f)
            for article in file_data['articles']:
                # Apply synonym replacement to the title and content of each article
                title = synonym_replacement(article['title'])
                content = synonym_replacement(article['content'])
                augmented_data.append({'title': title, 'content': content, 'label': file_data['label']})

# Now, augmented_data contains your training data after synonym replacement
# You can write it back to JSON files, or directly use it for training your model
print(f"Number of augmented articles: {len(augmented_data)}")

Number of augmented articles: 8532


In [11]:
# 加载增强后的数据
train_data = augmented_data

# 将标题和内容合并为一个字段
train_data = [(item['title'] + ' ' + item['content'], item['label']) for item in train_data]


In [16]:
print(augmented_data[0])

{'title': 'Joe Biden’s Lying Anti-Trump Ad is STILL on Twitter: Gets mellow Fake News Rating of “Four Pinocchios”', 'content': 'Presidential candidate Joe Biden has a well-known history of lying and plagiarism . It looks like his campaign is getting in on the act with their latest ad that just got the worst fake news rating possible . The Washington Post gave Biden ’ s new ad “ Four Pinocchios ” for “ manipulating video ” to make it appear as though President Trump called the coronavirus a hoax . The Biden campaign cut out over 120 words in between the word “ coronavirus ” and then “ This is their new hoax. ” ( see transcript below ) In saying “ coronavirus , ” followed immediately by “ This is their new hoax. ” What the president was saying is that the Democratic politicization of the coronavirus is a “ hoax ” and NOT the virus itself . The ad goes on to show images and words that are disconnected and made to make it seem like the president said “ The American Dream ” … ” is dead. ” T

### Training

In [12]:
# Tokenize data and convert to tensors
train_encodings = tokenizer([text for text, label in train_data], truncation=True, padding=True, max_length=512)
train_labels = torch.tensor([label for text, label in train_data])
test_encodings = tokenizer([text for text, label in test_data], truncation=True, padding=True, max_length=512)
test_labels = torch.tensor([label for text, label in test_data])


In [13]:
# Create data loaders
train_encodings['input_ids'] = torch.tensor(train_encodings['input_ids'])
train_encodings['attention_mask'] = torch.tensor(train_encodings['attention_mask'])
test_encodings['input_ids'] = torch.tensor(test_encodings['input_ids'])
test_encodings['attention_mask'] = torch.tensor(test_encodings['attention_mask'])
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [12]:
# Define loss function and optimizer
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [15]:
# Bert with dropout and data augment
epochs = 5
# Train model
for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Epoch {}".format(epoch+1))
    for i, batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': running_loss/(i+1), 'accuracy': 100. * correct / total})

    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.2f}, Accuracy: {epoch_acc:.2f}%")
    # Evaluate model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_acc = 100 * correct / total
    print(f"Test Accuracy: {test_acc:.2f}%")

Epoch 1: 100%|██████████| 534/534 [05:59<00:00,  1.49it/s, loss=0.855, accuracy=62.8]


Epoch 1/5, Loss: 0.85, Accuracy: 62.83%


Epoch 2: 100%|██████████| 534/534 [05:57<00:00,  1.49it/s, loss=0.698, accuracy=69]  


Epoch 2/5, Loss: 0.70, Accuracy: 69.03%


Epoch 3: 100%|██████████| 534/534 [05:59<00:00,  1.49it/s, loss=0.458, accuracy=81.7]


Epoch 3/5, Loss: 0.46, Accuracy: 81.69%


Epoch 4: 100%|██████████| 534/534 [06:41<00:00,  1.33it/s, loss=0.253, accuracy=91.4]


Epoch 4/5, Loss: 0.25, Accuracy: 91.39%


Epoch 5: 100%|██████████| 534/534 [05:55<00:00,  1.50it/s, loss=0.131, accuracy=96.4]


Epoch 5/5, Loss: 0.13, Accuracy: 96.39%
Test Accuracy: 57.73%


In [20]:
# Bert with dropout
epochs = 5
# Train model
for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Epoch {}".format(epoch+1))
    for i, batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': running_loss/(i+1), 'accuracy': 100. * correct / total})

    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.2f}, Accuracy: {epoch_acc:.2f}%")
    # Evaluate model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_acc = 100 * correct / total
    print(f"Test Accuracy: {test_acc:.2f}%")

Epoch 1: 100%|██████████| 534/534 [06:17<00:00,  1.41it/s, loss=0.859, accuracy=61.7]


Epoch 1/5, Loss: 0.86, Accuracy: 61.71%


Epoch 2: 100%|██████████| 534/534 [05:37<00:00,  1.58it/s, loss=0.698, accuracy=69.7]


Epoch 2/5, Loss: 0.70, Accuracy: 69.67%


Epoch 3: 100%|██████████| 534/534 [05:37<00:00,  1.58it/s, loss=0.462, accuracy=82.6]


Epoch 3/5, Loss: 0.46, Accuracy: 82.58%


Epoch 4: 100%|██████████| 534/534 [05:37<00:00,  1.58it/s, loss=0.216, accuracy=93.7]


Epoch 4/5, Loss: 0.22, Accuracy: 93.68%


Epoch 5: 100%|██████████| 534/534 [05:37<00:00,  1.58it/s, loss=0.0971, accuracy=97.9]


Epoch 5/5, Loss: 0.10, Accuracy: 97.86%
Test Accuracy: 63.59%


In [20]:
# First Trained result
epochs = 10
# Train model
for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Epoch {}".format(epoch+1))
    for i, batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs.logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': running_loss/(i+1), 'accuracy': 100. * correct / total})

    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.2f}, Accuracy: {epoch_acc:.2f}%")
    # Evaluate model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_acc = 100 * correct / total
    print(f"Test Accuracy: {test_acc:.2f}%")

Epoch 1: 100%|██████████| 534/534 [07:25<00:00,  1.20it/s, loss=0.827, accuracy=62.8]


Epoch 1/10, Loss: 0.83, Accuracy: 62.75%


Epoch 2: 100%|██████████| 534/534 [06:25<00:00,  1.39it/s, loss=0.628, accuracy=73.1]


Epoch 2/10, Loss: 0.63, Accuracy: 73.05%


Epoch 3: 100%|██████████| 534/534 [06:25<00:00,  1.39it/s, loss=0.313, accuracy=88.8]


Epoch 3/10, Loss: 0.31, Accuracy: 88.76%


Epoch 4: 100%|██████████| 534/534 [07:03<00:00,  1.26it/s, loss=0.0869, accuracy=97.9]


Epoch 4/10, Loss: 0.09, Accuracy: 97.93%


Epoch 5: 100%|██████████| 534/534 [07:06<00:00,  1.25it/s, loss=0.0287, accuracy=99.4]


Epoch 5/10, Loss: 0.03, Accuracy: 99.39%


Epoch 6: 100%|██████████| 534/534 [07:04<00:00,  1.26it/s, loss=0.014, accuracy=99.7] 


Epoch 6/10, Loss: 0.01, Accuracy: 99.74%


Epoch 7: 100%|██████████| 534/534 [07:05<00:00,  1.25it/s, loss=0.0105, accuracy=99.8] 


Epoch 7/10, Loss: 0.01, Accuracy: 99.79%


Epoch 8: 100%|██████████| 534/534 [07:13<00:00,  1.23it/s, loss=0.00522, accuracy=99.9]


Epoch 8/10, Loss: 0.01, Accuracy: 99.88%


Epoch 9: 100%|██████████| 534/534 [06:31<00:00,  1.36it/s, loss=0.00264, accuracy=99.9]


Epoch 9/10, Loss: 0.00, Accuracy: 99.94%


Epoch 10: 100%|██████████| 534/534 [05:47<00:00,  1.54it/s, loss=0.00237, accuracy=100] 


Epoch 10/10, Loss: 0.00, Accuracy: 99.96%
Test Accuracy: 61.48%
