In [None]:
import pandas as pd

# Load the datasets
fact_check_post_mapping = pd.read_csv("trial_data_mapping.csv")
fact_checks = pd.read_csv("trial_fact_checks.csv")
posts = pd.read_csv("trial_posts.csv")

# Keep only the posts / fact-checks that appear in the mapping file.
# drop_duplicates() stops an id that is mapped more than once from producing
# duplicate candidate pairs in the cross join below.
posts_expanded = pd.merge(
    fact_check_post_mapping[['post_id']], posts[['post_id', 'text']], on='post_id'
).drop_duplicates()
fact_checks_expanded = pd.merge(
    fact_check_post_mapping[['fact_check_id']], fact_checks[['fact_check_id', 'claim']], on='fact_check_id'
).drop_duplicates()

# Create all possible pairs between posts and fact-checks
all_pairs = pd.merge(posts_expanded, fact_checks_expanded, how='cross')

# Label a pair 1 when the mapping file links the post to the fact-check, else 0.
# The known pairs are collected in a set once so each lookup is O(1) instead of
# re-scanning the whole mapping file for every row.
positive_keys = set(zip(fact_check_post_mapping['post_id'], fact_check_post_mapping['fact_check_id']))
all_pairs['label'] = [
    int(pair in positive_keys)
    for pair in zip(all_pairs['post_id'], all_pairs['fact_check_id'])
]

# Drop rows whose 'text' or 'claim' is missing or blank
all_pairs = all_pairs.dropna(subset=['text', 'claim'])
all_pairs = all_pairs[(all_pairs['text'].str.strip() != '') & (all_pairs['claim'].str.strip() != '')]

positive_examples = all_pairs[all_pairs['label'] == 1]
negative_examples = all_pairs[all_pairs['label'] == 0]

# Balance the dataset: sample as many negatives as there are positives
# (capped at the number of negatives available so sample() cannot raise).
negative_samples = negative_examples.sample(
    n=min(len(positive_examples), len(negative_examples)),
    random_state=42,
)

# `data` is the labelled pair dataset consumed by the tokenization cell below
data = pd.concat([positive_examples, negative_samples])

# Example view of the data
print(data.head())


     post_id                                               text  \
0         30  ('★긴급] 일본 후쿠시마 원전에서 화재 발생...일본 정부, 주변 지역에 긴급대피...   
51        21  ('같은 김정은으로 보이지 않는데.. 지금은 세상에서 일어나는 일들이 마치 연극이나...   
102        7  ('<트랜스휴먼 : 유전자변형, 하이브리드 잡종, 인간개조>\n\n코로나백신 접종은...   
153       44  ('백신 접종을 하지 않은 사람들의 불법적인 봉쇄에 최대의 힘으로 저항하려고 길로 ...   
204       37  ('코로나 바이러스를 이용한 정부의 통제에 반발해서 일어난 루마니아 국민들. 국민들...   

     fact_check_id                                              claim  label  
0               41  ('A massive fire broke out at the Fukushima nu...      1  
51              48  ('This photo shows Kim Jong-un in 2021', 'This...      1  
102             28  ('Covid-19 vaccines alter human DNA', 'Covid-1...      1  
153             12  ('This video shows an anti-lockdown rally in A...      1  
204             31  ('Photo shows anti-coronavirus restrictions pr...      1  


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Convert the labelled pairs to a Hugging Face Dataset.
# preserve_index=False keeps the pandas row index out of the dataset; otherwise
# it is carried along as a spurious '__index_level_0__' column.
dataset = Dataset.from_pandas(data, preserve_index=False)

# Load the mBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenization function for paired input (post + claim)
def tokenize_function(examples):
    """Encode a batch of (post, claim) pairs as paired tokenizer inputs.

    Args:
        examples: Batch mapping with a 'text' entry (social media posts) and a
            'claim' entry (fact-check claims).

    Returns:
        The tokenizer output for the pairs, truncated and padded to 512 tokens.
    """
    # NOTE(review): the sample record printed below shows 'text' and 'claim' as
    # stringified tuples (original, translation, language scores), so the tuple
    # punctuation is tokenized too -- confirm whether that is intended.
    post_texts = examples['text']
    claim_texts = examples['claim']
    encoded = tokenizer(
        post_texts,
        claim_texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return encoded

# Tokenize the whole dataset; batched=True passes lists of rows to
# tokenize_function instead of calling it once per row
dataset = dataset.map(tokenize_function, batched=True)

# Print the first tokenized record as a sanity check
print(dataset[0])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'post_id': 30, 'text': "('★긴급] 일본 후쿠시마 원전에서 화재 발생...일본 정부, 주변 지역에 긴급대피령 선포★\\n', '★Emergency] A fire breaks out at the Fukushima nuclear power plant in Japan...The Japanese government declares an emergency evacuation order in the surrounding area★\\n', [('kor', 1.0)])", 'fact_check_id': 41, 'claim': "('A massive fire broke out at the Fukushima nuclear power plant after the March 16 earthquake in eastern Japan', 'A massive fire broke out at the Fukushima nuclear power plant after the March 16 earthquake in eastern Japan', [('eng', 1.0)])", 'label': 1, '__index_level_0__': 0, 'input_ids': [101, 113, 112, 1861, 70221, 37568, 166, 23130, 10003, 61156, 77901, 9612, 89326, 9993, 36210, 9323, 24017, 119, 119, 119, 23130, 9670, 14646, 117, 9689, 118985, 58939, 10530, 8933, 37568, 14423, 97146, 44220, 9428, 55530, 111744, 165, 182, 112, 117, 112, 1861, 11259, 12371, 11280, 11710, 166, 138, 13559, 68307, 10950, 10160, 10105, 17056, 87004, 20761, 13183, 16430, 10106, 11891, 119, 119, 119, 10117,

In [None]:

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

from transformers import set_seed

# Seed Python / NumPy / torch before the model is created so the randomly
# initialised classification head is reproducible across runs.
set_seed(42)

# Rename 'label' column to 'labels'.
# The guard makes the cell safe to re-run: a second rename would fail because
# the 'label' column no longer exists.
if "label" in dataset.column_names:
    dataset = dataset.rename_column("label", "labels")

# Split dataset into train and test sets (seeded so the split is reproducible)
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Load the mBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- adjust if the installed version rejects this keyword.
training_args = TrainingArguments(
    output_dir="./results",               # Output directory
    evaluation_strategy="epoch",          # Evaluate at the end of each epoch
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=8,        # Batch size for training
    per_device_eval_batch_size=8,         # Batch size for evaluation
    num_train_epochs=3,                   # Number of epochs
    weight_decay=0.01,                    # Weight decay
    logging_dir="./logs",                 # Directory for logging
    logging_steps=10,                     # Log training loss often enough to appear in a 30-step run
    seed=42,                              # Seed used by the Trainer
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.560809
2,No log,0.524147
3,No log,0.501934


TrainOutput(global_step=30, training_loss=0.5112916628519694, metrics={'train_runtime': 1931.9782, 'train_samples_per_second': 0.124, 'train_steps_per_second': 0.016, 'total_flos': 63146653286400.0, 'train_loss': 0.5112916628519694, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset given to the Trainer (train_test['test']).
# No compute_metrics function was supplied, so only the loss and timing
# statistics are reported.
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.5019336938858032, 'eval_runtime': 42.7582, 'eval_samples_per_second': 0.468, 'eval_steps_per_second': 0.07, 'epoch': 3.0}


In [None]:
import torch
from sklearn.model_selection import train_test_split

In [None]:
# Prepare a DataLoader over the held-out split, in batches of 8.
# get_predictions reads the 'text', 'claim' and 'labels' entries of each batch.
test_loader = torch.utils.data.DataLoader(train_test['test'], batch_size=8)


In [None]:
# Function to get predictions from the model
def get_predictions(test_dataset):
    """Run the fine-tuned model over batches and collect predicted and true labels.

    Args:
        test_dataset: Iterable of batches (e.g. a DataLoader). Each batch must
            provide 'text', 'claim' and 'labels' entries.

    Returns:
        A ``(predictions, true_labels)`` tuple of flat lists, where
        ``predictions`` holds the argmax class index for every example.
    """
    model.eval()
    # Run inference on whatever device the model lives on; the Trainer may have
    # moved it to an accelerator, and CPU inputs would then raise a device error.
    device = next(model.parameters()).device
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_dataset:
            inputs = tokenizer(batch['text'], batch['claim'], padding=True, truncation=True, return_tensors="pt")
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            logits = model(**inputs).logits
            predicted_labels = torch.argmax(logits, dim=1).tolist()

            # Append predictions and true labels
            predictions.extend(predicted_labels)
            batch_labels = batch['labels']
            # Accept tensor labels (default DataLoader collation) as well as plain sequences
            true_labels.extend(batch_labels.tolist() if hasattr(batch_labels, 'tolist') else list(batch_labels))

    return predictions, true_labels

In [None]:
# Get hard class predictions (argmax labels, not scores) and the matching true
# labels for every example in the held-out split
predictions, true_labels = get_predictions(test_loader)


In [None]:
# Define evaluation metrics
def mean_reciprocal_rank(y_true, y_pred):
    """Compute the mean reciprocal rank (MRR) over a set of queries.

    Args:
        y_true: Sequence with one collection of relevant ids per query.
        y_pred: Sequence with one ranked list of predicted ids per query,
            best candidate first.

    Returns:
        The mean over queries of ``1 / rank`` of the first relevant id
        (1-based). A query with no relevant id in its ranking contributes 0.
        Returns 0.0 when there are no queries.
    """
    if len(y_true) == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    mrr_total = 0.0
    for true, pred in zip(y_true, y_pred):
        first_hit = next(
            (rank for rank, doc_id in enumerate(pred, start=1) if doc_id in true),
            None,
        )
        if first_hit is not None:
            mrr_total += 1 / first_hit
    return mrr_total / len(y_true)

def precision_at_k(y_true, y_pred, k=1):
    """Compute mean Precision@k over a set of queries.

    Args:
        y_true: Sequence with one collection of relevant ids per query.
        y_pred: Sequence with one ranked list of predicted ids per query,
            best candidate first.
        k: Number of top predictions to consider; must be at least 1.

    Returns:
        The mean over queries of (relevant ids among the top k) / k. A ranking
        shorter than ``k`` is still divided by ``k``. Returns 0.0 when there
        are no queries.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would divide by zero and a negative k would slice from the end.
        raise ValueError("k must be a positive integer")
    if len(y_true) == 0:
        return 0.0

    precision_total = 0.0
    for true, pred in zip(y_true, y_pred):
        relevant_count = len(set(true) & set(pred[:k]))
        precision_total += relevant_count / k
    return precision_total / len(y_true)


In [None]:
def convert_to_ranking(true_labels, predictions):
    """Wrap each true and predicted label in a one-element list.

    The ranking metrics expect one list of ids per example, so every label is
    boxed into a singleton list. Note that these are not real rankings: each
    "ranking" holds exactly one predicted label.

    Args:
        true_labels: Iterable of ground-truth labels.
        predictions: Iterable of predicted labels, aligned with ``true_labels``.

    Returns:
        A ``(ranking_true, ranking_pred)`` tuple of lists of one-element lists.
    """
    aligned = list(zip(true_labels, predictions))
    ranking_true = [[truth] for truth, _ in aligned]
    ranking_pred = [[guess] for _, guess in aligned]
    return ranking_true, ranking_pred

In [None]:
# Box every label into a one-element list, the shape the ranking metrics expect
ranking_true, ranking_pred = convert_to_ranking(true_labels, predictions)


In [None]:
# Compute metrics
# NOTE: every entry of ranking_true / ranking_pred is a one-element list, so
# MRR and Precision@1 both equal the fraction of examples whose predicted label
# matches the true label, and Precision@5 is that same fraction divided by 5.
mrr_score = mean_reciprocal_rank(ranking_true, ranking_pred)
precision_at_1 = precision_at_k(ranking_true, ranking_pred, k=1)
precision_at_5 = precision_at_k(ranking_true, ranking_pred, k=5)

In [None]:
# Report the three scores to four decimal places
print(f"Mean Reciprocal Rank (MRR): {mrr_score:.4f}")
print(f"Precision@1: {precision_at_1:.4f}")
print(f"Precision@5: {precision_at_5:.4f}")

Mean Reciprocal Rank (MRR): 0.8500
Precision@1: 0.8500
Precision@5: 0.1700


**Summary**

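The model labels 85% of the held-out post/fact-check pairs correctly. Because each "ranking" passed to the metric functions contains a single predicted label, MRR and Precision@1 both reduce to classification accuracy here (hence the identical 0.8500); they do not yet measure how well the most relevant fact-check is ranked at the top of a candidate list.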

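However, the lower Precision@5 (0.1700) does not indicate that the quality of additional top results is worse: with one-item rankings it is simply Precision@1 divided by 5 (0.85 / 5 = 0.17). Assessing the quality of additional top results requires scoring every candidate fact-check for a post and ranking the candidates by score.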


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Convert both label sequences to NumPy arrays before scoring
true_labels, predictions = np.array(true_labels), np.array(predictions)

# Calculate Precision, Recall, and F1 Score, in that order
precision, recall, f1 = (
    metric(true_labels, predictions)
    for metric in (precision_score, recall_score, f1_score)
)

# Report each score to four decimal places
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Precision: 0.8750
Recall: 0.7778
F1 Score: 0.8235


**Metrics Explanation**

**Precision: 0.8750**

Precision measures the proportion of true positive results out of all positive predictions made by the model.

Interpretation: A Precision of 0.8750 means that when your model predicts a fact-check as relevant, it is correct 87.5% of the time. This indicates a high level of accuracy in the model's positive predictions.

**Recall: 0.7778**

Recall measures the proportion of true positive results out of all actual positive instances in the dataset.

Interpretation: A Recall of 0.7778 means that your model correctly identifies 77.78% of all relevant fact-checks in the dataset. This indicates that while the model is fairly good at finding relevant fact-checks, there is still room for improvement in capturing all possible relevant cases.

**F1 Score: 0.8235**

F1 Score is the harmonic mean of Precision and Recall. It provides a single metric that balances the trade-off between Precision and Recall.

Interpretation: An F1 Score of 0.8235 indicates a strong balance between Precision and Recall. The model performs well in terms of both identifying relevant fact-checks and minimizing false positives.

**Summary**

Precision: Your model is very accurate when it identifies a fact-check as relevant, with a high proportion of correct positive predictions.

Recall: The model identifies a good portion of all relevant fact-checks, but there are still some relevant cases it might be missing.

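F1 Score: The model achieves a good balance between Precision and Recall, reflecting overall strong performance. Note that the held-out split is small (20% of the 100 labelled pairs), so all three estimates carry substantial uncertainty.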