CS 7643 Project

Georgia Institute of Technology

Author: Daniel Solon

# Preprocessing Twibot-20 Dataset using RoBERTa embeddings
Based on the paper "A Deep Learning Approach for Robust Detection of Bots in Twitter Using Transformers" by Gutierrez et al., whose best model uses RoBERTa embeddings combined with metadata as the input feature vectors fed to a dense network.

## Import Libraries

In [None]:
import os
import time
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

## Set Device
Set device to CUDA if available, else CPU

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## Generate Tweet Embeddings

Load RoBERTa tokenizer and model

In [None]:
# Load the pretrained "roberta-base" tokenizer and encoder.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")

# Place the encoder on the selected device and switch it to evaluation mode,
# since it is only used here to produce embeddings.
roberta_model = roberta_model.to(device)
roberta_model.eval()

def process_data(data_file_path, batch_size=64):
    """
    Embed every tweet found in a JSON file with RoBERTa.

    The file is loaded, flattened with ``pd.json_normalize``, and its
    ``tweet`` column is exploded so that each tweet becomes its own row.
    Tweets are then tokenized and passed through the model ``batch_size``
    at a time, so only one batch of token ids lives on the device at once.

    :param data_file_path: json file path to be processed into embedding
    :param batch_size: number of tweets tokenized and embedded per forward
        pass; decrease if running into OOM errors
    :return: CPU tensor with one row per tweet (in file order) holding the
        RoBERTa embedding at the CLS token position; length=768. Returns
        None if the file is missing, is not valid JSON, or has no 'tweet'
        column.
    """
    try:
        # Read JSON file
        with open(data_file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading JSON file: {e}")
        return None

    # Flatten JSON structure
    flattened_data = pd.json_normalize(data)

    if "tweet" not in flattened_data:
        print("Error: 'tweet' column not found in data.")
        return None

    # One row per tweet, coerced to str for the tokenizer.
    # NOTE(review): astype(str) also stringifies missing entries, so records
    # without tweets still contribute one placeholder row each - confirm this
    # is the intended alignment for downstream use.
    tweets = flattened_data["tweet"].explode().astype(str).tolist()

    embeddings = []
    with torch.no_grad():
        # Tokenize and embed slice by slice in the main process. Feeding
        # device-resident tensors through DataLoader worker processes is not
        # safe with CUDA, and tokenizing everything up front keeps the whole
        # padded corpus in device memory.
        batch_starts = range(0, len(tweets), batch_size)
        for start in tqdm(batch_starts, desc="Processing Embeddings"):
            encoded = tokenizer(
                tweets[start:start + batch_size],
                truncation=True,
                # Fixed-length padding keeps every batch the same width.
                padding="max_length",
                max_length=128,  # increase if tweets are longer than 128 tokens
                return_tensors="pt",
            ).to(device)
            outputs = roberta_model(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
            )
            # Use CLS token (position 0); move to CPU to free device memory.
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

    # Stack per-batch embeddings into a single tensor
    return torch.cat(embeddings)

Generate embeddings for the datasets

In [None]:
# Embed the test split and time the run.
start_time = time.time()
test_tweet_emb = process_data(data_file_path="../Data/test.json", batch_size=64)

# process_data returns None on failure, so only report when it succeeded.
if test_tweet_emb is not None:
    end_time = time.time()
    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(test_tweet_emb.shape)

In [None]:
# Embed the training split and time the run.
start_time = time.time()
train_tweet_emb = process_data('../Data/train.json', batch_size=64)

# Guard on this cell's own result: process_data returns None on failure, and
# checking a different split's variable would let `.shape` be called on None.
if train_tweet_emb is not None:
    end_time = time.time()
    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(train_tweet_emb.shape)

In [None]:
# Embed the validation (dev) split and time the run.
start_time = time.time()
validate_tweet_emb = process_data('../Data/dev.json', batch_size=64)

# Guard on this cell's own result: process_data returns None on failure, and
# checking a different split's variable would let `.shape` be called on None.
if validate_tweet_emb is not None:
    end_time = time.time()
    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(validate_tweet_emb.shape)

## Save Tweet Embeddings

In [None]:
# Folder that receives the saved embedding tensors.
processed_data_path = r"../Data/Processed_Data"

# Try to create the folder (including missing parents) and report whether it
# was created now or was already present.
try:
    os.makedirs(processed_data_path)
except FileExistsError:
    print(f"Directory already exists: {processed_data_path}")
else:
    print(f"Directory created: {processed_data_path}")

In [None]:
# Save each split's embeddings under processed_data_path, using the file name
# pattern "<split>_tweet_roberta_emb_tensor.pth".
embeddings_by_split = {
    "test": test_tweet_emb,
    "train": train_tweet_emb,
    "validate": validate_tweet_emb,
}

for split_name, split_embeddings in embeddings_by_split.items():
    # process_data returns None on failure; skip that split instead of
    # writing a file that would load back as None.
    if split_embeddings is None:
        print(f"Skipping {split_name}: no embeddings were generated.")
        continue
    output_file = os.path.join(
        processed_data_path, f"{split_name}_tweet_roberta_emb_tensor.pth"
    )
    torch.save(split_embeddings, output_file)