<a href="https://colab.research.google.com/github/ZahraShourmeij/Roberta/blob/Transformers-articles/preprocess_data_for_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
pip install tokenizers pandas



In [23]:
!pip install transformers



In [24]:
import pandas as pd
import json
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Read your dataset
df = pd.read_csv("EXIST2021_training.tsv", sep="\t")
# Extract labels from the "task1" column in the test dataset
labels = df["task1"].tolist()

# Tokenize and preprocess each text sample
preprocessed_texts = []
for text in df["text"]:
    # Tokenize the text
    tokens = tokenizer.encode(text)
    # Extract token IDs from each token
    token_ids = [token_id for token_id in tokens]
    # Append preprocessed token IDs to the preprocessed texts list
    preprocessed_texts.append(token_ids)

    # Save the preprocessed texts
with open("preprocessed_dataset.json", "w") as file:
    json.dump(preprocessed_texts, file)


In [25]:
from transformers import RobertaTokenizer
from transformers import RobertaForMaskedLM
from transformers import AdamW

# Now you can use RobertaForMaskedLM class

import torch
from torch.utils.data import DataLoader, TensorDataset
import json

# Load RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load preprocessed token IDs from the JSON file
with open("preprocessed_dataset.json", "r") as file:
    preprocessed_texts = json.load(file)

# Pad or truncate sequences to a fixed length
max_length = 64  # adjust as needed
input_ids = [tokens[:max_length] + [roberta_tokenizer.pad_token_id] * (max_length - len(tokens)) for tokens in preprocessed_texts]

# Convert token IDs into tensors
input_ids = torch.tensor(input_ids)

# Create a TensorDataset
dataset = TensorDataset(input_ids)

# Define batch size and create DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load pre-trained RoBERTa model
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Define optimizer
optimizer = AdamW(roberta_model.parameters(), lr=5e-5)

# Define training loop
num_epochs = 3
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
roberta_model.to(device)

for epoch in range(num_epochs):
    roberta_model.train()
    for batch in dataloader:
        # Move batch to device
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "labels": batch[0]}  # Masked language modeling, predicting the same as input
        optimizer.zero_grad()
        outputs = roberta_model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the trained model
roberta_model.save_pretrained("roberta-trained-model")




In [26]:
# Load the test dataset
test_df = pd.read_csv("EXIST2021_test_labeled.tsv", sep="\t")

# Extract labels from the "task1" column in the test dataset
labels = test_df["task1"].tolist()

# Tokenize and preprocess each text sample in the test dataset
preprocessed_test_texts = []
for text in test_df["text"]:
    tokens = roberta_tokenizer.encode(text, max_length=max_length, truncation=True)
    preprocessed_test_texts.append(tokens)

from torch.nn.utils.rnn import pad_sequence

# Define the maximum sequence length
max_length = 64  # adjust as needed

# Pad or truncate sequences to the maximum length
padded_input_ids_test = pad_sequence([torch.tensor(tokens[:max_length] + [roberta_tokenizer.pad_token_id] * (max_length - len(tokens))) for tokens in preprocessed_test_texts], batch_first=True)

# Create a TensorDataset
test_dataset = TensorDataset(padded_input_ids_test)

# Define batch size and create DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Continue with your inference and evaluation code...

# Set model to evaluation mode
roberta_model.eval()

# Lists to store predictions and labels
predictions = []
labels = []

# Iterate over batches in the test dataset
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {"input_ids": batch[0]}
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=-1)
    predictions.extend(predicted_labels.cpu().numpy())
    # Add actual labels to the list
    labels.extend(batch[0].cpu().numpy())

# Convert token IDs back to text for evaluation (if needed)
# Perform any additional processing or evaluation steps here...

import numpy as np

# Example evaluation: calculate accuracy
correct_predictions = sum(1 for pred, label in zip(predictions, labels) if np.array_equal(pred, label))
accuracy = correct_predictions / len(labels)
print("Accuracy:", accuracy)



Accuracy: 0.0
