<a href="https://colab.research.google.com/github/ZahraShourmeij/Roberta/blob/Transformers-articles/Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import json
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, RobertaTokenizer, RobertaForMaskedLM, AdamW
from torch.nn.utils.rnn import pad_sequence

# Load the GPT-2 tokenizer
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

# Read the training dataset
try:
    df_train = pd.read_csv("EXIST2021_training.tsv", sep="\t")
except FileNotFoundError:
    print("Training dataset file not found.")
    exit()

# Extract labels from the "task1" column in the training dataset
labels_train = df_train["task1"].tolist()

# Tokenize and preprocess each text sample in the training dataset
preprocessed_texts_train = []
for text_train in df_train["text"]:
    # Tokenize the text
    tokens_train = tokenizer_gpt2.encode(text_train)
    # Extract token IDs from each token
    token_ids_train = [token_id_train for token_id_train in tokens_train]
    # Append preprocessed token IDs to the preprocessed texts list
    preprocessed_texts_train.append(token_ids_train)

# Save the preprocessed training dataset to a file
with open("preprocessed_train_dataset.json", "w") as file_train:
    json.dump(preprocessed_texts_train, file_train)

# Load RoBERTa tokenizer
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')

# Load preprocessed token IDs from the JSON file
try:
    with open("preprocessed_train_dataset.json", "r") as file_train:
        preprocessed_texts_train = json.load(file_train)
except FileNotFoundError:
    print("Preprocessed training dataset file not found.")
    exit()
#The Transformer-based architectures, such as BERT (Bidirectional Encoder Representations from Transformers) and RoBERTa (Robustly optimized BERT approach), are commonly trained using MLM as one of the pre-training tasks.
# Pad or truncate sequences to a fixed length
max_length = 64  # adjust as needed
input_ids_train = [tokens_train[:max_length] + [tokenizer_roberta.pad_token_id] * (max_length - len(tokens_train)) for tokens_train in preprocessed_texts_train]

# Convert token IDs into tensors
input_ids_train = torch.tensor(input_ids_train)

# Create a TensorDataset
dataset_train = TensorDataset(input_ids_train)

# Define batch size and create DataLoader for the training dataset
#DataLoader sets to shuffle the data before each epoch.
batch_size_train = 32
dataloader_train = DataLoader(dataset_train, batch_size=batch_size_train, shuffle=True)

# Load pre-trained RoBERTa model
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Define optimizer
optimizer = AdamW(roberta_model.parameters(), lr=5e-5)

# Define training loop
num_epochs = 3
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
roberta_model.to(device)

for epoch in range(num_epochs):
    roberta_model.train()
    for batch_train in dataloader_train:
        # Move batch to device
        batch_train = tuple(t.to(device) for t in batch_train)
        inputs_train = {"input_ids": batch_train[0], "labels": batch_train[0]}  # Masked language modeling, predicting the same as input
        optimizer.zero_grad()
        outputs_train = roberta_model(**inputs_train)
        loss_train = outputs_train.loss
        loss_train.backward()
        optimizer.step()

# Save the trained model
roberta_model.save_pretrained("roberta-trained-model")




In [3]:
# Load the test dataset
try:
    test_df = pd.read_csv("EXIST2021_test_labeled.tsv", sep="\t")
except FileNotFoundError:
    print("Test dataset file not found.")
    exit()

# Tokenize and preprocess each text sample in the test dataset
preprocessed_test_texts = []
for text_test in test_df["text"]:
    tokens_test = tokenizer_roberta.encode(text_test, max_length=max_length, truncation=True)
    preprocessed_test_texts.append(tokens_test)

# Save the preprocessed test dataset to a file
with open("preprocessed_test_dataset.json", "w") as file_test:
    json.dump(preprocessed_test_texts, file_test)

# Define DataLoader for the test dataset
try:
    padded_input_ids_test = pad_sequence([torch.tensor(tokens_test[:max_length] + [tokenizer_roberta.pad_token_id] * (max_length - len(tokens_test))) for tokens_test in preprocessed_test_texts], batch_first=True)
except FileNotFoundError:
    print("Preprocessed test dataset file not found.")
    exit()

# Create a TensorDataset for the test dataset
test_dataset = TensorDataset(padded_input_ids_test)

# Define batch size for the test dataset
batch_size_test = 32

# Create DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=batch_size_test)

# Set model to evaluation mode
roberta_model.eval()

# Initialize lists to store predicted labels and actual labels
predicted_labels = []
actual_labels = []
predictions = []  # Initialize the predictions list

# Iterate over batches in the test dataset
for batch_test in test_dataloader:
    batch_test = tuple(t.to(device) for t in batch_test)
    inputs_test = {"input_ids": batch_test[0]}
    with torch.no_grad():
        outputs_test = roberta_model(**inputs_test)
    logits_test = outputs_test.logits
    predicted_labels_test = torch.argmax(logits_test, dim=-1)
    predictions.extend(predicted_labels_test.cpu().numpy())
    # Add actual labels to the list
    actual_labels.extend(batch_test[0].cpu().numpy())


In [8]:
# Evaluate the model
correct_predictions = sum(1 for pred, label in zip(predictions, actual_labels) if (pred == label).all())
accuracy = correct_predictions / len(actual_labels)
print("Accuracy:", accuracy)


Accuracy: 0.011217948717948718
