In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

from sklearn.model_selection import train_test_split
# Split the training data into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2)


# concatenate the data
train_data['text'] = train_data['title'] + ' ' + train_data['content'].fillna('')
test_data['text'] = test_data['title'] + ' ' + test_data['content'].fillna('')
val_data['text'] = val_data['title'] + ' ' + val_data['content'].fillna('')

# Preprocess the data
train_data['text'] = train_data['text'].replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")
test_data['text'] = test_data['text'].replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")
val_data['text'] = val_data['text'].replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")

In [3]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")

# Ensure all elements in 'text' column are strings
train_data['text'] = train_data['text'].astype(str)
test_data['text'] = test_data['text'].astype(str)
val_data['text'] = val_data['text'].astype(str)

# Tokenize the data
train_encodings = tokenizer.batch_encode_plus(train_data['text'].tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer.batch_encode_plus(test_data['text'].tolist(), truncation=True, padding=True, max_length=512)
val_data = tokenizer.batch_encode_plus(val_data['text'].tolist(), truncation=True, padding=True, max_length=512)

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def _init_(self, data):
        self.data = data

    def _len_(self):
        return len(self.data)

    def _getitem_(self, idx):
        return self.data[idx]

# Assuming train_data['text'] is a list of text samples
train_dataset = MyDataset(train_data['text'])

# Assuming test_encodings is a list of test data encodings
test_dataset = MyDataset(test_encodings)

# Assuming val_data['text'] is a list of validation text samples
val_dataset = MyDataset(val_data['text'])

In [1]:
# Initialize the model
model = BertForSequenceClassification.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1", num_labels=len(train_data['class'].unique()))

# Create the DataLoaders
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)
validation_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

# Setup the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

EPOCHS = 3

# Total number of training steps
total_steps = len(train_loader) * EPOCHS

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train the model
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

# save the model
model.save_pretrained('model')


NameError: name 'BertForSequenceClassification' is not defined

In [None]:
# Validate the model
model.eval()
for batch in validation_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]

In [None]:
# Predict the test data
model.eval()
predictions = []
for i, batch in enumerate(test_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        predictions.extend(preds.tolist())

# Save to a new CSV file the id of the article and the predictions with an extra newline after each line
with open('predictions_bert_ro.csv', 'w') as f:
    f.write("id,class\n\n")
    for id, pred in zip(test_data['id'], predictions):
        f.write(f"{id},{pred}\n\n")
