In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, accuracy_score

Load the pre-trained BERT model and tokenizer:

In [None]:
# The tokenizer and the classifier are both initialised from the same
# pre-trained checkpoint so that vocabulary and weights match.
pretrained_checkpoint = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)
# num_labels=3 sizes the classification head for the three sentiment classes.
model = BertForSequenceClassification.from_pretrained(
    pretrained_checkpoint,
    num_labels=3,
)


Load and preprocess your labeled data:

In [None]:
# Load your labeled data.
# These two entries are placeholders: replace them with your own corpus.
# `sentences` holds the raw text and `labels` the class index of each sentence,
# so both lists must have the same length and every item must be real data
# (a literal `...` inside the lists would crash tokenization and tensor conversion).
sentences = ['Chinese sentence 1', 'Chinese sentence 2']
labels = [0, 1]  # 0 for negative, 1 for neutral, 2 for positive

# Fail early, next to the data, rather than deep inside the training loop.
assert len(sentences) == len(labels), "each sentence needs exactly one label"


Tokenize your sentences and convert them to input IDs:

In [None]:
# Encode each sentence separately; the per-sentence tensors are collected in
# lists here and concatenated into single tensors in a later cell.
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Truncate explicitly so sentences longer than max_length cannot
        # produce tensors of a different width (and no warning is emitted).
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])


Convert your data to PyTorch tensors:

In [None]:
# Join the per-sentence encodings along dimension 0 so each becomes one tensor,
# then convert the Python list of class indices into a tensor as well.
input_ids, attention_masks = (
    torch.cat(per_sentence, dim=0)
    for per_sentence in (input_ids, attention_masks)
)
labels = torch.tensor(labels)


Split your data into training and validation sets:

In [None]:
# Bundle inputs, masks and labels so that one index yields one complete example.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80% of the examples are used for training, the remainder for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so a fresh "Restart & Run All" reproduces the same partition
# (an unseeded split makes validation scores incomparable between runs).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)


Create data loaders for your training and validation sets:

In [None]:
batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in dataset order.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=validation_sampler,
)


Set up the optimizer and learning rate:

In [None]:
# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is deprecated
# in favour of `torch.optim.AdamW`.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,           # small fine-tuning step size (this is NOT the library default)
    eps=1e-8,          # numerical-stability term in the denominator
    weight_decay=0.0,  # keep the former transformers.AdamW default; torch defaults to 0.01
)

Train the model and evaluate it on the validation set after every epoch:

In [None]:
epochs = 4

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Each batch is (input_ids, attention_mask, labels), in the order the
        # TensorDataset was built. Passing labels makes the model return a loss.
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Calculate the average loss for this epoch
    avg_train_loss = train_loss / len(train_dataloader)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []  # ground truth, collected in the same order as val_preds
    with torch.no_grad():
        for batch in validation_dataloader:
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.tolist())
            val_labels.extend(batch[2].tolist())
    # Calculate the average validation loss and the scores for this epoch.
    avg_val_loss = val_loss / len(validation_dataloader)
    # Macro averaging weights the three classes equally, whatever their frequency.
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{epochs} | "
          f"train loss {avg_train_loss:.4f} | "
          f"val loss {avg_val_loss:.4f} | "
          f"val F1 (macro) {val_f1:.4f} | "
          f"val accuracy {val_accuracy:.4f}")


In [None]:
# Save the fine-tuned model
model_dir = "/path/to/save/fine-tuned/model"
model.save_pretrained(model_dir)
# Store the tokenizer files alongside the weights so the directory is a
# self-contained checkpoint that can be reloaded without the original hub name.
tokenizer.save_pretrained(model_dir)


Load the saved model:

In [None]:
# Load the saved model from the same directory it was written to
# (a separately typed path can silently point somewhere else).
model = BertForSequenceClassification.from_pretrained(model_dir)


Tokenize the new headlines and convert them to input IDs:

In [None]:
# Tokenize the new headlines and convert them to input IDs.
# Replace the placeholder entries (including the literal `...`) with real text.
new_headlines = ['New Chinese headline 1', 'New Chinese headline 2', ...]

# Separate names are used so the training tensors `input_ids` and
# `attention_masks` are not overwritten by the inference inputs.
new_input_ids = []
new_attention_masks = []

for sent in new_headlines:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Truncate explicitly so over-long headlines keep the same tensor width.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    new_input_ids.append(encoded_dict['input_ids'])
    new_attention_masks.append(encoded_dict['attention_mask'])
new_input_ids = torch.cat(new_input_ids, dim=0)
new_attention_masks = torch.cat(new_attention_masks, dim=0)


# Create a data loader for the new data (no labels: inference only).
new_dataset = TensorDataset(new_input_ids, new_attention_masks)
new_dataloader = DataLoader(
    new_dataset,
    sampler=SequentialSampler(new_dataset),  # keep predictions aligned with new_headlines
    batch_size=batch_size,
)


Use the model to predict the sentiment of the new headlines:

In [None]:
# Run the model over the new headlines and collect one predicted class index
# per headline, in loader order.
model.eval()  # evaluation mode
new_preds = []
with torch.no_grad():  # gradients are not needed for prediction
    for batch_input_ids, batch_attention_mask in new_dataloader:
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
        )
        # The index of the largest logit in each row is the predicted class.
        new_preds += outputs.logits.argmax(dim=1).tolist()


In [None]:
# Decode the predicted labels.
# The list position is the class index (0 negative, 1 neutral, 2 positive).
# A dedicated name is used so the `labels` tensor holding the training targets
# is not replaced by a list of strings.
label_names = ['Negative', 'Neutral', 'Positive']
predicted_sentiment = [label_names[pred] for pred in new_preds]
print(predicted_sentiment)


In [None]:
# Use the model to predict the sentiment of the validation set, keeping the
# full class-probability vector of every example as well as the predicted class.
model.eval()
val_preds = []
val_probs = []
with torch.no_grad():
    # The validation loader is named `validation_dataloader`;
    # `val_dataloader` is never defined and would raise NameError.
    for batch in validation_dataloader:
        # Labels (batch[2]) are deliberately not passed: only logits are needed.
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        # Softmax over the class dimension turns logits into probabilities.
        probs = torch.nn.functional.softmax(logits, dim=1)
        preds = torch.argmax(logits, dim=1)
        val_probs.extend(probs.tolist())
        val_preds.extend(preds.tolist())


# Get the probability values for the positive class (class index 2)
positive_probs = [prob[2] for prob in val_probs]
