In [1]:
# Install required libraries
!pip install transformers datasets torch torchvision torchaudio

import torch
import pandas as pd
import re
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')



Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

FileNotFoundError: [Errno 2] No such file or directory: '/content//My Drive/Research code/processed_english.txt'

In [5]:

# Define correct paths for data files in Google Drive
base_path = "/content/drive/My Drive/Research code"
english_file = f"{base_path}/processed_english.txt"
kazakh_file = f"{base_path}/processed_kazakh.txt"

# Load processed English and Kazakh text files
with open(english_file, "r", encoding="utf-8") as f:
    english_text = f.read()

with open(kazakh_file, "r", encoding="utf-8") as f:
    kazakh_text = f.read()

# Function to extract tagged financial terms
def extract_terms(text, tag):
    pattern = f"<{tag}>(.*?)</{tag}>"
    return re.findall(pattern, text)

# Extract financial terms from both languages
english_financial_terms = extract_terms(english_text, "finance")
kazakh_financial_terms = extract_terms(kazakh_text, "қаржы")

# Prepare dataset for BERT
all_terms = english_financial_terms + kazakh_financial_terms
labels = [1] * len(english_financial_terms) + [0] * len(kazakh_financial_terms)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Define Dataset class
class FinancialDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create dataset
dataset = FinancialDataset(all_terms, labels, tokenizer)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='weighted'),
        'recall': recall_score(labels, predictions, average='weighted'),
        'f1': f1_score(labels, predictions, average='weighted')
    }

# Training arguments
training_args = TrainingArguments(
    output_dir=f"{base_path}/results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir=f"{base_path}/logs",
    report_to="none",  # Disable WandB logging
    push_to_hub=False  # Prevent uploading to Hugging Face Hub
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Save model to Google Drive
model_path = f"{base_path}/bert_financial_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved successfully to {model_path}!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.179308,0.947368,0.953216,0.947368,0.947663
2,No log,0.009847,1.0,1.0,1.0,1.0
3,No log,0.299357,0.947368,0.953216,0.947368,0.947663
4,No log,0.353521,0.947368,0.953216,0.947368,0.947663
5,No log,0.372479,0.947368,0.953216,0.947368,0.947663
6,No log,0.380957,0.947368,0.953216,0.947368,0.947663
7,No log,0.37802,0.947368,0.953216,0.947368,0.947663
8,No log,0.213178,0.947368,0.953216,0.947368,0.947663
9,No log,0.040933,0.947368,0.953216,0.947368,0.947663
10,No log,0.023463,1.0,1.0,1.0,1.0


Evaluation Results: {'eval_loss': 0.023462897166609764, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.1415, 'eval_samples_per_second': 134.244, 'eval_steps_per_second': 21.196, 'epoch': 10.0}
Model saved successfully to /content/drive/My Drive/Research code/bert_financial_model!
