<a href="https://colab.research.google.com/github/ayushm262003/FinalProj/blob/main/FineTuneMarBho.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [None]:
from google.colab import files
uploaded = files.upload()


Saving probeless_ranking.pkl to probeless_ranking.pkl
Saving mr_ufal-um-train.conllu to mr_ufal-um-train.conllu
Saving linear by ttb probeless to linear by ttb probeless
Saving linear by testing 1001-2000ttb probeless to linear by testing 1001-2000ttb probeless
Saving bho_bhtb-um-test.conllu to bho_bhtb-um-test.conllu


In [None]:
import os
print(os.listdir("/content"))


['.config', 'mr_ufal-um-train.conllu', 'linear by testing 1001-2000ttb probeless', 'linear by ttb probeless', 'probeless_ranking.pkl', 'bho_bhtb-um-test.conllu', 'sample_data']


In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from conllu import parse
import numpy as np

# Function to read .conllu files and extract (word, pos) pairs
def read_conllu(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    parsed_data = parse(data)

    sentences = []
    for sentence in parsed_data:
        word_tag_pairs = [(token["form"], token["upos"]) for token in sentence if token["form"] is not None]
        if word_tag_pairs:  # Only add non-empty sentences
            sentences.append(word_tag_pairs)
    return sentences

# Load Marathi training data and Bhojpuri test data
marathi_train_data = read_conllu("/content/mr_ufal-um-train.conllu")
bhojpuri_test_data = read_conllu("/content/bho_bhtb-um-test.conllu")

# Extract sentences and POS tags
marathi_sentences, marathi_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in marathi_train_data])
bhojpuri_sentences, bhojpuri_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in bhojpuri_test_data])

# Create mapping of POS tags to IDs
unique_tags = list(set(tag for sent in marathi_labels for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# Load BERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(unique_tags))

# Function to encode tags with proper subword handling
def encode_tags(tags, word_ids, tag2id):
    """
    Assign label to the first token of each word, and use -100 for subword tokens or special tokens.
    """
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id[tag] if tag in tag2id else -100)
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Define Dataset class
class POSDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, tag2id):
        self.encodings = tokenizer(list(sentences), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
        self.labels = []
        for i, label in enumerate(labels):
            word_ids = self.encodings.word_ids(batch_index=i)
            self.labels.append(encode_tags(label, word_ids, tag2id))
        self.labels = torch.tensor(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = self.labels[idx]
        return item

# Prepare datasets
train_dataset = POSDataset(marathi_sentences, marathi_labels, tokenizer, tag2id)
test_dataset = POSDataset(bhojpuri_sentences, bhojpuri_labels, tokenizer, tag2id)

# Define training parameters (WandB logging disabled via report_to=[])
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",
    report_to=[]  # Disable WandB logging
)

# Define Trainer with evaluation dataset provided
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate on Bhojpuri test data
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8)

predicted_labels = []
true_labels = []

for batch in test_loader:
    with torch.no_grad():
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    # Iterate sentence by sentence in the batch
    for pred_sentence, true_sentence in zip(predictions, batch["labels"].tolist()):
        # Iterate token by token in the sentence
        for pred_token, true_token in zip(pred_sentence, true_sentence):
            if true_token != -100:
                predicted_labels.append(id2tag[pred_token])
                true_labels.append(id2tag[true_token])

print(classification_report(true_labels, predicted_labels, zero_division=0))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,2.3906,2.438685


Epoch,Training Loss,Validation Loss
1,2.3906,2.438685
2,1.3605,2.008405
3,0.807,1.81622


              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        47
         ADP       0.85      0.39      0.53       147
         ADV       0.04      0.67      0.08         3
         AUX       0.00      0.00      0.00        34
       CCONJ       0.00      0.00      0.00        21
         DET       0.33      0.02      0.04        47
        NOUN       0.46      0.85      0.60       286
         NUM       0.00      0.00      0.00        33
        PART       0.00      0.00      0.00        16
        PRON       0.27      0.11      0.15        37
       PROPN       0.00      0.00      0.00       110
       PUNCT       0.88      0.93      0.91       121
       SCONJ       0.00      0.00      0.00        16
        VERB       0.39      0.81      0.53        95
           _       0.00      0.00      0.00         0

    accuracy                           0.49      1013
   macro avg       0.22      0.25      0.19      1013
weighted avg       0.42   

In [None]:
# Save the model and tokenizer to a directory
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, Trainer, TrainingArguments
from pathlib import Path
from load_subset import load_top_neurons
import consts
from dataHandler import UMDataHandler
from argparse import ArgumentParser

# Custom BERT Model Using Selected Neurons
class SubsetBERT(nn.Module):
    def _init_(self, bert_model, selected_neurons, num_labels):
        super(SubsetBERT, self)._init_()
        self.bert = AutoModel.from_pretrained(bert_model)
        self.selected_neurons = selected_neurons  # Dictionary {layer: neuron_indices}
        self.classifier = nn.Linear(sum(len(v) for v in selected_neurons.values()), num_labels)

    def forward(self, input_embeds=None, attention_mask=None, token_type_ids=None, labels=None):
        # Ensure input_embeds has correct shape [batch_size, seq_length, hidden_dim]
        if input_embeds is not None and input_embeds.dim() == 1:
            input_embeds = input_embeds.unsqueeze(0).unsqueeze(0)  # Convert [768] → [1, 1, 768]

        outputs = self.bert(inputs_embeds=input_embeds, output_hidden_states=True)
        hidden_states = outputs.hidden_states

        # Select specific neurons while keeping sequence length
        selected_features = []
        for layer, indices in self.selected_neurons.items():
            selected_features.append(hidden_states[layer][:, :, indices])

        selected_features = torch.cat(selected_features, dim=-1)  # Concatenate along last dim
        selected_features = selected_features.mean(dim=1)  # Reduce sequence dimension if needed
        logits = self.classifier(selected_features)

        if labels is not None:
            labels = labels.view(-1)  # Ensure labels have shape [batch_size]
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return loss, logits  # Ensure Trainer API gets loss

        return logits

# Fine-tuning function with Trainer API
def fine_tune_with_trainer(train_path, model_type, language, attribute, layer, subset_size=1000, epochs=1, save_path=""):
    print("\n🔹 Loading Dataset...")
    data_handler = UMDataHandler(train_path, 'UM', model_type, layer, language=language, attribute=attribute)
    data_handler.create_dicts()
    dataloader = data_handler.get_dataloader(batch_size=32)
    dataset = data_handler.get_dataset()
    print(f"✅ Dataset Loaded: {len(dataset)} samples")

    print("\n🔹 Loading Selected Neurons...")
    model_name = consts.model_names[model_type]
    num_labels = len(set([sample[1] for sample in dataloader.dataset]))
    selected_neurons = load_top_neurons(Path('pickles', 'UM', model_type, language, attribute, 'probeless_ranking.pkl'), subset_size)
    print(f"✅ Selected {len(selected_neurons)} layers from {model_name}")

    print("\n🔹 Initializing Model...")
    model = SubsetBERT(model_name, selected_neurons, num_labels)

    # Print batch shapes for debugging
    for batch in dataset:
        for key, value in batch.items():
            print(f"{key}: {value.shape}")
        break

    print("\n🔹 Setting Training Arguments...")
    training_args = TrainingArguments(
        output_dir=str(save_path),
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    print("\n🔹 Initializing Trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    print("\n🚀 Starting Fine-Tuning...")
    trainer.train()
    print("✅ Fine-Tuning Complete!")

    if save_path:
        model_filename = save_path / "best_model.pth"
        torch.save(model.state_dict(), model_filename)
        print(f"📌 Best model saved to {model_filename}")
    else:
        print("📌 No save path provided")

    print("\n🏁 Training Finished!")
    return model

# Main function
if _name_ == "_main_":
    parser = ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-language', type=str, required=True)
    parser.add_argument('-attribute', type=str, required=True)
    parser.add_argument('-layer', type=int, required=True)
    parser.add_argument('--control', default=False, action='store_true')
    args = parser.parse_args()

    print("\n🔹 Parsing Arguments...")
    model_type = args.model
    language = args.language
    attribute = args.attribute
    layer = args.layer
    control = args.control
    control_str = '_control' if control else ''

    save_path = Path('pickles', 'UM', model_type, language, attribute)
    train_path = Path('pickles', 'UM', model_type, language, 'train_parsed.pkl')

    if not save_path.exists():
        save_path.mkdir(parents=True, exist_ok=True)
    print(f"✅ Save Path: {save_path}")
    print(f"✅ Training Data Path: {train_path}")

    # Fine-tune with Trainer API
    fine_tune_with_trainer(train_path, model_type, language, attribute, layer, epochs=1, save_path=save_path)


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [None]:
# Install required packages (if not already installed)
!pip install transformers conllu scikit-learn

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import pickle
from conllu import parse_incr  # For parsing CoNLL-U files
from pathlib import Path
from sklearn.metrics import classification_report
import numpy as np

##############################################
# 1. Load the top 50 neuron indices from probeless_ranking.pkl
##############################################
pkl_path = "/content/probeless_ranking.pkl"  # Ensure this file is in your working directory
with open(pkl_path, "rb") as f:
    ranking = pickle.load(f)

# Select the top 50 indices (starting from the first value)
top_50_indices = ranking[:50]
print("Top 50 global neuron indices:", top_50_indices)

# Convert global indices to a dictionary mapping layer -> list of neuron indices.
# Total neurons: 9984 = 13 layers * 768. In Hugging Face, hidden_states has 13 elements
# (0: embeddings, 1-12: transformer layers).
hidden_dim = 768
selected_neurons = {}
for idx in top_50_indices:
    # Map the global index to a layer.
    # Here layer 0 corresponds to the embedding output.
    layer = idx // hidden_dim
    neuron_idx = idx % hidden_dim
    if layer not in selected_neurons:
        selected_neurons[layer] = []
    selected_neurons[layer].append(neuron_idx)

print("Selected neurons by layer:", selected_neurons)

##############################################
# 2. Define functions to read CoNLL-U files and build the POS tag datasets
##############################################
def read_conllu(file_path):
    """Reads a CoNLL-U file and returns a list of (words, pos_tags) for each sentence."""
    sentences = []
    with open(file_path, "r", encoding="utf-8") as data_file:
        for tokenlist in parse_incr(data_file):
            words = []
            pos_tags = []
            for token in tokenlist:
                # Only consider tokens with a non-None form.
                if token["form"] is not None:
                    words.append(token["form"])
                    # Use UPOS tag from the token; if missing, you can assign a default label.
                    pos_tags.append(token.get("upos", "X"))
            if words:
                sentences.append((words, pos_tags))
    return sentences

# Read training and test data (assumes POS tags are in the "upos" field)
train_data = read_conllu("/content/mr_ufal-um-train.conllu")
test_data  = read_conllu("/content/bho_bhtb-um-test.conllu")

# Separate words and tags
train_sentences, train_tags = zip(*train_data)
test_sentences, test_tags   = zip(*test_data)

# Build tag mappings from training data
unique_tags = list(set(tag for sent in train_tags for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

print("Unique tags:", unique_tags)
print("Number of labels:", len(unique_tags))

##############################################
# 3. Define a function to encode tags for token classification
##############################################
def encode_tags(tags, word_ids, tag2id):
    """
    Assign label to the first token of each word, and -100 to subword or special tokens.
    """
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id[tag])
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

##############################################
# 4. Define a Dataset class for token classification using CoNLL-U data
##############################################
class TokenClassificationDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, tag2id, max_length=128):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(list(sentences), is_split_into_words=True,
                                   padding=True, truncation=True, max_length=max_length,
                                   return_tensors="pt")
        all_labels = []
        for i, tag_seq in enumerate(tags):
            word_ids = self.encodings.word_ids(batch_index=i)
            encoded_tags = encode_tags(tag_seq, word_ids, tag2id)
            all_labels.append(encoded_tags)
        # Pad labels manually if needed (tokenizer already padded input_ids)
        self.labels = all_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

##############################################
# 5. Define the custom SubsetBERT model for token classification
##############################################
class SubsetBERT(nn.Module):
    def __init__(self, bert_model_name, selected_neurons, num_labels):
        super(SubsetBERT, self).__init__()
        # Load the pre-trained BERT model and request hidden states.
        # hidden_states will be a tuple of length 13 (0: embeddings, 1-12: transformer layers)
        self.bert = AutoModel.from_pretrained(bert_model_name, output_hidden_states=True)
        self.selected_neurons = selected_neurons  # Dictionary: {layer: [indices]}
        # The input dimension for each token is the sum of selected neurons across layers.
        input_dim = sum(len(indices) for indices in selected_neurons.values())
        # For token classification, classifier outputs [batch, seq_len, num_labels]
        self.classifier = nn.Linear(input_dim, num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        hidden_states = outputs.hidden_states
        selected_features = []
        for layer, indices in self.selected_neurons.items():
            # hidden_states[layer] has shape: [batch_size, seq_length, hidden_dim]
            layer_features = hidden_states[layer][:, :, indices]
            selected_features.append(layer_features)
        # Concatenate along the hidden dimension: shape [batch, seq_len, total_selected_neurons]
        selected_features = torch.cat(selected_features, dim=-1)
        logits = self.classifier(selected_features)  # [batch, seq_len, num_labels]

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            # Flatten the tokens
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
            return loss, logits
        return logits

##############################################
# 6. Prepare the tokenizer, datasets, and model
##############################################
bert_model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

train_dataset = TokenClassificationDataset(train_sentences, train_tags, tokenizer, tag2id, max_length=128)
test_dataset  = TokenClassificationDataset(test_sentences, test_tags, tokenizer, tag2id, max_length=128)

num_labels = len(unique_tags)
print("Number of labels for token classification:", num_labels)

# Initialize the custom model for token classification
model = SubsetBERT(bert_model_name, selected_neurons, num_labels)
print("Custom SubsetBERT model (for token classification) initialized.")

##############################################
# 7. Set up the Trainer API and start training
##############################################
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=[],  # Disable WandB logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("Starting training ...")
trainer.train()
print("Training completed.")

##############################################
# 8. Save the fine-tuned model
##############################################
model_save_path = Path("./fine_tuned_subsetbert")
model_save_path.mkdir(exist_ok=True)
torch.save(model.state_dict(), model_save_path / "best_model.pth")
print(f"Model saved to {model_save_path / 'best_model.pth'}")

##############################################
# 9. Evaluate on the test set and print classification report
##############################################
# Run predictions on the test set
predictions_output = trainer.predict(test_dataset)
# predictions_output.predictions shape: [num_samples, seq_len, num_labels]
pred_logits = predictions_output.predictions
true_labels = predictions_output.label_ids

# Convert logits to predicted labels (for each token)
pred_labels = np.argmax(pred_logits, axis=-1)

# Gather predictions and true labels (ignoring label -100)
all_pred_tags = []
all_true_tags = []
for i in range(len(true_labels)):
    for j in range(len(true_labels[i])):
        if true_labels[i][j] != -100:
            all_true_tags.append(id2tag[true_labels[i][j]])
            all_pred_tags.append(id2tag[pred_labels[i][j]])

# Print classification report
report = classification_report(all_true_tags, all_pred_tags, zero_division=0)
print("\nClassification Report:\n")
print(report)


Top 50 global neuron indices: [8675, 7907, 6371, 7139, 5603, 4821, 8426, 9194, 3108, 5589, 2499, 7658, 5867, 4860, 6890, 4092, 5628, 8356, 3324, 7588, 3961, 6052, 6820, 1471, 3285, 3337, 3734, 1457, 6635, 2326, 9124, 804, 1166, 398, 5412, 5171, 2569, 7809, 2340, 3193, 3215, 6396, 2556, 4729, 3748, 1572, 1558, 5186, 6875, 3173]
Selected neurons by layer: {11: [227, 746, 676], 10: [227, 746, 676, 129], 8: [227, 746, 676, 491, 252, 731], 9: [227, 746, 676], 7: [227, 213, 491, 252, 676, 36], 6: [213, 252, 563, 121, 578], 4: [36, 252, 213, 265, 662, 121, 143, 676, 101], 3: [195, 22, 265, 36, 252], 5: [252, 121], 1: [703, 689, 36, 398], 0: [398], 2: [36, 22]}
Unique tags: ['PUNCT', 'INTJ', 'DET', 'CCONJ', 'SCONJ', 'AUX', 'X', 'PRON', 'ADV', 'NOUN', 'VERB', 'PART', 'NUM', 'PROPN', 'ADJ', 'ADP', '_']
Number of labels: 17
Number of labels for token classification: 17
Custom SubsetBERT model (for token classification) initialized.
Starting training ...




Epoch,Training Loss,Validation Loss
1,No log,1.859638
2,1.442700,1.983034
3,0.515400,2.140307
4,0.275400,2.057409
5,0.180600,2.134906


Training completed.
Model saved to fine_tuned_subsetbert/best_model.pth



Classification Report:

              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        45
         ADP       0.71      0.32      0.44       146
         ADV       0.00      0.00      0.00         3
         AUX       0.00      0.00      0.00        34
       CCONJ       0.00      0.00      0.00        21
         DET       0.09      0.02      0.03        47
        NOUN       0.50      0.82      0.62       284
         NUM       0.00      0.00      0.00        33
        PART       0.00      0.00      0.00        16
        PRON       0.22      0.11      0.15        37
       PROPN       0.06      0.01      0.02        99
       PUNCT       0.88      0.85      0.87       115
       SCONJ       0.00      0.00      0.00        16
        VERB       0.47      0.64      0.54        94
           _       0.00      0.00      0.00         0

    accuracy                           0.45       990
   macro avg       0.20      0.19      0.18       990
w

In [None]:
from google.colab import files
uploaded = files.upload()  # Select your probeless_ranking.pkl file when prompted.



KeyboardInterrupt: 

In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from conllu import parse
import numpy as np

# Function to read .conllu files and extract (word, pos) pairs
def read_conllu(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    parsed_data = parse(data)

    sentences = []
    for sentence in parsed_data:
        word_tag_pairs = [(token["form"], token["upos"]) for token in sentence if token["form"] is not None]
        if word_tag_pairs:  # Only add non-empty sentences
            sentences.append(word_tag_pairs)
    return sentences

# Load Marathi training data and Bhojpuri test data
marathi_train_data = read_conllu("/content/mr_ufal-um-train.conllu")
bhojpuri_test_data = read_conllu("/content/bho_bhtb-um-test.conllu")

# Extract sentences and POS tags
marathi_sentences, marathi_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in marathi_train_data])
bhojpuri_sentences, bhojpuri_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in bhojpuri_test_data])

# Create mapping of POS tags to IDs
unique_tags = list(set(tag for sent in marathi_labels for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# Load BERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(unique_tags))

# 🔄 Freeze all mBERT parameters (only classifier head will be trained)
for param in model.bert.parameters():
    param.requires_grad = False

# Function to encode tags with proper subword handling
def encode_tags(tags, word_ids, tag2id):
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id[tag] if tag in tag2id else -100)
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Define Dataset class
class POSDataset(Dataset):
    def _init_(self, sentences, labels, tokenizer, tag2id):
        self.encodings = tokenizer(list(sentences), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
        self.labels = []
        for i, label in enumerate(labels):
            word_ids = self.encodings.word_ids(batch_index=i)
            self.labels.append(encode_tags(label, word_ids, tag2id))
        self.labels = torch.tensor(self.labels)

    def _len_(self):
        return len(self.labels)

    def _getitem_(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = self.labels[idx]
        return item

# Prepare datasets
train_dataset = POSDataset(marathi_sentences, marathi_labels, tokenizer, tag2id)
test_dataset = POSDataset(bhojpuri_sentences, bhojpuri_labels, tokenizer, tag2id)

# Define training parameters
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to=[]  # Disable WandB logging
)

# Define Trainer with evaluation dataset provided
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model (only classifier head will be trained)
trainer.train()

# Evaluate on Bhojpuri test data
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8)

predicted_labels = []
true_labels = []

for batch in test_loader:
    with torch.no_grad():
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    for pred_sentence, true_sentence in zip(predictions, batch["labels"].tolist()):
        for pred_token, true_token in zip(pred_sentence, true_sentence):
            if true_token != -100:
                predicted_labels.append(id2tag[pred_token])
                true_labels.append(id2tag[true_token])

print(classification_report(true_labels, predicted_labels, zero_division=0))

ModuleNotFoundError: No module named 'conllu'

In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from conllu import parse
import numpy as np

# Function to read .conllu files and extract (word, pos) pairs
def read_conllu(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    parsed_data = parse(data)

    sentences = []
    for sentence in parsed_data:
        word_tag_pairs = [(token["form"], token["upos"]) for token in sentence if token["form"] is not None]
        if word_tag_pairs:  # Only add non-empty sentences
            sentences.append(word_tag_pairs)
    return sentences

# Load Marathi training data and Bhojpuri test data
marathi_train_data = read_conllu("/content/mr_ufal-um-train.conllu")
bhojpuri_test_data = read_conllu("/content/bho_bhtb-um-test.conllu")

# Extract sentences and POS tags
marathi_sentences, marathi_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in marathi_train_data])
bhojpuri_sentences, bhojpuri_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in bhojpuri_test_data])

# Create mapping of POS tags to IDs
unique_tags = list(set(tag for sent in marathi_labels for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# Load BERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(unique_tags))

# 🔄 Freeze all mBERT parameters (only classifier head will be trained)
for param in model.bert.parameters():
    param.requires_grad = False

# Function to encode tags with proper subword handling
def encode_tags(tags, word_ids, tag2id):
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id.get(tag, -100))
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Define Dataset class
class POSDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, tag2id):
        self.encodings = tokenizer(list(sentences), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
        self.labels = []
        for i, label in enumerate(labels):
            word_ids = self.encodings.word_ids(batch_index=i)
            self.labels.append(encode_tags(label, word_ids, tag2id))
        self.labels = torch.tensor(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = self.labels[idx]
        return item

# Prepare datasets
train_dataset = POSDataset(marathi_sentences, marathi_labels, tokenizer, tag2id)
test_dataset = POSDataset(bhojpuri_sentences, bhojpuri_labels, tokenizer, tag2id)

# Define training parameters
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to=[]  # Disable WandB logging
)

# Define Trainer with evaluation dataset provided
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model (only classifier head will be trained)
trainer.train()

# Evaluate on Bhojpuri test data
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8)

predicted_labels = []
true_labels = []

for batch in test_loader:
    with torch.no_grad():
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    for pred_sentence, true_sentence in zip(predictions, batch["labels"].tolist()):
        for pred_token, true_token in zip(pred_sentence, true_sentence):
            if true_token != -100:
                predicted_labels.append(id2tag[pred_token])
                true_labels.append(id2tag[true_token])

# Print classification report
print(classification_report(true_labels, predicted_labels, zero_division=0))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.854208
2,2.889200,2.85376
3,2.889200,2.852998
4,2.889500,2.851912
5,2.889800,2.850527
6,2.889800,2.848835
7,2.879100,2.846819
8,2.879100,2.844498
9,2.873700,2.841886
10,2.869900,2.838978


Epoch,Training Loss,Validation Loss
1,No log,2.854208
2,2.889200,2.85376
3,2.889200,2.852998
4,2.889500,2.851912
5,2.889800,2.850527
6,2.889800,2.848835
7,2.879100,2.846819
8,2.879100,2.844498
9,2.873700,2.841886
10,2.869900,2.838978


              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        47
         ADP       0.20      0.19      0.20       147
         ADV       0.00      0.00      0.00         3
         AUX       0.04      0.03      0.04        34
       CCONJ       0.06      0.33      0.10        21
         DET       0.00      0.00      0.00        47
        INTJ       0.00      0.00      0.00         0
        NOUN       0.29      0.03      0.06       286
         NUM       0.00      0.00      0.00        33
        PART       0.00      0.00      0.00        16
        PRON       0.05      0.19      0.08        37
       PROPN       0.21      0.15      0.18       110
       PUNCT       0.21      0.05      0.08       121
       SCONJ       0.01      0.06      0.02        16
        VERB       0.07      0.09      0.08        95
           X       0.00      0.00      0.00         0
           _       0.00      0.00      0.00         0

    accuracy              

In [None]:
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from collections import Counter, defaultdict
from conllu import parse
from imblearn.over_sampling import RandomOverSampler

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

# Function to read .conllu files and extract (word, pos) pairs
def read_conllu(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    parsed_data = parse(data)

    sentences = []
    for sentence in parsed_data:
        word_tag_pairs = [(token["form"], token["upos"]) for token in sentence if token["form"] is not None]
        if word_tag_pairs:
            sentences.append(word_tag_pairs)
    return sentences

# Load training (Marathi) and test (Bhojpuri) data
marathi_train_data = read_conllu("/content/mr_ufal-um-train.conllu")
bhojpuri_test_data = read_conllu("/content/bho_bhtb-um-test.conllu")

# Extract sentences and POS tags
marathi_sentences, marathi_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in marathi_train_data])
bhojpuri_sentences, bhojpuri_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in bhojpuri_test_data])

# Create mapping of POS tags to IDs
unique_tags = list(set(tag for sent in marathi_labels for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 🔥 Apply Oversampling (Word-Level) to Balance Marathi Training Data
word_tag_pairs = [(word, tag) for sent, tags in zip(marathi_sentences, marathi_labels) for word, tag in zip(sent, tags)]
words, tags = zip(*word_tag_pairs)
words = np.array(words).reshape(-1, 1)
tags = np.array(tags)

sampling_strategy = {tag: max(50, count) for tag, count in Counter(tags).items()}
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
words_resampled, tags_resampled = ros.fit_resample(words, tags)

# Reconstruct sentences after oversampling
tagged_sentences = defaultdict(list)
for word, tag in zip(words_resampled.flatten(), tags_resampled):
    tagged_sentences[tag].append(word)

marathi_sentences_resampled = []
marathi_labels_resampled = []
for tag, word_list in tagged_sentences.items():
    marathi_sentences_resampled.append(word_list)
    marathi_labels_resampled.append([tag] * len(word_list))

# Function to encode tags with subword tokenization handling
def encode_tags(tags, word_ids, tag2id):
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id.get(tag, -100))
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Define POSDataset class
class POSDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, tag2id):
        self.encodings = tokenizer(list(sentences), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
        self.labels = []
        for i, label in enumerate(labels):
            word_ids = self.encodings.word_ids(batch_index=i)
            self.labels.append(encode_tags(label, word_ids, tag2id))
        self.labels = torch.tensor(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = self.labels[idx]
        return item

# Prepare training and testing datasets
train_dataset = POSDataset(marathi_sentences_resampled, marathi_labels_resampled, tokenizer, tag2id)
test_dataset = POSDataset(bhojpuri_sentences, bhojpuri_labels, tokenizer, tag2id)

# Load BERT model with a classification head
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(unique_tags))

# 🔄 Freeze all BERT parameters except classifier head
for param in model.bert.parameters():
    param.requires_grad = False

# 🔓 Unfreeze the last 6 layers for fine-tuning
for layer in model.bert.encoder.layer[-6:]:  # Last 6 layers
    for param in layer.parameters():
        param.requires_grad = True

# Training arguments with learning rate scheduling
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,  # Increased for better generalization
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Adjusted for stability
    report_to=[]  # Disable WandB logging
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate model on Bhojpuri test set
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8)

predicted_labels = []
true_labels = []

for batch in test_loader:
    with torch.no_grad():
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    for pred_sentence, true_sentence in zip(predictions, batch["labels"].tolist()):
        for pred_token, true_token in zip(pred_sentence, true_sentence):
            if true_token != -100:
                predicted_labels.append(id2tag[pred_token])
                true_labels.append(id2tag[true_token])

# Print final classification report
print("\n🔎 **Evaluation Results on Bhojpuri Test Set:**")
print(classification_report(true_labels, predicted_labels, zero_division=0))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.782729
2,No log,2.782291
3,No log,2.781909
4,2.880600,2.78171
5,2.880600,2.7814
6,2.880600,2.780714
7,2.767600,2.779681
8,2.767600,2.778185
9,2.767600,2.776513
10,2.705000,2.774205


Epoch,Training Loss,Validation Loss
1,No log,2.782729
2,No log,2.782291
3,No log,2.781909
4,2.880600,2.78171
5,2.880600,2.7814
6,2.880600,2.780714
7,2.767600,2.779681
8,2.767600,2.778185
9,2.767600,2.776513
10,2.705000,2.774205



🔎 **Evaluation Results on Bhojpuri Test Set:**
              precision    recall  f1-score   support

         ADJ       0.25      0.04      0.07        47
         ADP       0.09      0.01      0.01       147
         ADV       0.00      0.00      0.00         3
         AUX       0.43      0.09      0.15        34
       CCONJ       0.00      0.00      0.00        21
         DET       0.00      0.00      0.00        47
        INTJ       0.00      0.00      0.00         0
        NOUN       0.25      0.48      0.33       286
         NUM       0.00      0.00      0.00        33
        PART       0.00      0.00      0.00        16
        PRON       0.00      0.00      0.00        37
       PROPN       0.00      0.00      0.00       110
       PUNCT       0.50      0.01      0.02       121
       SCONJ       0.00      0.00      0.00        16
        VERB       0.23      0.26      0.24        95
           X       0.00      0.00      0.00         0
           _       0.00      0.00

In [None]:
from google.colab import files
uploaded = files.upload()


Saving probeless_ranking.pkl to probeless_ranking.pkl
Saving mr_ufal-um-train.conllu to mr_ufal-um-train.conllu
Saving linear by ttb probeless to linear by ttb probeless
Saving linear by testing 1001-2000ttb probeless to linear by testing 1001-2000ttb probeless
Saving bho_bhtb-um-test.conllu to bho_bhtb-um-test.conllu


In [None]:
# Install required packages (if not already installed)
!pip install transformers conllu scikit-learn

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import pickle
from conllu import parse_incr  # For parsing CoNLL-U files
from pathlib import Path
from sklearn.metrics import classification_report
import numpy as np

##############################################
# 1. Load the top 50 neuron indices from probeless_ranking.pkl
##############################################
pkl_path = "/content/probeless_ranking.pkl"  # Ensure this file is in your working directory
with open(pkl_path, "rb") as f:
    ranking = pickle.load(f)

# Select the top 50 indices (starting from the first value)
top_50_indices = ranking[:50]

# Convert global indices to a dictionary mapping layer -> list of neuron indices.
hidden_dim = 768
selected_neurons = {}
for idx in top_50_indices:
    layer = idx // hidden_dim
    neuron_idx = idx % hidden_dim
    if layer not in selected_neurons:
        selected_neurons[layer] = []
    selected_neurons[layer].append(neuron_idx)

##############################################
# 2. Define functions to read CoNLL-U files and build the POS tag datasets
##############################################
def read_conllu(file_path):
    """Reads a CoNLL-U file and returns a list of (words, pos_tags) for each sentence."""
    sentences = []
    with open(file_path, "r", encoding="utf-8") as data_file:
        for tokenlist in parse_incr(data_file):
            words = []
            pos_tags = []
            for token in tokenlist:
                if token["form"] is not None:
                    words.append(token["form"])
                    pos_tags.append(token.get("upos", "X"))
            if words:
                sentences.append((words, pos_tags))
    return sentences

# Read training and test data
train_data = read_conllu("/content/mr_ufal-um-train.conllu")
test_data  = read_conllu("/content/bho_bhtb-um-test.conllu")

# Separate words and tags
train_sentences, train_tags = zip(*train_data)
test_sentences, test_tags   = zip(*test_data)

# Build tag mappings from training data
unique_tags = list(set(tag for sent in train_tags for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

##############################################
# 3. Define a function to encode tags for token classification
##############################################
def encode_tags(tags, word_ids, tag2id):
    """Assign label to the first token of each word, and -100 to subword or special tokens."""
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id[tag])
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

##############################################
# 4. Define a Dataset class for token classification
##############################################
class TokenClassificationDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, tag2id, max_length=128):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(list(sentences), is_split_into_words=True,
                                   padding=True, truncation=True, max_length=max_length,
                                   return_tensors="pt")
        all_labels = []
        for i, tag_seq in enumerate(tags):
            word_ids = self.encodings.word_ids(batch_index=i)
            encoded_tags = encode_tags(tag_seq, word_ids, tag2id)
            all_labels.append(encoded_tags)
        self.labels = all_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

##############################################
# 5. Define the custom SubsetBERT model for token classification
##############################################
class SubsetBERT(nn.Module):
    def __init__(self, bert_model_name, selected_neurons, num_labels):
        super(SubsetBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name, output_hidden_states=True)
        self.selected_neurons = selected_neurons

        # Freeze all BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False

        # Compute input dimension from selected neurons
        input_dim = sum(len(indices) for indices in selected_neurons.values())
        self.classifier = nn.Linear(input_dim, num_labels)  # Trainable classifier

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_states = outputs.hidden_states

        selected_features = []
        for layer, indices in self.selected_neurons.items():
            layer_features = hidden_states[layer][:, :, indices]
            selected_features.append(layer_features)

        selected_features = torch.cat(selected_features, dim=-1)
        logits = self.classifier(selected_features)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
            return loss, logits
        return logits

##############################################
# 6. Prepare the tokenizer, datasets, and model
##############################################
bert_model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

train_dataset = TokenClassificationDataset(train_sentences, train_tags, tokenizer, tag2id, max_length=128)
test_dataset  = TokenClassificationDataset(test_sentences, test_tags, tokenizer, tag2id, max_length=128)

num_labels = len(unique_tags)

# Initialize the custom model
model = SubsetBERT(bert_model_name, selected_neurons, num_labels)

##############################################
# 7. Set up the Trainer API and start training
##############################################
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

##############################################
# 8. Save the fine-tuned model
##############################################
model_save_path = Path("./fine_tuned_subsetbert")
model_save_path.mkdir(exist_ok=True)
torch.save(model.state_dict(), model_save_path / "best_model.pth")

##############################################
# 9. Evaluate on the test set
##############################################
predictions_output = trainer.predict(test_dataset)
pred_logits = predictions_output.predictions
true_labels = predictions_output.label_ids

pred_labels = np.argmax(pred_logits, axis=-1)

# Gather predictions and true labels (ignoring label -100)
all_pred_tags = []
all_true_tags = []
for i in range(len(true_labels)):
    for j in range(len(true_labels[i])):
        if true_labels[i][j] != -100:
            all_true_tags.append(id2tag[true_labels[i][j]])
            all_pred_tags.append(id2tag[pred_labels[i][j]])

# Print classification report
report = classification_report(all_true_tags, all_pred_tags, zero_division=0)
print("\nClassification Report:\n")
print(report)







Epoch,Training Loss,Validation Loss
1,No log,7.603316
2,7.518100,7.301453
3,7.110700,7.089004
4,6.828700,6.962329
5,6.669300,6.9203



Classification Report:

              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        45
         ADP       0.00      0.00      0.00       146
         ADV       0.00      0.00      0.00         3
         AUX       0.00      0.00      0.00        34
       CCONJ       0.00      0.00      0.00        21
         DET       0.00      0.00      0.00        47
        NOUN       0.00      0.00      0.00       284
         NUM       0.00      0.00      0.00        33
        PART       0.02      1.00      0.03        16
        PRON       0.00      0.00      0.00        37
       PROPN       0.00      0.00      0.00        99
       PUNCT       0.00      0.00      0.00       115
       SCONJ       0.00      0.00      0.00        16
        VERB       0.00      0.00      0.00        94

    accuracy                           0.02       990
   macro avg       0.00      0.07      0.00       990
weighted avg       0.00      0.02      0.00       990



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import pickle
from conllu import parse_incr  # For parsing CoNLL-U files
from pathlib import Path
from sklearn.metrics import classification_report
import numpy as np

# Load the top 50 neuron indices
pkl_path = "/content/probeless_ranking.pkl"
with open(pkl_path, "rb") as f:
    ranking = pickle.load(f)

top_50_indices = ranking[:50]
hidden_dim = 768
selected_neurons = {}
for idx in top_50_indices:
    layer = idx // hidden_dim
    neuron_idx = idx % hidden_dim
    if layer not in selected_neurons:
        selected_neurons[layer] = []
    selected_neurons[layer].append(neuron_idx)

# Function to read CoNLL-U files
def read_conllu(file_path):
    sentences = []
    with open(file_path, "r", encoding="utf-8") as data_file:
        for tokenlist in parse_incr(data_file):
            words = []
            pos_tags = []
            for token in tokenlist:
                if token["form"] is not None:
                    words.append(token["form"])
                    pos_tags.append(token.get("upos", "X"))
            if words:
                sentences.append((words, pos_tags))
    return sentences

# Read data
train_data = read_conllu("/content/mr_ufal-um-train.conllu")
test_data  = read_conllu("/content/bho_bhtb-um-test.conllu")

train_sentences, train_tags = zip(*train_data)
test_sentences, test_tags   = zip(*test_data)

unique_tags = list(set(tag for sent in train_tags for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# Encode tags
def encode_tags(tags, word_ids, tag2id):
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id[tag])
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Dataset class
class TokenClassificationDataset(Dataset):
    def __init__(self, sentences, tags, tokenizer, tag2id, max_length=128):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(list(sentences), is_split_into_words=True,
                                   padding=True, truncation=True, max_length=max_length,
                                   return_tensors="pt")
        all_labels = []
        for i, tag_seq in enumerate(tags):
            word_ids = self.encodings.word_ids(batch_index=i)
            encoded_tags = encode_tags(tag_seq, word_ids, tag2id)
            all_labels.append(encoded_tags)
        self.labels = all_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Custom SubsetBERT model
class SubsetBERT(nn.Module):
    def __init__(self, bert_model_name, selected_neurons, num_labels):
        super(SubsetBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name, output_hidden_states=True)
        self.selected_neurons = selected_neurons

        # Freeze all layers first
        for param in self.bert.parameters():
            param.requires_grad = False

        # Unfreeze last 6 layers
        for layer in range(-6, 0):
            for param in self.bert.encoder.layer[layer].parameters():
                param.requires_grad = True

        # Compute input dimension from selected neurons
        input_dim = sum(len(indices) for indices in selected_neurons.values())
        self.classifier = nn.Linear(input_dim, num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_states = outputs.hidden_states

        selected_features = []
        for layer, indices in self.selected_neurons.items():
            layer_features = hidden_states[layer][:, :, indices]
            selected_features.append(layer_features)

        selected_features = torch.cat(selected_features, dim=-1)
        logits = self.classifier(selected_features)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
            return loss, logits
        return logits

# Prepare tokenizer, datasets, and model
bert_model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

train_dataset = TokenClassificationDataset(train_sentences, train_tags, tokenizer, tag2id, max_length=128)
test_dataset  = TokenClassificationDataset(test_sentences, test_tags, tokenizer, tag2id, max_length=128)

num_labels = len(unique_tags)
model = SubsetBERT(bert_model_name, selected_neurons, num_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Save the fine-tuned model
model_save_path = Path("./fine_tuned_subsetbert")
model_save_path.mkdir(exist_ok=True)
torch.save(model.state_dict(), model_save_path / "best_model.pth")

# Evaluate on the test set
predictions_output = trainer.predict(test_dataset)
pred_logits = predictions_output.predictions
true_labels = predictions_output.label_ids

pred_labels = np.argmax(pred_logits, axis=-1)

# Gather predictions and true labels
all_pred_tags = []
all_true_tags = []
for i in range(len(true_labels)):
    for j in range(len(true_labels[i])):
        if true_labels[i][j] != -100:
            all_true_tags.append(id2tag[true_labels[i][j]])
            all_pred_tags.append(id2tag[pred_labels[i][j]])

# Print classification report
report = classification_report(all_true_tags, all_pred_tags, zero_division=0)
print("\nClassification Report:\n")
print(report)




Epoch,Training Loss,Validation Loss
1,No log,2.319722
2,2.169900,2.079341
3,0.971900,2.163758
4,0.697800,2.086117
5,0.547300,2.09967



Classification Report:

              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        45
         ADP       0.71      0.35      0.47       146
         ADV       0.00      0.00      0.00         3
         AUX       0.29      0.06      0.10        34
       CCONJ       0.00      0.00      0.00        21
         DET       0.25      0.11      0.15        47
        NOUN       0.60      0.66      0.63       284
         NUM       1.00      0.03      0.06        33
        PART       0.00      0.00      0.00        16
        PRON       0.33      0.11      0.16        37
       PROPN       0.67      0.08      0.14        99
       PUNCT       0.86      0.89      0.87       115
       SCONJ       0.00      0.00      0.00        16
        VERB       0.34      0.79      0.47        94
           X       0.00      0.00      0.00         0
           _       0.00      0.00      0.00         0

    accuracy                           0.44       990
 

In [None]:
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from collections import Counter, defaultdict
from conllu import parse
from imblearn.over_sampling import RandomOverSampler

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

# Function to read .conllu files and extract (word, pos) pairs
def read_conllu(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    parsed_data = parse(data)

    sentences = []
    for sentence in parsed_data:
        word_tag_pairs = [(token["form"], token["upos"]) for token in sentence if token["form"] is not None]
        if word_tag_pairs:
            sentences.append(word_tag_pairs)
    return sentences

# Load training (Marathi) and test (Bhojpuri) data
marathi_train_data = read_conllu("/content/mr_ufal-um-train.conllu")
bhojpuri_test_data = read_conllu("/content/bho_bhtb-um-test.conllu")

# Extract sentences and POS tags
marathi_sentences, marathi_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in marathi_train_data])
bhojpuri_sentences, bhojpuri_labels = zip(*[(list(zip(*sent))[0], list(zip(*sent))[1]) for sent in bhojpuri_test_data])

# Create mapping of POS tags to IDs
unique_tags = list(set(tag for sent in marathi_labels for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 🔥 Apply Oversampling (Word-Level) to Balance Marathi Training Data
word_tag_pairs = [(word, tag) for sent, tags in zip(marathi_sentences, marathi_labels) for word, tag in zip(sent, tags)]
words, tags = zip(*word_tag_pairs)
words = np.array(words).reshape(-1, 1)
tags = np.array(tags)

sampling_strategy = {tag: max(50, count) for tag, count in Counter(tags).items()}
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
words_resampled, tags_resampled = ros.fit_resample(words, tags)

# Reconstruct sentences after oversampling
tagged_sentences = defaultdict(list)
for word, tag in zip(words_resampled.flatten(), tags_resampled):
    tagged_sentences[tag].append(word)

marathi_sentences_resampled = []
marathi_labels_resampled = []
for tag, word_list in tagged_sentences.items():
    marathi_sentences_resampled.append(word_list)
    marathi_labels_resampled.append([tag] * len(word_list))

# Function to encode tags with subword tokenization handling
def encode_tags(tags, word_ids, tag2id):
    encoded_tags = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            encoded_tags.append(-100)
        elif word_idx != previous_word_idx:
            tag = tags[word_idx]
            encoded_tags.append(tag2id.get(tag, -100))
        else:
            encoded_tags.append(-100)
        previous_word_idx = word_idx
    return encoded_tags

# Define POSDataset class
class POSDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, tag2id):
        self.encodings = tokenizer(list(sentences), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
        self.labels = []
        for i, label in enumerate(labels):
            word_ids = self.encodings.word_ids(batch_index=i)
            self.labels.append(encode_tags(label, word_ids, tag2id))
        self.labels = torch.tensor(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings.keys()}
        item["labels"] = self.labels[idx]
        return item

# Prepare training and testing datasets
train_dataset = POSDataset(marathi_sentences_resampled, marathi_labels_resampled, tokenizer, tag2id)
test_dataset = POSDataset(bhojpuri_sentences, bhojpuri_labels, tokenizer, tag2id)

# Load BERT model with a classification head
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(unique_tags))

# 🔄 Freeze all BERT parameters except classifier head
for param in model.bert.parameters():
    param.requires_grad = False

# 🔓 Unfreeze the last 4 layers for fine-tuning
for layer in model.bert.encoder.layer[-6:]:  # Last 4 layers
    for param in layer.parameters():
        param.requires_grad = True

# Training arguments with learning rate scheduling
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,  # Increased for better generalization
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Adjusted for stability
    report_to=[]  # Disable WandB logging
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate model on Bhojpuri test set
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8)

predicted_labels = []
true_labels = []

for batch in test_loader:
    with torch.no_grad():
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    for pred_sentence, true_sentence in zip(predictions, batch["labels"].tolist()):
        for pred_token, true_token in zip(pred_sentence, true_sentence):
            if true_token != -100:
                predicted_labels.append(id2tag[pred_token])
                true_labels.append(id2tag[true_token])

# Print final classification report
print("\n🔎 **Evaluation Results on Bhojpuri Test Set:**")
print(classification_report(true_labels, predicted_labels, zero_division=0))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.930538


Epoch,Training Loss,Validation Loss
1,No log,2.930538
2,No log,2.930072
3,No log,2.929657
4,3.000200,2.929401
5,3.000200,2.928984



🔎 **Evaluation Results on Bhojpuri Test Set:**
              precision    recall  f1-score   support

         ADJ       0.03      0.09      0.05        47
         ADP       0.00      0.00      0.00       147
         ADV       0.00      0.00      0.00         3
         AUX       0.06      0.12      0.08        34
       CCONJ       0.03      0.29      0.06        21
         DET       0.01      0.02      0.01        47
        INTJ       0.00      0.00      0.00         0
        NOUN       0.38      0.02      0.04       286
         NUM       0.00      0.00      0.00        33
        PART       0.00      0.00      0.00        16
        PRON       0.00      0.00      0.00        37
       PROPN       0.10      0.18      0.12       110
       PUNCT       0.00      0.00      0.00       121
       SCONJ       0.04      0.06      0.05        16
        VERB       0.07      0.01      0.02        95
           X       0.00      0.00      0.00         0
           _       0.00      0.00