In [None]:
!pip install simpletransformers
#!pip install tensorboard

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only)
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Needed only for XLM model
!pip install sacremoses

#### Importing Necessary Libraries

In [None]:
# Libraries
#import sacremoses
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel,NERArgs
#from torch.utils.tensorboard import SummaryWriter
import pandas as pd

#### Loading dataset

In [None]:
# Location of the CSV to load: add the path corresponding to the ILPRL or EBIQUITY dataset
path = "path_to_dataset"
data = pd.read_csv(path, encoding="utf-8")

In [None]:
# Preview the first five rows of the freshly loaded frame (head() defaults to 5)
data.head()

In [None]:
# Forward-fill missing cells in every column with the last valid value above them.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated in pandas 2.x.
# NOTE(review): presumably only the sentence identifier is blank on continuation
# rows — confirm that words/labels have no gaps that would be filled as well.
data = data.ffill()

In [None]:
# First five rows after forward-filling
data.head()

#### Encoding sentence identifiers

In [None]:
# Replace each distinct sentence identifier with an integer code
sentence_encoder = LabelEncoder()
data["sentence#"] = sentence_encoder.fit_transform(data["sentence#"])

In [None]:
# First five rows after encoding the sentence identifiers
data.head()

#### Renaming columns to `sentence_id`, `words` and `labels`

In [None]:
# Note that 'pos' should be used instead of 'ner' for POS tagging task
column_names = {"sentence#": "sentence_id", "word": "words", "ner": "labels"}
data = data.rename(columns=column_names)

In [None]:
# Last five rows after renaming the columns (tail() defaults to 5)
data.tail()

In [None]:
# Normalise every tag to upper case so differently-cased spellings collapse into one label
data = data.assign(labels=data["labels"].str.upper())

In [None]:
# Features: sentence id + token; target: the tag of each token
X = data.loc[:, ["sentence_id", "words"]]
Y = data.loc[:, "labels"]

In [None]:
# Train-test split
# Split on whole sentences, not on individual token rows. A row-level shuffled
# split scatters the tokens of one sentence across both sets and scrambles the
# word order inside each sentence_id, which destroys sentence context and leaks
# test sentences into training.
sentence_ids = X["sentence_id"].unique()
train_ids, test_ids = train_test_split(
    sentence_ids, test_size=0.2, shuffle=True, random_state=42  # fixed seed -> reproducible split
)

# Boolean mask keeps rows in their original order, so tokens stay in sentence order.
in_train = X["sentence_id"].isin(train_ids)
x_train, x_test = X[in_train], X[~in_train]
y_train, y_test = Y[in_train], Y[~in_train]

#### Building train and test data

In [None]:
# Reassemble tokens and their tags into one frame per split
feature_columns = ["sentence_id", "words"]
train_data = x_train[feature_columns].assign(labels=y_train)
test_data = x_test[feature_columns].assign(labels=y_test)
print(len(train_data), len(test_data))

In [None]:
# Inspect the assembled training frame (sentence_id, words, labels)
train_data

In [None]:
# Distinct tags found in the whole dataset; later handed to the model as its label set
label = pd.unique(data["labels"]).tolist()
label

In [None]:
# How often each tag occurs in the full dataset
label_column = data["labels"]
element_counts = label_column.value_counts()
print(element_counts)

#### Model Fine-tuning

In [None]:
# Model arguments and Parameters, supplied as constructor keywords
args = NERArgs(
    num_train_epochs=5,
    learning_rate=1e-4,
    overwrite_output_dir=True,
    train_batch_size=8,
    eval_batch_size=8,
    save_model_every_epoch=False,
)

In [None]:
# Change according to model eg. 'xlmroberta', 'xlm-roberta-base'. Refer to simpletransformer docs and hugging face for more models
model = NERModel(
    'bert',
    'NepBERTa/NepBERTa',
    labels=label,
    args=args,
    # Use the GPU only when one is actually attached, so the notebook also
    # runs (slowly) on a CPU-only runtime instead of failing here.
    use_cuda=torch.cuda.is_available(),
    from_tf=True,  # remove 'from_tf=True' for other models
)

In [None]:
# Training the model
# NOTE(review): `acc` is sklearn's accuracy_score; confirm it accepts the
# per-sentence label lists that simpletransformers hands to extra metrics.
model.train_model(
    train_data,
    eval_data=test_data,
    acc=accuracy_score,
    verbose=True,
)

#### Evaluating Model

In [None]:
# Evaluating scores: eval_model yields (metrics, raw model outputs, predicted tags)
eval_output = model.eval_model(test_data, verbose=True)
result, model_outputs, preds_list = eval_output

In [None]:
# Final result scores (first element returned by model.eval_model above)
result

In [None]:
# Unique labels in test set, and how many there are
test_labels = test_data['labels'].unique().tolist()
print(test_labels)
print(len(test_labels))

In [None]:
# Total unique sentences in test set, compared with the number of sentences the
# model actually produced predictions for; the two should match.
unique_sentence = test_data['sentence_id'].unique().tolist()
print("Total number of unique sentences: "+ str(len(unique_sentence)))
# Report the prediction count from preds_list (the original printed the
# unique-sentence count twice, so a mismatch could never be seen).
print("Total number of predicted sentences: "+ str(len(preds_list)))
print(unique_sentence[:50])

In [None]:
# Collect the gold tags of every test sentence into one list per sentence_id
grouped = (
    test_data
    .groupby('sentence_id')['labels']
    .apply(list)
    .reset_index()
)

# Nested list: one inner list of tags per sentence
actual_labels_list = list(grouped['labels'])



In [None]:
# Gold tag lists of the first 50 test sentences
print(actual_labels_list[:50])

In [None]:
# Predicted tag lists for the first 50 entries returned by model.eval_model
print(preds_list[:50])

#### Plot

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
import numpy as np

# Flatten the per-sentence lists into two token-level sequences.
# Sentences are paired one-to-one and each pair is trimmed to its shorter side:
# if a prediction list is shorter than its gold list (e.g. the input was
# truncated), naive flattening would shift every later token out of alignment
# or leave the two sequences with different lengths.
actual_flat, predicted_flat = [], []
for actual_seq, predicted_seq in zip(actual_labels_list, preds_list):
    for actual_tag, predicted_tag in zip(actual_seq, predicted_seq):
        actual_flat.append(actual_tag)
        predicted_flat.append(predicted_tag)

# Use every label seen on either side, so predictions of a tag that never
# occurs in the gold test labels are still counted instead of silently dropped.
labels = np.unique(actual_flat + predicted_flat)

# Compute confusion matrix (rows: actual, columns: predicted, ordered as `labels`)
conf_matrix = confusion_matrix(actual_flat, predicted_flat, labels=labels)

print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define function to plot confusion matrix
def plot_confusion_matrix(cm, labels):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: 2-D matrix of integer counts (cells are rendered with the 'd'
            format). Rows are labelled 'Actual', columns 'Predicted'.
        labels: tick labels used for both the x and the y axis.
    """
    fig = plt.figure(figsize=(8, 6))
    sns.set(font_scale=1.2)
    # Create the axes after the seaborn settings are applied, then draw on it explicitly
    ax = fig.add_subplot()
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

# Render the heatmap for the matrix and label order computed above
plot_confusion_matrix(cm=conf_matrix, labels=labels)
