In [1]:
!pip install transformers datasets seqeval
!pip install torch torchvision torchaudio


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.5 MB/s[0m eta [3

In [3]:
import pandas as pd

# Load the labeled data in CoNLL format
file_path = '/content/labeled_telegram_data.txt'

# Read the file, assuming each token and its label are on a new line, and messages are separated by blank lines
def load_conll_data(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "":  # blank line indicates end of sentence
                if sentence:
                    sentences.append(sentence)
                sentence = []
            else:
                token, label = line.strip().split()  # split token and label
                sentence.append((token, label))
    return sentences

data = load_conll_data(file_path)

# Convert to pandas DataFrame
df = pd.DataFrame([(token, label) for sentence in data for token, label in sentence], columns=['Token', 'Label'])
df.head()


Unnamed: 0,Token,Label
0,puma,B-PRODUCT
1,CTL,I-PRODUCT
2,SIZE,O
3,40,B-PRICE
4,41,B-PRICE


In [4]:
from transformers import AutoTokenizer

# Load XLM-Roberta tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [5]:
def tokenize_and_align_labels(sentences):
    tokenized_inputs = []
    labels = []

    for sentence in sentences:
        words = [word[0] for word in sentence]
        label = [word[1] for word in sentence]

        # Tokenize the sentence and align the labels
        tokenized_output = tokenizer(words, truncation=True, padding='max_length', is_split_into_words=True, return_tensors='pt')
        word_ids = tokenized_output.word_ids()

        # Create the aligned labels (using the word_ids)
        aligned_labels = []
        for i in range(len(word_ids)):
            if word_ids[i] is None:
                aligned_labels.append(-100)  # -100 means no label for special tokens
            else:
                aligned_labels.append(label[word_ids[i]])

        tokenized_inputs.append(tokenized_output)
        labels.append(aligned_labels)

    return tokenized_inputs, labels

tokenized_inputs, labels = tokenize_and_align_labels(data)


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',  # output directory for model checkpoints
    evaluation_strategy="epoch",  # evaluate the model after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # number of training epochs
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)




In [8]:
from datasets import Dataset

# Create a label-to-ID mapping
labels = list(set([label for sentence in data for _, label in sentence]))
label2id = {label: idx for idx, label in enumerate(sorted(labels))}
id2label = {idx: label for label, idx in label2id.items()}

# Now, update the label encoding to use integers
def encode_labels(sentences):
    encoded_sentences = []
    for sentence in sentences:
        encoded_sentence = [(token, label2id[label]) for token, label in sentence]
        encoded_sentences.append(encoded_sentence)
    return encoded_sentences

# Apply label encoding to your data
encoded_data = encode_labels(data)

# Tokenize the data and align the labels with tokens
tokenized_inputs, encoded_labels = tokenize_and_align_labels(encoded_data)

# Prepare dataset
train_dataset = Dataset.from_dict({
    'input_ids': [input['input_ids'][0] for input in tokenized_inputs],
    'attention_mask': [input['attention_mask'][0] for input in tokenized_inputs],
    'labels': encoded_labels
})

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Check the dataset
train_dataset[0]


{'input_ids': tensor([     0, 143926,    313,  41445,  10000,  19157,   1112,   6478,   4828,
           6260,   5896,    276,  61058,   5947,  23385, 124236,    159,  19412,
          11217,  23856,  45029, 221148,   3894,  19308,   9185,  13307,  10057,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1

In [9]:
from transformers import AutoModelForTokenClassification

# Load the pre-trained model for token classification (NER)
model = AutoModelForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=len(set([label for sentence in labels for label in sentence])))

# We also need to map the label names to numeric values, so we'll update the model config
model.config.label2id = {label: i for i, label in enumerate(sorted(set([label for sentence in labels for label in sentence])))}
model.config.id2label = {i: label for i, label in enumerate(sorted(set([label for sentence in labels for label in sentence])))}


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",              # Output directory
    evaluation_strategy="no",            # Disable evaluation during training
    learning_rate=2e-5,                  # Example learning rate
    per_device_train_batch_size=16,      # Example batch size
    num_train_epochs=3,                  # Number of training epochs
    weight_decay=0.01,                   # Weight decay for regularization
    report_to=[],                        # Disable W&B logging
    disable_tqdm=True                    # Disable progress bar
)

trainer = Trainer(
    model=model,                         # The model you want to fine-tune
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # The training dataset
)

trainer.train()




{'train_runtime': 48.4019, 'train_samples_per_second': 3.099, 'train_steps_per_second': 0.248, 'train_loss': 1.7645230293273926, 'epoch': 3.0}


TrainOutput(global_step=12, training_loss=1.7645230293273926, metrics={'train_runtime': 48.4019, 'train_samples_per_second': 3.099, 'train_steps_per_second': 0.248, 'train_loss': 1.7645230293273926, 'epoch': 3.0})

In [24]:
from sklearn.model_selection import train_test_split

# Split the data into train and validation sets (90% train, 10% validation)
train_data, val_data = train_test_split(encoded_data, test_size=0.1, random_state=42)

# Tokenize the validation set (same process as for training)
tokenized_val_inputs, val_labels = tokenize_and_align_labels(val_data)

# Prepare the validation dataset
val_dataset = Dataset.from_dict({
    'input_ids': [input['input_ids'][0] for input in tokenized_val_inputs],
    'attention_mask': [input['attention_mask'][0] for input in tokenized_val_inputs],
    'labels': val_labels
})

val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [25]:
trainer = Trainer(
    model=model,                         # The model you want to fine-tune
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # The training dataset
    eval_dataset=val_dataset             # The validation dataset
)

trainer.train()


{'train_runtime': 127.0214, 'train_samples_per_second': 1.181, 'train_steps_per_second': 0.094, 'train_loss': 0.9405362606048584, 'epoch': 3.0}


TrainOutput(global_step=12, training_loss=0.9405362606048584, metrics={'train_runtime': 127.0214, 'train_samples_per_second': 1.181, 'train_steps_per_second': 0.094, 'train_loss': 0.9405362606048584, 'epoch': 3.0})

In [26]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.8081637620925903, 'eval_runtime': 0.1804, 'eval_samples_per_second': 27.719, 'eval_steps_per_second': 5.544, 'epoch': 3.0}
{'eval_loss': 0.8081637620925903, 'eval_runtime': 0.1804, 'eval_samples_per_second': 27.719, 'eval_steps_per_second': 5.544, 'epoch': 3.0}


In [27]:
model.save_pretrained('./xlm-roberta-ner')
tokenizer.save_pretrained('./xlm-roberta-ner')


('./xlm-roberta-ner/tokenizer_config.json',
 './xlm-roberta-ner/special_tokens_map.json',
 './xlm-roberta-ner/sentencepiece.bpe.model',
 './xlm-roberta-ner/added_tokens.json',
 './xlm-roberta-ner/tokenizer.json')