In [1]:
!pip install transformers datasets seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0

In [2]:
from pathlib import Path
import re

def read_conll(file_path):
  file_path = Path(file_path)

  raw_text = file_path.read_text().strip()
  
  raw_docs = re.split(r'\n\t?\n', raw_text)
  token_docs = []
  tag_docs = []
  for doc in raw_docs:
    tokens = []
    tags = []
    for line in doc.split('\n\n'):
      #print(line)
      for entity in line.split("\n"):
        token, tag = entity.split()
        tokens.append(token)
        tags.append(tag)
      token_docs.append(tokens)
      tag_docs.append(tags)
  return token_docs, tag_docs

In [33]:
train_texts, train_tags = read_conll("/content/train.txt")
test_texts, test_tags = read_conll("/content/test.txt")
val_texts, val_tags = read_conll("/content/val.txt")
data_texts, data_tags = read_conll("/content/labeledDatapoints-CoNLL (2).txt")

In [34]:
unique_tags = set(label for doc in data_tags for label in doc)
label2id = {label: id for id, label in enumerate(unique_tags)}
id2label = {id: label for label, id in label2id.items()}

In [35]:
len(unique_tags)

25

In [36]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig

config = AutoConfig.from_pretrained("dbmdz/bert-base-turkish-cased")
config.label2id = label2id
config.id2label = id2label
config._num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels =config._num_labels)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initiali

In [37]:
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [38]:
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [39]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[label2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        mask = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        doc_enc_labels[mask] = doc_labels[:np.sum(mask)]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [40]:
train_labels = encode_tags(train_tags, train_encodings)

In [41]:
val_labels = encode_tags(val_tags, val_encodings)

In [91]:
id2label

{0: 'B-bina',
 1: 'I-site',
 2: 'B-mahalle',
 3: 'B-sehir',
 4: 'I-bulvar',
 5: 'I-diskapino',
 6: 'I-ilce',
 7: 'B-bulvar',
 8: 'I-telefonno',
 9: 'O',
 10: 'B-cadde',
 11: 'B-site',
 12: 'I-isim',
 13: 'I-sokak',
 14: 'I-bina',
 15: 'B-soyisim',
 16: 'B-ilce',
 17: 'I-sehir',
 18: 'B-telefonno',
 19: 'I-mahalle',
 20: 'B-isim',
 21: 'B-diskapino',
 22: 'I-soyisim',
 23: 'I-cadde',
 24: 'B-sokak'}

In [42]:
label2id

{'B-bina': 0,
 'I-site': 1,
 'B-mahalle': 2,
 'B-sehir': 3,
 'I-bulvar': 4,
 'I-diskapino': 5,
 'I-ilce': 6,
 'B-bulvar': 7,
 'I-telefonno': 8,
 'O': 9,
 'B-cadde': 10,
 'B-site': 11,
 'I-isim': 12,
 'I-sokak': 13,
 'I-bina': 14,
 'B-soyisim': 15,
 'B-ilce': 16,
 'I-sehir': 17,
 'B-telefonno': 18,
 'I-mahalle': 19,
 'B-isim': 20,
 'B-diskapino': 21,
 'I-soyisim': 22,
 'I-cadde': 23,
 'B-sokak': 24}

In [43]:
test_labels = encode_tags(test_tags, test_encodings)

In [44]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") 
val_encodings.pop("offset_mapping")
train_dataset = NERDataset(train_encodings, train_labels)
val_dataset = NERDataset(val_encodings, val_labels)

In [45]:
test_encodings.pop("offset_mapping")
test_dataset = NERDataset(test_encodings, test_labels)

In [46]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',    
    num_train_epochs=3,          
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=64,  
    warmup_steps=500,                
    weight_decay=0.01,            
    logging_dir='./logs',      
    logging_steps=10,
)

In [54]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [60]:
import evaluate
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels), recall.compute(predictions=predictions, references=labels), f1.compute(predictions=predictions, references=labels)

In [64]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [65]:
trainer.train()

***** Running training *****
  Num examples = 799
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 150
  Number of trainable parameters = 110045977


Step,Training Loss
10,0.1468
20,0.1283
30,0.123
40,0.1132
50,0.1062
60,0.1032
70,0.1047
80,0.1066
90,0.069
100,0.0748




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=150, training_loss=0.09351563652356466, metrics={'train_runtime': 57.4997, 'train_samples_per_second': 41.687, 'train_steps_per_second': 2.609, 'total_flos': 140708466884250.0, 'train_loss': 0.09351563652356466, 'epoch': 3.0})

In [66]:
results = trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 130
  Batch size = 64


In [67]:
results

{'eval_loss': 0.3053381145000458,
 'eval_runtime': 0.8092,
 'eval_samples_per_second': 160.649,
 'eval_steps_per_second': 3.707,
 'epoch': 3.0}

In [68]:
from transformers import pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

In [None]:
compute_metrics((test_dataset.labels))

In [None]:
test_dataset.labels

In [None]:
trainer.predict(test_dataset)

In [82]:
trainer.save_model("./models")

Saving model checkpoint to ./models
Configuration saved in ./models/config.json
Model weights saved in ./models/pytorch_model.bin
tokenizer config file saved in ./models/tokenizer_config.json
Special tokens file saved in ./models/special_tokens_map.json


In [84]:
from huggingface_hub import HfApi, notebook_login

In [87]:
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [88]:
hf_api = HfApi()
hf_api.create_repo("merve/adres-ner")
hf_api.upload_folder(repo_id="merve/adres-ner", folder_path="models",
                     path_in_repo="./")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

'https://huggingface.co/merve/adres-ner/tree/main/./'

In [59]:
results = trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1
  Batch size = 64


In [89]:
results

{'eval_loss': 0.3053381145000458,
 'eval_runtime': 0.8092,
 'eval_samples_per_second': 160.649,
 'eval_steps_per_second': 3.707,
 'epoch': 3.0}

In [90]:
test_texts

[['EKİP',
  'LAZIM',
  'ACİL',
  'Cebrail',
  'mahallesi',
  'Cumhuriyet',
  'Caddesi',
  'No',
  '48',
  'Çinçin',
  'apartmanı',
  'Antakya',
  'Hatay',
  '0533',
  '059',
  '2039',
  '0531',
  '736',
  '0110'],
 ['LÜTFEN',
  'YAYALIM',
  'HABER',
  'ALINAMIYOR',
  'Cebrail',
  'mahallesi',
  'bahçeli',
  'sokak',
  'kent',
  'apartmanı',
  'cami',
  'yanı',
  'Hatay',
  'Antakya',
  'Havva',
  'Kuyucu',
  'Hüseyin',
  'Kuyucu',
  'Murat',
  'Kuyucu',
  'Cebrail',
  'mahallesi',
  'Cengiz',
  'Caddesi',
  'Daloğlu',
  'Apt',
  'Hatay',
  'Antakya',
  'Vahap',
  'Sever',
  'İletişim',
  'için',
  'Rüzgar',
  'Mavi',
  'Çakın',
  '05525101041'],
 ['ACİL',
  'ÇOCUKLAR',
  'ÜŞÜMESİN',
  'Gaziantep',
  'te',
  'ısıtıcı',
  'ihtiyaçları',
  'var',
  'çocuklar',
  'var',
  'evde',
  'çok',
  'zor',
  'durumdalar',
  '9',
  'kişilik',
  'bir',
  'aileler',
  'lütfen',
  'yardımcı',
  'olalım',
  'İrtibat',
  'numaraları',
  '05465619969',
  '05397966055',
  '23',
  'Nisan',
  'mah',
  '82084

In [96]:
pipe = pipeline("ner", model="merve/adres-ner", aggregation_strategy="first")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/config.json
Model config BertConfig {
  "_name_or_path": "merve/adres-ner",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-bina",
    "1": "I-site",
    "2": "B-mahalle",
    "3": "B-sehir",
    "4": "I-bulvar",
    "5": "I-diskapino",
    "6": "I-ilce",
    "7": "B-bulvar",
    "8": "I-telefonno",
    "9": "O",
    "10": "B-cadde",
    "11": "B-site",
    "12": "I-isim",
    "13": "I-sokak",
    "14": "I-bina",
    "15": "B-soyisim",
    "16": "B-ilce",
    "17": "I-sehir",
    "18": "B-telefonno",
    "19": "I-mahalle",
    "20": "B-isim",
    "21": "B-diskapino",
    "22": "I-soyisim",
    "23": "I-cadde",
    "24": "B-sokak"
  },
  "initializer_

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/pytorch_model.bin
All model checkpoint weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the model checkpoint at merve/adres-ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/755k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--merve--adres-ner/snapshots/16e489000f7a37a1328ca503f453f5756ab11e68/tokenizer_config.json


In [97]:
pipe("yardım ekip yokmuş ürgenpaşa mahallesi atatürk caddesi gül apartmanı no 93 antakya hatay")

[{'entity_group': 'mahalle',
  'score': 0.7968493,
  'word': 'ürgenpaşa mahallesi',
  'start': 19,
  'end': 38},
 {'entity_group': 'cadde',
  'score': 0.99311125,
  'word': 'atatürk',
  'start': 39,
  'end': 46},
 {'entity_group': 'bina',
  'score': 0.99103534,
  'word': 'gül',
  'start': 55,
  'end': 58},
 {'entity_group': 'diskapino',
  'score': 0.96691895,
  'word': '93',
  'start': 72,
  'end': 74},
 {'entity_group': 'ilce',
  'score': 0.9905529,
  'word': 'antakya',
  'start': 75,
  'end': 82},
 {'entity_group': 'sehir',
  'score': 0.9675041,
  'word': 'hatay',
  'start': 83,
  'end': 88}]