<a href="https://colab.research.google.com/github/bahramzada/az-ner-blur/blob/main/NER_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import string
import csv
import re

# Two-digit rayon (region) codes for Azerbaijani car plates: every code from
# 01 through 72 except 13, followed by four additional codes.
first_digit = [f"{code:02d}" for code in range(1, 73) if code != 13]
first_digit += ["77", "85", "90", "99"]

# Name under which the plate generator reads the region codes; it refers to
# the very same list object as ``first_digit``.
RAYON_CODES = first_digit

def generate_car_plate():
    """
    Build one random car plate string shaped like ``NN-LL-DDD``.

    ``NN`` is a region code drawn from ``RAYON_CODES``, ``LL`` is two random
    uppercase ASCII letters and ``DDD`` is a number from 1 to 999 that is
    zero-padded to three digits (for example ``01-XX-001``).
    """
    # The random draws happen in this fixed order: region, letters, serial.
    region = random.choice(RAYON_CODES)
    letter_pair = random.choices(string.ascii_uppercase, k=2)
    serial = random.randint(1, 999)

    return "{}-{}-{:03d}".format(region, "".join(letter_pair), serial)

def generate_sentences_with_car_plates(num_sentences=1000):
    """
    Produce ``num_sentences`` sentences that each embed one car plate.

    Every sentence is made by picking one template at random and filling its
    single ``{}`` placeholder with a fresh ``generate_car_plate()`` result.
    The sentences are returned as a list of strings.
    """
    # Sentence patterns; each one contains exactly one ``{}`` slot for the plate.
    sentence_templates = [
        "Avtomobil n√∂mr…ôsi {} olan ma≈üƒ±n yolda gedirdi.",
        "{} n√∂mr…ôli avtomobil s√ºr…ôtl…ô ke√ßdi.",
        "M…ôn {} n√∂mr…ôsini g√∂rd√ºm.",
        "{} n√∂mr…ôli ma≈üƒ±n dayanacaqda idi.",
        "Polis {} n√∂mr…ôli avtomobili dayandƒ±rdƒ±.",
        "Bu g√ºn {} n√∂mr…ôsini qeyd etdim.",
        "{} n√∂mr…ôli avtomobil qƒ±rmƒ±zƒ± i≈üƒ±qda dayandƒ±.",
        "Qon≈üumun avtomobil n√∂mr…ôsi {}dƒ±r.",
        "{} n√∂mr…ôli ma≈üƒ±n √ßox s√ºr…ôtl…ô gedirdi.",
        "Avtomobil {} n√∂mr…ôsi il…ô qeydiyyatdan ke√ßib.",
        "M…ôn {} n√∂mr…ôli avtomobili tanƒ±yƒ±ram.",
        "{} n√∂mr…ôsind…ô olan ma≈üƒ±n aƒü r…ôngd…ôdir.",
        "D√ºn…ôn {} n√∂mr…ôli avtomobili g√∂rd√ºm.",
        "Bu {} n√∂mr…ôli ma≈üƒ±n kim…ô m…ôxsusdur?",
        "{} n√∂mr…ôli avtomobil yeni alƒ±nƒ±b.",
        "Parklama yerind…ô {} n√∂mr…ôsi var idi.",
        "{} n√∂mr…ôli ma≈üƒ±n t…ômir…ô ehtiyacƒ± var.",
        "Avtomobil n√∂mr…ôsi {} olan s√ºr√ºc√º t…ôcr√ºb…ôlidir.",
        "M…ônim dostumun avtomobil n√∂mr…ôsi {}dƒ±r.",
        "{} n√∂mr…ôli avtomobil bazarda satƒ±lƒ±r.",
        "Bu s…ôh…ôr {} n√∂mr…ôsini yolda g√∂rd√ºm.",
        "{} n√∂mr…ôli ma≈üƒ±n √ßox b√∂y√ºkd√ºr.",
        "Avtomobil {} n√∂mr…ôsi qara ma≈üƒ±nda yazƒ±lƒ±b.",
        "M…ôn {} n√∂mr…ôsini unutmu≈üdum.",
        "{} n√∂mr…ôli avtomobil h…ôft…ôsonu istifad…ô olunur.",
        "Bu {} avtomobil n√∂mr…ôsi √ßox maraqlƒ±dƒ±r.",
        "{} n√∂mr…ôli ma≈üƒ±n …ôla v…ôziyy…ôtd…ôdir.",
        "Avtomobil n√∂mr…ôsi {} yadda≈üƒ±mda qalƒ±b.",
        "D√ºn…ôn ax≈üam {} n√∂mr…ôli avtomobil g…ôldi.",
        "{} n√∂mr…ôsini polis…ô bildirdim.",
        "Hadis…ô yerind…ôn {} n√∂mr…ôli avtomobil uzaqla≈üdƒ±.",
        "≈û…ôh…ôr kameralarƒ± {} n√∂mr…ôli ma≈üƒ±nƒ± qeyd…ô aldƒ±.",
        "{} n√∂mr…ôsi il…ô icar…ôy…ô g√∂t√ºr√ºl…ôn avtomobil qaytarƒ±ldƒ±.",
        "T…ôhl√ºk…ôsizlik …ôm…ôkda≈üƒ± {} n√∂mr…ôsini soru≈üdu.",
        "Bu avtomobilin n√∂mr…ôsi {} olaraq qeyd edilib.",
        "Avtomobilin texniki baxƒ±≈üƒ± {} n√∂mr…ôsin…ô uyƒüundur.",
        "{} n√∂mr…ôli ma≈üƒ±n yol k…ônarƒ±nda saxlanƒ±lƒ±b.",
        "M…ôlumat bazasƒ±nda {} n√∂mr…ôsi il…ô axtarƒ±≈ü aparƒ±ldƒ±.",
        "O, √∂z avtomobilinin n√∂mr…ôsini, {} n√∂mr…ôsini xatƒ±rladƒ±.",
        "{} n√∂mr…ôli avtomobilin sahibi kimdir?",
        "K√∂m…ôk √º√ß√ºn {} n√∂mr…ôli ma≈üƒ±n √ßaƒüƒ±rƒ±ldƒ±.",
        "Avtomobil n√∂mr…ôsi {} qeydiyyatdan ke√ßdi.",
        "N…ôqliyyatƒ±n h…ôr…ôk…ôtini izl…ôm…ôk √º√ß√ºn {} n√∂mr…ôsind…ôn istifad…ô edildi.",
        "G√∂mr√ºkd…ô {} n√∂mr…ôli avtomobil yoxlanƒ±ldƒ±.",
        "Bu q…ôza il…ô baƒülƒ± {} n√∂mr…ôli avtomobilin adƒ± hallanƒ±r.",
        "{} n√∂mr…ôli ma≈üƒ±nƒ±n t…ôk…ôrl…ôri d…ôyi≈üdirildi.",
        "Avtomobilin n√∂mr…ôsi {} v…ô r…ôngi aƒüdƒ±r.",
        "Bu avtomobilin n√∂mr…ôsi {} olaraq d…ôyi≈üdiril…ôc…ôk.",
        "T…ôdbird…ô i≈ütirak ed…ôn h…ôr bir avtomobilin n√∂mr…ôsi, o c√ºml…ôd…ôn {} qeyd olundu."
    ]

    # For each sentence the template is drawn first and the plate second.
    return [
        random.choice(sentence_templates).format(generate_car_plate())
        for _ in range(num_sentences)
    ]

def save_dataset(sentences, filename="car_plate_dataset.txt"):
    """
    Write the sentences to a UTF-8 text file, one sentence per line.

    The file at ``filename`` is created or overwritten. Afterwards a short
    summary with the number of sentences and the file name is printed.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(line + '\n' for line in sentences)
    print(f"{len(sentences)} c√ºml…ô {filename} faylƒ±nda saxlanƒ±ldƒ±.")

def save_dataset_csv(sentences, filename="car_plate_dataset.csv"):
    """
    Write the sentences to a UTF-8 CSV file with columns ``sentence`` and ``car_plate``.

    The plate is recovered from each sentence with a regular expression
    (first match only); a sentence without a match produces no row. A
    confirmation message is printed once the file has been written.
    """
    plate_regex = re.compile(r'\d{2}-[A-Z]{2}-\d{3}')

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['sentence', 'car_plate'])

        for text in sentences:
            found = plate_regex.search(text)
            if found is None:
                # No plate in this sentence: it is left out of the CSV.
                continue
            writer.writerow([text, found.group()])

    print(f"CSV formatƒ±nda da {filename} faylƒ±nda saxlanƒ±ldƒ±.")

def main():
    """
    Generate the car plate sentences, preview them and write them to disk.

    Creates 30,000 sentences, prints the first ten together with the total
    count, then saves everything as a text file and as a CSV file using the
    default file names of the two save helpers.
    """
    sentences = generate_sentences_with_car_plates(30000)

    # Numbered preview of the first ten sentences.
    print("N√ºmun…ô c√ºml…ôl…ôr:")
    for position, preview in enumerate(sentences[:10], start=1):
        print(f"{position}. {preview}")

    print("\n" + "=" * 50)
    print(f"C…ômi {len(sentences)} c√ºml…ô yaradƒ±ldƒ±.")

    # Persist the same sentences in both output formats.
    save_dataset(sentences)
    save_dataset_csv(sentences)

if __name__ == "__main__":
    main()

Nümunə cümlələr:
1. Bu gün 72-MY-215 nömrəsini qeyd etdim.
2. Nəqliyyatın hərəkətini izləmək üçün 58-OH-587 nömrəsindən istifadə edildi.
3. Avtomobil 68-CB-083 nömrəsi qara maşında yazılıb.
4. 55-RO-777 nömrəli avtomobilin sahibi kimdir?
5. Təhlükəsizlik əməkdaşı 21-NJ-764 nömrəsini soruşdu.
6. Bu gün 16-GV-210 nömrəsini qeyd etdim.
7. 42-ZP-581 nömrəli maşın çox sürətlə gedirdi.
8. 31-GX-041 nömrəli maşın çox böyükdür.
9. Avtomobil nömrəsi 77-OG-934 qeydiyyatdan keçdi.
10. Tədbirdə iştirak edən hər bir avtomobilin nömrəsi, o cümlədən 46-NE-732 qeyd olundu.

Cəmi 30000 cümlə yaradıldı.
30000 cümlə car_plate_dataset.txt faylında saxlanıldı.
CSV formatında da car_plate_dataset.csv faylında saxlanıldı.


In [None]:
import csv
import json
import re

def find_car_plate_positions(text):
    """
    Locate every car plate in ``text`` and report its character span.

    Returns a list with one ``[start, end, "CAR_PLATE"]`` entry per
    non-overlapping regex match, in order of appearance; ``end`` is the
    index just past the match. The list is empty when nothing matches.
    """
    plate_regex = r'\d{2}-[A-Z]{2}-\d{3}'
    return [
        [hit.start(), hit.end(), "CAR_PLATE"]
        for hit in re.finditer(plate_regex, text)
    ]

def _annotate_sentence(sentence):
    """Return the ``[sentence, {"entities": [...]}]`` record for one sentence."""
    return [sentence, {"entities": find_car_plate_positions(sentence)}]


def annotate_csv_file(input_csv_file, output_json_file):
    """
    Annotate the sentences of a CSV file and store the result as JSON.

    The first column of every non-empty row is treated as a sentence and is
    paired with the car plate spans found by ``find_car_plate_positions``.
    The first row is skipped when its first cell, compared
    case-insensitively, is one of the known header names.

    Args:
        input_csv_file: Path of the UTF-8 CSV file to read.
        output_json_file: Path of the JSON file to create or overwrite.

    Returns:
        The list of ``[sentence, {"entities": [...]}]`` records that was
        written, or ``None`` when the CSV file contains no rows at all (in
        that case a message is printed and no JSON file is written).
    """
    header_names = ('sentence', 'text', 'c√ºmle')
    annotated_data = []

    # newline='' is what the csv module asks for when reading, so that line
    # breaks inside quoted fields reach the reader untouched.
    with open(input_csv_file, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)

        first_row = next(reader, None)
        if first_row is None:
            print("CSV faylƒ± bo≈üdur!")
            return None

        # A blank first line is parsed as an empty list; it is skipped like
        # any other blank row instead of being indexed (which used to raise
        # IndexError).
        if first_row and first_row[0].lower() not in header_names:
            annotated_data.append(_annotate_sentence(first_row[0]))

        # Remaining rows: blank rows are ignored, the first column is the sentence.
        annotated_data.extend(_annotate_sentence(row[0]) for row in reader if row)

    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(annotated_data, f, ensure_ascii=False, indent=2)

    print(f"{len(annotated_data)} c√ºml…ô annotate edildi v…ô {output_json_file} faylƒ±nda saxlanƒ±ldƒ±.")

    return annotated_data

def print_sample_results(data, num_samples=5):
    """
    Print a readable preview of the first annotated records.

    Args:
        data: Sequence of ``(sentence, annotation)`` pairs where
            ``annotation['entities']`` holds ``(start, end, label)`` triples.
        num_samples: Maximum number of records to show.

    For every shown record the sentence is printed, followed either by each
    plate substring with its span or, when the record has no entities, by a
    "not found" line. A dashed separator closes each record.
    """
    shown = min(num_samples, len(data))
    print(f"\nN√ºmun…ô {shown} annotated c√ºml…ô:")
    print("=" * 80)

    for number, (sentence, annotation) in enumerate(data[:num_samples], start=1):
        print(f"{number}. {sentence}")
        found = annotation['entities']
        if not found:
            print("   -> Avtomobil n√∂mr…ôsi tapƒ±lmadƒ±")
        else:
            for start, end, _label in found:
                # Slice the plate text back out of the sentence for display.
                print(f"   -> Tapƒ±lan: '{sentence[start:end]}' (m√∂vqe: {start}-{end})")
        print("-" * 40)

def main():
    """
    Interactively annotate a CSV file of sentences and preview the result.

    Prompts for the CSV file name (an empty answer selects
    ``/content/car_plate_dataset.csv``), writes the annotations to
    ``<name>_annotated.json`` and prints a few sample records. Problems are
    reported on stdout instead of being raised.
    """
    input_csv = input("CSV fayl adƒ±nƒ± daxil edin (m…ôs: sentences.csv): ").strip()
    if not input_csv:
        input_csv = "/content/car_plate_dataset.csv"

    # Only a trailing ".csv" is swapped for the JSON suffix. A blanket
    # str.replace would also rewrite ".csv" in the middle of the path, and for
    # a name without that extension it would return the input path unchanged,
    # so the JSON output would overwrite the input file.
    if input_csv.lower().endswith('.csv'):
        output_json = input_csv[:-len('.csv')] + '_annotated.json'
    else:
        output_json = input_csv + '_annotated.json'

    try:
        data = annotate_csv_file(input_csv, output_json)

        # annotate_csv_file returns None (after printing its own message)
        # when the CSV has no rows; there is nothing to preview and no output
        # file to report in that case.
        if data is None:
            return

        print_sample_results(data)

        print(f"\n‚úÖ Uƒüurla tamamlandƒ±!")
        print(f"üìÅ Input: {input_csv}")
        print(f"üìÅ Output: {output_json}")

    except FileNotFoundError:
        print(f"‚ùå X∆èTA: '{input_csv}' faylƒ± tapƒ±lmadƒ±!")
    except Exception as e:
        # Last-resort boundary of the interactive script: show the error
        # text to the user rather than a traceback.
        print(f"‚ùå X∆èTA: {str(e)}")

if __name__ == "__main__":
    main()

CSV fayl adını daxil edin (məs: sentences.csv): 
30000 cümlə annotate edildi və /content/car_plate_dataset_annotated.json faylında saxlanıldı.

Nümunə 5 annotated cümlə:
1. Bu gün 72-MY-215 nömrəsini qeyd etdim.
   -> Tapılan: '72-MY-215' (mövqe: 7-16)
----------------------------------------
2. Nəqliyyatın hərəkətini izləmək üçün 58-OH-587 nömrəsindən istifadə edildi.
   -> Tapılan: '58-OH-587' (mövqe: 36-45)
----------------------------------------
3. Avtomobil 68-CB-083 nömrəsi qara maşında yazılıb.
   -> Tapılan: '68-CB-083' (mövqe: 10-19)
----------------------------------------
4. 55-RO-777 nömrəli avtomobilin sahibi kimdir?
   -> Tapılan: '55-RO-777' (mövqe: 0-9)
----------------------------------------
5. Təhlükəsizlik əməkdaşı 21-NJ-764 nömrəsini soruşdu.
   -> Tapılan: '21-NJ-764' (mövqe: 23-32)
----------------------------------------

✅ Uğurla tamamlandı!
📁 Input: /content/car_plate_dataset.csv
📁 Output: /c

In [None]:
import random

def random_aze_id():
    """
    Return a random ID string made of the prefix ``AZE`` and nine decimal digits.

    Each digit is drawn independently with ``random.randint(0, 9)``.
    """
    suffix = ''.join(str(random.randint(0, 9)) for _ in range(9))
    return f"AZE{suffix}"

def random_aa_id():
    """
    Return a random ID string made of the prefix ``AA`` and seven decimal digits.

    Each digit is drawn independently with ``random.randint(0, 9)``.
    """
    suffix = ''.join(str(random.randint(0, 9)) for _ in range(7))
    return f"AA{suffix}"

# Sentence templates for ID numbers. Every entry contains exactly one ``{}``
# placeholder, which generate_sentences() fills with the generated ID via
# str.format().
templates = [
    "M…ônim ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôm {}-dir.",
    "≈û…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi {}-dir.",
    "V…ôsiq…ô n√∂mr…ôm {}-dir.",
    "Z…ôhm…ôt olmasa, ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ônizi qeyd edin: {}",
    "Adƒ±: Bahram Zada, V…ôsiq…ô n√∂mr…ôsi: {}",
    "≈û…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi: {}",
    "M…ônim {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ômdir.",
    "≈û…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsini yoxlamaq √º√ß√ºn {} daxil edin.",
    "V…ôsiq…ô n√∂mr…ôsi olmadan qeydiyyat m√ºmk√ºn deyil: {}",
    "Sizin ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôniz {}-d√ºr?",
    "Qeydiyyat √º√ß√ºn ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi t…ôl…ôb olunur: {}",
    "ƒ∞stifad…ô√ßinin ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi: {}",
    "A≈üaƒüƒ±da g√∂st…ôril…ôn v…ôsiq…ô n√∂mr…ôsini yoxlayƒ±n: {}",
    "Sistem…ô giri≈ü √º√ß√ºn {} v…ôsiq…ô n√∂mr…ôsini daxil edin.",
    "S…ôn…ôd m…ôlumatlarƒ±: ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi - {}",
    "{} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsini t…ôsdiql…ôyin.",
    "Yuxarƒ±da qeyd olunan {} v…ôsiq…ô n√∂mr…ôsidir.",
    "∆èlav…ô m…ôlumat √º√ß√ºn {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsini istifad…ô edin.",
    "≈û…ôxsiyy…ôtinizi t…ôsdiql…ôm…ôk √º√ß√ºn {} n√∂mr…ôsini yazƒ±n.",
    "∆èg…ôr ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôniz {}-dirs…ô, davam edin.",
    "Qeydiyyat zamanƒ± istifad…ô etdiyiniz v…ôsiq…ô n√∂mr…ôsi: {}",
    "≈û…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôniz bir daha t…ôsdiql…ônir: {}",
    "Formada yazƒ±lan ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi {}-dir.",
    "Sizd…ôn t…ôl…ôb olunan ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi: {}",
    "A≈üaƒüƒ±da g√∂st…ôril…ôn {} n√∂mr…ôsi sizin v…ôsiq…ônizdir.",
    "Profilinizd…ô qeyd olunan ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi: {}",
    "≈û…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi olmadan qeydiyyat m√ºmk√ºn deyil, n√∂mr…ôniz: {}",
    "Sistem…ô daxil olmaq √º√ß√ºn ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ônizi {} daxil edin.",
    "Qeydiyyat formasƒ±nda {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsini yazƒ±n.",
    "M…ôlumat bazasƒ±nda saxlanƒ±lan v…ôsiq…ô n√∂mr…ôsi: {}",
    "M√ºraci…ôt √º√ß√ºn {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi vacibdir.",
    "Yalnƒ±z {} n√∂mr…ôsi olan ≈ü…ôxs xidm…ôtd…ôn yararlana bil…ôr.",
    "{} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsi il…ô …ôlaq…ôli s…ôn…ôdl…ôr q…ôbul edildi.",
    "√ñd…ôni≈üi etm…ôk √º√ß√ºn {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ônizi g√∂st…ôrin.",
    "∆èriz…ôy…ô {} v…ôsiq…ô n√∂mr…ôsi il…ô m√ºraci…ôt ed…ô bil…ôrsiniz.",
    "H…ôr hansƒ± bir d…ôyi≈üiklik √º√ß√ºn {} n√∂mr…ôsi t…ôl…ôb olunur.",
    "{} n√∂mr…ôsi yoxlandƒ± v…ô t…ôsdiq edildi.",
    "Bu hesabƒ±n sahibi {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsin…ô malikdir.",
    "V…ôsiq…ônin √ºz…ôrind…ô {} n√∂mr…ôsi qeyd olunub.",
    "{} n√∂mr…ôsi il…ô baƒülƒ± b√ºt√ºn m…ôlumatlar doƒürudur.",
    "Giri≈ü √º√ß√ºn {} ≈ü…ôxsiyy…ôt v…ôsiq…ô n√∂mr…ôsini yenid…ôn daxil edin.",
    "L√ºtf…ôn, {} n√∂mr…ôsini …ôlav…ô edin.",
    "Bu ≈ü…ôxsiyy…ôt v…ôsiq…ôsi n√∂mr…ôsi, {} yeni verilmi≈üdir.",
    "Sƒ±ƒüorta …ôm…ôliyyatƒ± {} n√∂mr…ôsin…ô …ôsaslanƒ±r.",
    "T…ôsdiql…ôm…ô kodu {} v…ôsiq…ô n√∂mr…ôniz…ô g√∂nd…ôrildi.",
    "{} v…ôsiq…ô n√∂mr…ôsi m…ôlumat bazasƒ±na daxil edilmi≈üdir.",
    "Xidm…ôt haqqƒ±nƒ± √∂d…ôm…ôk √º√ß√ºn {} n√∂mr…ôsi lazƒ±mdƒ±r.",
    "Sistemd…ô {} n√∂mr…ôsi il…ô baƒülƒ± he√ß bir qeyd tapƒ±lmadƒ±.",
    "√ñd…ôni≈üin t…ôsdiqi √º√ß√ºn {} n√∂mr…ôsini g√∂st…ôrin.",
    "{} n√∂mr…ôsi il…ô baƒülƒ± b√ºt√ºn m…ôlumatlar qorunur."
]

def generate_sentences(n=1000, aze_ratio=0.7):
    """
    Build ``n`` sentences, each a random template filled with a random ID number.

    Args:
        n: Number of sentences to create.
        aze_ratio: Probability that a sentence gets an ``AZE``-style ID from
            ``random_aze_id``; otherwise an ``AA``-style ID from
            ``random_aa_id`` is used. The default gives roughly 70% / 30%.

    Returns:
        List of the generated sentence strings.
    """
    def _one_sentence():
        # Fixed draw order: template first, then the AZE/AA decision, then the ID.
        chosen = random.choice(templates)
        make_id = random_aze_id if random.random() < aze_ratio else random_aa_id
        return chosen.format(make_id())

    return [_one_sentence() for _ in range(n)]

if __name__ == "__main__":
    # When run as a script: create 30,000 ID sentences and store them, one
    # per line, in a UTF-8 text file.
    sentences = generate_sentences(30000)
    with open("vesiqe_numune.txt", "w", encoding="utf-8") as f:
        f.writelines(sentence + "\n" for sentence in sentences)

In [None]:
import re
import json

# Regular expressions for the two supported ID number layouts.
pattern_aze = r'AZE\d{9}'         # "AZE" followed by 9 digits
pattern_aa = r'AA\d{7}'           # "AA" followed by 7 digits

label = "ID_NUMBER"  # Entity label attached to every ID number span

def label_sentences(sentences):
    """
    Annotate AZE and AA ID numbers in the sentences with character spans.

    Returns a list of ``(sentence, {"entities": [(start, end, label), ...]})``
    tuples. Within one sentence all AZE matches come first, followed by all
    AA matches. Sentences without any match are left out of the result.
    """
    labeled_data = []
    for text in sentences:
        spans = []
        # AZE pattern is scanned before the AA pattern to keep the span order.
        for id_pattern in (pattern_aze, pattern_aa):
            spans.extend(
                (hit.start(), hit.end(), label)
                for hit in re.finditer(id_pattern, text)
            )
        if not spans:
            continue
        labeled_data.append((text, {"entities": spans}))
    return labeled_data

# Load the generated sentences, one per line, with surrounding whitespace removed.
with open("vesiqe_numune.txt", "r", encoding="utf-8") as f:
    sentences = list(map(str.strip, f))

# Attach ID number spans to the sentences.
labeled_data = label_sentences(sentences)

# Store the annotations as indented JSON, keeping non-ASCII characters as-is.
with open("vesiqe_annotated.json", "w", encoding="utf-8") as f:
    json.dump(labeled_data, f, ensure_ascii=False, indent=2)

In [None]:
import random

def random_fin():
    """
    Return a random 7-character FIN code.

    Every character is drawn independently from the uppercase English
    letters without ``I`` and ``O`` plus the digits 0-9.
    """
    # 24 letters (A-Z minus I and O) followed by the 10 digits, in the same
    # order as a letters-then-digits list, so seeded runs give the same codes.
    allowed = "ABCDEFGHJKLMNPQRSTUVWXYZ" + "0123456789"
    picks = [random.choice(allowed) for _ in range(7)]
    return ''.join(picks)

# Sentence templates for FIN codes. Every entry contains exactly one ``{}``
# placeholder, which generate_fin_sentences() fills with the generated FIN
# code via str.format().
fin_templates = [
    "M…ônim Fƒ∞N kodum {}-dur.",
    "Fƒ∞N kodu: {}",
    "Z…ôhm…ôt olmasa, Fƒ∞N kodunuzu daxil edin: {}",
    "Sizin Fƒ∞N kodunuz {}-dir?",
    "Qeydiyyat √º√ß√ºn t…ôl…ôb olunan Fƒ∞N kodu: {}",
    "Formada yazƒ±lan Fƒ∞N kodu {}-dir.",
    "∆èlav…ô m…ôlumat √º√ß√ºn Fƒ∞N kodu: {}",
    "Profilinizd…ô qeyd olunan Fƒ∞N kodu: {}",
    "Fƒ∞N kodunuzu yoxlamaq √º√ß√ºn {} daxil edin.",
    "Fƒ∞N kodu olmadan qeydiyyat m√ºmk√ºn deyil: {}",
    "Sistemd…ô qeydiyyatdan ke√ßm…ôk √º√ß√ºn Fƒ∞N kodunuzu daxil edin: {}",
    "Sizin ≈ü…ôxsiyy…ôtinizi t…ôsdiql…ôy…ôn Fƒ∞N kod: {}",
    "Fƒ∞N kodunuz bir daha t…ôsdiql…ônir: {}",
    "A≈üaƒüƒ±da g√∂st…ôril…ôn Fƒ∞N kodunu yoxlayƒ±n: {}",
    "S…ôn…ôd m…ôlumatlarƒ±: Fƒ∞N kodu - {}",
    "Qeydiyyat formasƒ±nda {} Fƒ∞N kodunu yazƒ±n.",
    "M…ôlumat bazasƒ±nda saxlanƒ±lan Fƒ∞N kodu: {}",
    "Sizd…ôn t…ôl…ôb olunan Fƒ∞N kodu: {}",
    "Fƒ∞N kodu olmadan …ôm…ôliyyat davam etmir: {}",
    "Sistem…ô giri≈ü √º√ß√ºn {} Fƒ∞N kodunu daxil edin.",
    "S…ôn…ôdin √ºz…ôrind…ô yazƒ±lmƒ±≈ü Fƒ∞N kodu {}-dir.",
    "Fƒ∞N kodunuzu t…ôsdiql…ôyin: {}",
    "Fƒ∞N kodu sah…ôsin…ô {} yazƒ±n.",
    "≈û…ôxsiyy…ôti t…ôsdiql…ôm…ôk √º√ß√ºn {} Fƒ∞N kodunu daxil edin.",
    "Sizin √º√ß√ºn yaradƒ±lmƒ±≈ü Fƒ∞N kodu: {}",
    "S…ôn…ôd…ô …ôlav…ô olunan Fƒ∞N kodu: {}",
    "Fƒ∞N kodu t…ôl…ôb olunduqda {} t…ôqdim edin.",
    "Sistemd…ô m√∂vcud Fƒ∞N kodu: {}",
    "Fƒ∞N kodunu d…ôyi≈üm…ôk √º√ß√ºn k√∂hn…ô kod: {}",
    "∆èg…ôr Fƒ∞N kodunuz {}-dirs…ô, davam edin.",
    "Fƒ∞N kodunuzu unutmusunuzsa, yeni kod alƒ±n: {}",
    "T…ôsdiql…ônmi≈ü Fƒ∞N kodu: {}",
    "Yeni qeydiyyat √º√ß√ºn Fƒ∞N kodu: {}",
    "Fƒ∞N kodu olmadan qeydiyyat m√ºmk√ºn deyil, kodunuz: {}",
    "A≈üaƒüƒ±da g√∂st…ôril…ôn {} kodu sizin Fƒ∞N kodunuzdur.",
    "Fƒ∞N kodu il…ô baƒülƒ± sualƒ±nƒ±z varsa, kod: {}",
    "M…ôlumat formasƒ±nda Fƒ∞N kodu: {}",
    "Sistem…ô daxil olmaq √º√ß√ºn Fƒ∞N kodunuzu {} yazƒ±n.",
    "Fƒ∞N kodunuzun d√ºzg√ºnl√ºy√ºn√º yoxlayƒ±n: {}",
    "≈û…ôxsiyy…ôt v…ôsiq…ôsi v…ô Fƒ∞N kodu: {}",
    "Fƒ∞N kodu olmadan s…ôn…ôd q…ôbul edilmir: {}",
    "Fƒ∞N kodu qutusuna {} yazƒ±n.",
    "Fƒ∞N kodu s…ôhvdirs…ô, yenisini daxil edin: {}",
    "H…ôr hansƒ± …ôm…ôliyyat √º√ß√ºn Fƒ∞N kodu {} m√ºtl…ôqdir.",
    "M…ôlumatlarƒ±nƒ±zƒ±n t…ôhl√ºk…ôsizliyi √º√ß√ºn Fƒ∞N kodunuzu {} il…ô t…ôsdiql…ôyin.",
    "Daxil etdiyiniz Fƒ∞N kodu {} m…ôlumatlarƒ±mƒ±zla uyƒüun g…ôlmir.",
    "Bank xidm…ôtl…ôrind…ôn istifad…ô √º√ß√ºn Fƒ∞N kodunuz: {}",
    "≈û…ôxsi m…ôlumatlarƒ±nƒ±zƒ± yoxlamaq √º√ß√ºn {} Fƒ∞N kodunuzu daxil edin.",
    "Fƒ∞N kodu {} il…ô baƒülƒ± b√ºt√ºn s…ôn…ôdl…ôr q…ôbul edildi.",
    "ƒ∞stifad…ô√ßinin Fƒ∞N kodu {} olaraq qeyd olundu.",
    "Bu s…ôn…ôd…ô uyƒüun Fƒ∞N kodu {}-dir.",
    "Yeni bank kartƒ± almaq √º√ß√ºn {} Fƒ∞N kodu t…ôl…ôb olunur.",
    "Fƒ∞N kodu {} olan ≈ü…ôxs sistem…ô daxil oldu.",
    "Sistemd…ô {} Fƒ∞N kodu il…ô axtarƒ±≈ü aparƒ±ldƒ±.",
    "{} Fƒ∞N kodu il…ô √∂d…ôni≈ü uƒüurla tamamlandƒ±.",
    "Vergi √∂d…ôni≈üi √º√ß√ºn Fƒ∞N kodunuzu {} yazƒ±n.",
    "Maliyy…ô …ôm…ôliyyatlarƒ±nƒ± yoxlamaq √º√ß√ºn {} Fƒ∞N kodunu t…ôqdim edin.",
]

def generate_fin_sentences(n=500):
    """
    Build ``n`` sentences, each a random FIN template filled with a random FIN code.

    For every sentence the template is drawn from ``fin_templates`` first and
    the code is produced by ``random_fin()`` second. Returns the sentences as
    a list of strings.
    """
    return [
        random.choice(fin_templates).format(random_fin())
        for _ in range(n)
    ]

if __name__ == "__main__":
    # When run as a script: create 30,000 FIN sentences and store them, one
    # per line, in a UTF-8 text file.
    sentences = generate_fin_sentences(30000)
    with open("fin_numune.txt", "w", encoding="utf-8") as f:
        f.writelines(sentence + "\n" for sentence in sentences)

In [None]:
import re
import json

# FIN code pattern: exactly 7 characters between word boundaries, taken from
# the uppercase letters without "I" and "O" (A-H, J-N, P-Z) and the digits 0-9.
pattern_fin = r'\b[A-HJ-NP-Z0-9]{7}\b'
label = "FIN_CODE"  # Entity label attached to every FIN code span

def label_sentences(sentences):
    """
    Annotate FIN codes in the sentences with character spans.

    Returns a list of ``(sentence, {"entities": [(start, end, label), ...]})``
    tuples with the matches in order of appearance. Sentences without any
    match are left out of the result.
    """
    labeled_data = []
    for text in sentences:
        spans = [
            (hit.start(), hit.end(), label)
            for hit in re.finditer(pattern_fin, text)
        ]
        if spans:
            labeled_data.append((text, {"entities": spans}))
    return labeled_data

# Load the generated sentences, one per line, with surrounding whitespace removed.
with open("fin_numune.txt", "r", encoding="utf-8") as f:
    sentences = list(map(str.strip, f))

# Attach FIN code spans to the sentences.
labeled_data = label_sentences(sentences)

# Store the annotations as indented JSON, keeping non-ASCII characters as-is.
with open("fin_annotated.json", "w", encoding="utf-8") as f:
    json.dump(labeled_data, f, ensure_ascii=False, indent=2)

In [None]:
!rm -r /content/sample_data /content/car_plate_dataset.csv /content/car_plate_dataset.txt /content/fin_numune.txt /content/vesiqe_numune.txt

In [None]:
# EN: Imports
# AZ: Import-lar
import json
import torch
import numpy as np
import pandas as pd
import re
from typing import List, Tuple, Dict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')  # Suppress all warnings

print("‚úÖ B√ºt√ºn kitabxanalar y√ºkl…ôndi")  # Message: all libraries loaded
print(f"üî• PyTorch versiyasƒ±: {torch.__version__}")  # Installed PyTorch version
print(f"üöÄ CUDA m√∂vcuddur: {torch.cuda.is_available()}")  # Whether CUDA is available

# Select the compute device: "cuda" when torch reports CUDA as available,
# otherwise "cpu".
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üíª ƒ∞stifad…ô edil…ôc…ôk device: {device}")  # Device that will be used

✅ Bütün kitabxanalar yükləndi
🔥 PyTorch versiyası: 2.8.0+cu126
🚀 CUDA mövcuddur: True
💻 İstifadə ediləcək device: cuda


In [None]:
# Model and training configuration.
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LEN = 128
BATCH_SIZE = 16
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

# BIO tag set: "O" plus a B-/I- pair for each of the three entity types.
LABELS = [
    'O',           # Outside any labelled entity
    'B-PLATE',     # Beginning of a car plate
    'I-PLATE',     # Inside (continuation of) a car plate
    'B-FIN',       # Beginning of a FIN code
    'I-FIN',       # Inside (continuation of) a FIN code
    'B-ID',        # Beginning of an ID number
    'I-ID'         # Inside (continuation of) an ID number
]

# Lookup tables between tag names and their integer ids (positions in LABELS).
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}

# Print a summary of the configuration, one "name: value" line per setting.
print("üìã Konfiqurasiya:")
print("\n".join(
    f"{title}: {value}"
    for title, value in (
        ("Model", MODEL_NAME),
        ("Max Length", MAX_LEN),
        ("Batch Size", BATCH_SIZE),
        ("Epochs", NUM_EPOCHS),
        ("Learning Rate", LEARNING_RATE),
        ("Labels", LABELS),
    )
))

📋 Konfiqurasiya:
Model: bert-base-multilingual-cased
Max Length: 128
Batch Size: 16
Epochs: 3
Learning Rate: 2e-05
Labels: ['O', 'B-PLATE', 'I-PLATE', 'B-FIN', 'I-FIN', 'B-ID', 'I-ID']


In [None]:
def validate_entity_format(text: str, entity_type: str) -> bool:
    """
    Check whether ``text`` has the expected shape for the given entity type.

    Format rules:
    - ``'FIN'``: exactly 7 uppercase letters or digits, e.g. ``AZEDF12``.
    - ``'ID'`` starting with ``AA``: ``AA`` + 7 digits, e.g. ``AA1234567``.
    - ``'ID'`` starting with ``AZE``: ``AZE`` + 9 digits, e.g. ``AZE123456789``.
    - any other ``'ID'``: 2-3 uppercase letters followed by 6-9 digits.
    - ``'PLATE'``: ``NN-LL-DDD`` (digits, uppercase letters, digits),
      e.g. ``90-AB-123``.

    Args:
        text: Candidate entity text.
        entity_type: One of ``'FIN'``, ``'ID'`` or ``'PLATE'``.

    Returns:
        ``True`` when the whole of ``text`` matches the rule for
        ``entity_type``; ``False`` otherwise, including for unknown types.
    """
    if entity_type == 'FIN':
        pattern = r'[A-Z0-9]{7}'
    elif entity_type == 'ID':
        # The AA / AZE prefixes have fixed layouts; a text with one of these
        # prefixes is never checked against the looser generic rule.
        if text.startswith('AA'):
            pattern = r'AA\d{7}'
        elif text.startswith('AZE'):
            pattern = r'AZE\d{9}'
        else:
            pattern = r'[A-Z]{2,3}\d{6,9}'
    elif entity_type == 'PLATE':
        pattern = r'\d{2}-[A-Z]{2}-\d{3}'
    else:
        return False

    # fullmatch anchors both ends of the string, so the separate length
    # checks are unnecessary and a trailing newline is no longer tolerated
    # (as "$" allowed). Comparing with None yields a real bool instead of a
    # Match object or None.
    return re.fullmatch(pattern, text) is not None

def show_validation_rules():
    """
    Print the entity format rules (in Azerbaijani) to stdout.

    A header line is printed first, followed by one line per rule: FIN code,
    ID starting with AA, ID starting with AZE, and car plate.
    """
    rule_lines = (
        "üìè Entity Format Qaydalarƒ±:",
        "üîπ FIN kod: 7 simvol (h…ôrf+r…ôq…ôm) - m…ôs…ôl…ôn: AZEDF12",
        "üîπ ID/AA: 9 simvol (AA + 7 r…ôq…ôm) - m…ôs…ôl…ôn: AA1234567",
        "üîπ ID/AZE: 12 simvol (AZE + 9 r…ôq…ôm) - m…ôs…ôl…ôn: AZE123456789",
        "üîπ Avtomobil: XX-YY-ZZZ formatƒ± - m…ôs…ôl…ôn: 90-AB-123",
    )
    for line in rule_lines:
        print(line)


# Show the rules once when the cell runs.
show_validation_rules()

üìè Entity Format Qaydalarƒ±:
üîπ FIN kod: 7 simvol (h…ôrf+r…ôq…ôm) - m…ôs…ôl…ôn: AZEDF12
üîπ ID/AA: 9 simvol (AA + 7 r…ôq…ôm) - m…ôs…ôl…ôn: AA1234567
üîπ ID/AZE: 12 simvol (AZE + 9 r…ôq…ôm) - m…ôs…ôl…ôn: AZE123456789
üîπ Avtomobil: XX-YY-ZZZ formatƒ± - m…ôs…ôl…ôn: 90-AB-123


In [None]:
def convert_spacy_to_bio(text: str, entities: list) -> Tuple[List[str], List[str]]:
    """
    Turn spaCy-style character-span annotations into per-token BIO labels.

    The text is split on whitespace. Every token whose character range
    overlaps an entity span is tagged: the first overlapping token gets
    ``B-<TAG>``, the following ones ``I-<TAG>``. Entity types are mapped
    CAR_PLATE -> PLATE, FIN_CODE -> FIN, ID_NUMBER -> ID; spans of any other
    type are ignored.

    Args:
        text: The raw sentence.
        entities: Iterable of ``(start_char, end_char, entity_type)`` triples.

    Returns:
        ``(tokens, labels)``, two lists of equal length.
    """
    type_to_tag = {
        "CAR_PLATE": "PLATE",
        "FIN_CODE": "FIN",
        "ID_NUMBER": "ID",
    }

    tokens = text.split()

    # Character range of each token; searching from the end of the previous
    # token keeps repeated words mapped to their own occurrence.
    spans = []
    cursor = 0
    for word in tokens:
        begin = text.find(word, cursor)
        cursor = begin + len(word)
        spans.append((begin, cursor))

    labels = ['O' for _ in tokens]

    for start_char, end_char, entity_type in entities:
        tag = type_to_tag.get(entity_type)
        if tag is None:
            continue

        # Tokens that overlap the half-open span [start_char, end_char).
        covered = [
            idx
            for idx, (begin, finish) in enumerate(spans)
            if begin < end_char and finish > start_char
        ]

        for rank, idx in enumerate(covered):
            prefix = "B" if rank == 0 else "I"
            labels[idx] = f"{prefix}-{tag}"

    return tokens, labels

def load_spacy_json_from_path(file_path: str):
    """
    Read a spaCy-format JSON dataset and convert it to token/BIO-label lists.

    Each item of the JSON array is read as ``[text, annotations]``, where
    ``annotations`` may carry an ``'entities'`` list; each sentence is passed
    through ``convert_spacy_to_bio``.

    Args:
        file_path: Path of the JSON file.

    Returns:
        ``(texts, labels)``: parallel lists of token lists and label lists.
        On a missing file or any other error a message is printed and
        ``([], [])`` is returned instead of raising.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)

        texts, labels = [], []
        for record in records:
            sentence = record[0]
            annotations = record[1]
            spans = annotations.get('entities', [])

            words, tags = convert_spacy_to_bio(sentence, spans)
            texts.append(words)
            labels.append(tags)

        print(f"‚úÖ {file_path} uƒüurla y√ºkl…ôndi: {len(texts)} n√ºmun…ô")
        return texts, labels

    except FileNotFoundError:
        print(f"‚ùå Fayl tapƒ±lmadƒ±: {file_path}")
    except Exception as e:
        # Best-effort loader: report the problem and fall through to the
        # empty result so the caller can skip this dataset.
        print(f"‚ùå X…ôta ba≈ü verdi {file_path}: {e}")

    return [], []

def load_datasets_from_paths(file_paths: List[str]):
    """
    Load several spaCy-format JSON datasets, skipping the ones that fail.

    Args:
        file_paths: Paths handed one by one to ``load_spacy_json_from_path``.

    Returns:
        A list of ``(texts, labels)`` pairs, one per dataset that yielded at
        least one sample. A progress line is printed for every path.
    """
    loaded = []

    for number, path in enumerate(file_paths, start=1):
        print(f"\nüìÅ Dataset {number} y√ºkl…ônir: {path}")
        texts, labels = load_spacy_json_from_path(path)

        # An empty result means the file was missing, unreadable or empty.
        if not texts:
            print(f"‚ùå Dataset {number} y√ºkl…ônm…ôdi")
            continue

        loaded.append((texts, labels))
        print(f"‚úÖ Dataset {number} y√ºkl…ôndi: {len(texts)} n√ºmun…ô")

    return loaded

def combine_datasets(datasets: List[Tuple[List[List[str]], List[List[str]]]]):
    """
    Concatenate several ``(texts, labels)`` datasets into a single one.

    Args:
        datasets: List of ``(texts, labels)`` pairs.

    Returns:
        ``(all_texts, all_labels)`` with the samples of every dataset in the
        order given. The size of each part and the total are printed.
    """
    merged_texts: List[List[str]] = []
    merged_labels: List[List[str]] = []

    for number, pair in enumerate(datasets, start=1):
        part_texts, part_labels = pair
        print(f"Dataset {number}: {len(part_texts)} n√ºmun…ô")
        merged_texts += part_texts
        merged_labels += part_labels

    print(f"üìä √úmumi: {len(merged_texts)} n√ºmun…ô")
    return merged_texts, merged_labels


# Progress message: dataset loading helpers are defined.
print("‚úÖ Dataset y√ºkl…ôm…ô funksiyalarƒ± hazƒ±rlandƒ±")

‚úÖ Dataset y√ºkl…ôm…ô funksiyalarƒ± hazƒ±rlandƒ±


In [None]:
def analyze_dataset(texts: List[List[str]], labels: List[List[str]]):
    """
    Print summary statistics for a token/BIO-label dataset.

    Reports the number of sentences, the number of tokens (counted from the
    label sequences), the frequency and share of every label, and up to three
    example strings per entity type. An example is the entity's tokens joined
    without a separator.

    Args:
        texts: One token list per sentence.
        labels: One BIO label list per sentence, parallel to ``texts``.
    """
    label_counts = {}
    entity_examples = {}
    total_tokens = 0

    for words, tags in zip(texts, labels):
        total_tokens += len(tags)

        # Only positions that have both a token and a label can start or
        # continue an entity.
        limit = min(len(words), len(tags))
        pos = 0
        while pos < limit:
            tag = tags[pos]
            if not tag.startswith('B-'):
                pos += 1
                continue

            kind = tag[2:]
            continuation = f'I-{kind}'
            stop = pos + 1
            while stop < limit and tags[stop] == continuation:
                stop += 1

            samples = entity_examples.setdefault(kind, [])
            if len(samples) < 3:
                samples.append(''.join(words[pos:stop]))
            pos = stop

        for tag in tags:
            label_counts[tag] = label_counts.get(tag, 0) + 1

    print("\nüìà Dataset Statistikasƒ±:")
    print(f"√úmumi c√ºml…ô sayƒ±: {len(texts)}")
    print(f"√úmumi token sayƒ±: {total_tokens}")
    print("\nüè∑Ô∏è Label paylanmasƒ±:")

    for tag, count in sorted(label_counts.items()):
        share = (count / total_tokens) * 100
        print(f"{tag}: {count} ({share:.1f}%)")

    print("\nüìù Entity n√ºmun…ôl…ôri:")
    for kind, samples in entity_examples.items():
        print(f"{kind}: {', '.join(samples[:3])}")

def validate_dataset_entities(texts: List[List[str]], labels: List[List[str]]):
    """
    Count how many labelled entities satisfy ``validate_entity_format``.

    Every ``B-<TYPE>`` label together with its following ``I-<TYPE>`` labels
    forms one entity; its tokens are joined without a separator and checked
    against the format rule of that type. For each of FIN, ID and PLATE that
    occurs at least once, a ``valid/total`` line with a percentage is printed.

    Args:
        texts: One token list per sentence.
        labels: One BIO label list per sentence, parallel to ``texts``.
    """
    valid_entities = {'FIN': 0, 'ID': 0, 'PLATE': 0}
    invalid_entities = {'FIN': 0, 'ID': 0, 'PLATE': 0}

    for words, tags in zip(texts, labels):
        # Only positions that have both a token and a label can belong to an
        # entity.
        limit = min(len(words), len(tags))
        pos = 0
        while pos < limit:
            tag = tags[pos]
            if not tag.startswith('B-'):
                pos += 1
                continue

            kind = tag[2:]
            continuation = f'I-{kind}'
            stop = pos + 1
            while stop < limit and tags[stop] == continuation:
                stop += 1

            surface = ''.join(words[pos:stop])
            if validate_entity_format(surface, kind):
                valid_entities[kind] += 1
            else:
                invalid_entities[kind] += 1

            pos = stop

    print("\n‚úÖ Validation N…ôtic…ôl…ôri:")
    for entity_type, valid_count in valid_entities.items():
        total = valid_count + invalid_entities[entity_type]
        if total <= 0:
            continue
        valid_percent = (valid_count / total) * 100
        print(f"{entity_type}: {valid_count}/{total} valid ({valid_percent:.1f}%)")


# Progress message: dataset analysis helpers are defined.
print("‚úÖ Dataset analiz funksiyalarƒ± hazƒ±rlandƒ±")

‚úÖ Dataset analiz funksiyalarƒ± hazƒ±rlandƒ±


In [None]:
# Paths of the three annotated datasets (spaCy-format JSON) to load.
dataset_paths = [
    "/content/car_plate_dataset_annotated.json",    # 1st dataset
    "/content/fin_annotated.json",                  # 2nd dataset
    "/content/vesiqe_annotated.json"                # 3rd dataset
]

print("üìÇ Datasetl…ôr y√ºkl…ônir...")  # "Loading datasets..."
print("Fayl yollarƒ±:")              # "File paths:"
for i, path in enumerate(dataset_paths, 1):
    print(f"  {i}. {path}")

# Load every dataset; files that cannot be loaded are left out of the result.
datasets = load_datasets_from_paths(dataset_paths)

if datasets:
    # Merge all loaded datasets into one list of sentences and one of labels.
    all_texts, all_labels = combine_datasets(datasets)

    # Print label distribution and entity examples.
    analyze_dataset(all_texts, all_labels)

    # Print how many labelled entities pass the format rules.
    validate_dataset_entities(all_texts, all_labels)

    # Sentence-level split: 80% train, 20% validation, fixed seed so the
    # split is reproducible.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        all_texts, all_labels, test_size=0.2, random_state=42
    )

    print(f"\nüìã Data Split:")
    print(f"Train: {len(train_texts)} n√ºmun…ô")
    print(f"Validation: {len(val_texts)} n√ºmun…ô")

    print("‚úÖ Datasetl…ôr uƒüurla hazƒ±rlandƒ±!")
else:
    # Nothing could be loaded: train_texts etc. stay undefined, which the
    # training cell checks for before it runs.
    print("‚ùå He√ß bir dataset y√ºkl…ônm…ôdi!")
    print("Fayl yollarƒ±nƒ± yoxlayƒ±n v…ô yenid…ôn c…ôhd edin.")

üìÇ Datasetl…ôr y√ºkl…ônir...
Fayl yollarƒ±:
  1. /content/car_plate_dataset_annotated.json
  2. /content/fin_annotated.json
  3. /content/vesiqe_annotated.json

üìÅ Dataset 1 y√ºkl…ônir: /content/car_plate_dataset_annotated.json
‚úÖ /content/car_plate_dataset_annotated.json uƒüurla y√ºkl…ôndi: 30000 n√ºmun…ô
‚úÖ Dataset 1 y√ºkl…ôndi: 30000 n√ºmun…ô

üìÅ Dataset 2 y√ºkl…ônir: /content/fin_annotated.json
‚úÖ /content/fin_annotated.json uƒüurla y√ºkl…ôndi: 30000 n√ºmun…ô
‚úÖ Dataset 2 y√ºkl…ôndi: 30000 n√ºmun…ô

üìÅ Dataset 3 y√ºkl…ônir: /content/vesiqe_annotated.json
‚úÖ /content/vesiqe_annotated.json uƒüurla y√ºkl…ôndi: 30000 n√ºmun…ô
‚úÖ Dataset 3 y√ºkl…ôndi: 30000 n√ºmun…ô
Dataset 1: 30000 n√ºmun…ô
Dataset 2: 30000 n√ºmun…ô
Dataset 3: 30000 n√ºmun…ô
üìä √úmumi: 90000 n√ºmun…ô

üìà Dataset Statistikasƒ±:
√úmumi c√ºml…ô sayƒ±: 90000
√úmumi token sayƒ±: 576351

üè∑Ô∏è Label paylanmasƒ±:
B-FIN: 30000 (5.2%)
B-ID: 30000 (5.2%)
B-PLATE: 30000 (5.2%)
O: 486351 (84.4%)

üìù Entity n√

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint MODEL_NAME.
# The NERDataset class that uses it is defined right below.

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"‚úÖ Tokenizer y√ºkl…ôndi: {MODEL_NAME}")  # "Tokenizer loaded"

class NERDataset(Dataset):
    """
    PyTorch ``Dataset`` that serves pre-split sentences with BIO labels.

    Each item is tokenized on access, padded/truncated to ``max_len``, and
    its word-level labels are expanded to sub-word level through the
    tokenizer's ``word_ids()``.
    """

    def __init__(self, texts: List[List[str]], labels: List[List[str]], tokenizer, max_len: int):
        """
        Args:
            texts: One list of word tokens per sentence.
            labels: One list of BIO label strings per sentence.
            tokenizer: Tokenizer called with ``is_split_into_words=True``;
                its output must provide ``word_ids()``.
            max_len: Length every encoded sequence is padded/truncated to.
        """
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Encode sentence ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` and a
            ``labels`` tensor (dtype long) of label ids.
        """
        words = self.texts[idx]
        word_labels = self.labels[idx]

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        def to_label_id(word_id):
            # Positions without a word (special tokens, padding) get -100,
            # which compute_metrics filters out.
            if word_id is None:
                return -100
            # Every sub-word piece inherits the label of the word it came
            # from (so a B- label is repeated on each piece of that word).
            if word_id < len(word_labels):
                return label2id[word_labels[word_id]]
            # A word without a label is treated as outside any entity.
            return label2id['O']

        label_ids = [to_label_id(word_id) for word_id in encoding.word_ids()]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }


print("‚úÖ NERDataset sinifi hazƒ±rlandƒ±")  # "NERDataset class is ready"

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

‚úÖ Tokenizer y√ºkl…ôndi: bert-base-multilingual-cased
‚úÖ NERDataset sinifi hazƒ±rlandƒ±


In [None]:
# Load the pretrained checkpoint with a token-classification head sized for
# LABELS, register the label <-> id mappings in its config, and move it to
# the selected device.

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    label2id=label2id,
    id2label=id2label
)
model.to(device)

print(f"‚úÖ Model y√ºkl…ôndi: {MODEL_NAME}")        # "Model loaded"
print(f"üìä Label sayƒ±: {len(LABELS)}")          # "Number of labels"

# Metric callback handed to the Trainer.
def compute_metrics(eval_pred):
    """
    Compute token-level accuracy, F1, precision and recall.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds
            per-label scores and is arg-maxed over axis 2, ``labels`` holds
            the gold label ids.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision/recall/F1 are weighted averages over all labels that
        occur (the 'O' label included).
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    # Walk both arrays once, dropping positions whose gold label is -100
    # (special tokens and padding).
    flat_true_labels = []
    flat_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            flat_predictions.append(id2label[predicted_id])
            flat_true_labels.append(id2label[gold_id])

    precision, recall, f1, _ = precision_recall_fscore_support(
        flat_true_labels, flat_predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy_score(flat_true_labels, flat_predictions),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training configuration for a short run capped at 300 optimizer steps.
training_args = TrainingArguments(
    output_dir='./results',
    max_steps=300,                    # hard cap on training steps
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,                  # 1/6 of the 300 steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=25,                 # log the training loss every 25 steps
    eval_strategy="steps",            # NOTE: older transformers releases name this evaluation_strategy
    eval_steps=50,                    # evaluate every 50 steps
    save_strategy="steps",
    save_steps=50,                    # matches eval_steps, as load_best_model_at_end needs
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # The string "none" disables all reporting integrations. Python None does
    # NOT do that on transformers versions whose default is "all": there it
    # falls back to every installed integration, so the run stops at an
    # interactive wandb login prompt.
    report_to="none",
    learning_rate=LEARNING_RATE,
)

# Echo the schedule from the config itself so the summary cannot drift from
# the values actually used.
print(f"Max steps: {training_args.max_steps}")
print(f"Warmup steps: {training_args.warmup_steps}")
print(f"Logging every: {training_args.logging_steps} steps")
print(f"Evaluation every: {training_args.eval_steps} steps")
print(f"Save every: {training_args.save_steps} steps")

print("‚úÖ Training konfiqurasiyasƒ± hazƒ±rlandƒ±")

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model y√ºkl…ôndi: bert-base-multilingual-cased
üìä Label sayƒ±: 7
Max steps: 300
Warmup steps: 50
Logging every: 25 steps
Evaluation every: 50 steps
Save every: 50 steps
‚úÖ Training konfiqurasiyasƒ± hazƒ±rlandƒ±


In [None]:
# Build the train/validation datasets and the Trainer, then train, evaluate
# and save the model. Runs only when the data-preparation cell produced
# train_texts.

# NOTE(review): 'train_texts' in locals() works here because this is
# module-level notebook code, where locals() is the global namespace.
if 'train_texts' in locals() and len(train_texts) > 0:
    # Wrap the token/label lists in NERDataset objects.
    train_dataset = NERDataset(
        texts=train_texts,
        labels=train_labels,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    val_dataset = NERDataset(
        texts=val_texts,
        labels=val_labels,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    # Collator for token classification batches. NERDataset already pads
    # every item to MAX_LEN, so the items arrive with equal length.
    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        padding=True,
        max_length=MAX_LEN,
        pad_to_multiple_of=None,
        return_tensors="pt"
    )

    # Assemble the Trainer from model, arguments, data and metric callback.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"‚úÖ Train dataset: {len(train_dataset)} n√ºmun…ô")          # train sample count
    print(f"‚úÖ Validation dataset: {len(val_dataset)} n√ºmun…ô")      # validation sample count
    print("‚úÖ Trainer hazƒ±rlandƒ±")                                  # "Trainer is ready"

    # Announce the run and its size before starting.
    print("\nüöÄ Training ba≈ülayƒ±r...")
    print(f"üìä Total training samples: {len(train_dataset)}")
    print(f"üìä Total validation samples: {len(val_dataset)}")
    print(f"üî• Device: {device}")

    try:
        trainer.train()
        print("‚úÖ Training tamamlandƒ±!")                             # "Training completed"

        # Final evaluation on the validation set.
        print("\nüìä Model qiym…ôtl…ôndirilir...")
        eval_results = trainer.evaluate()

        print("üéØ Evaluation n…ôtic…ôl…ôri:")                          # "Evaluation results"
        for key, value in eval_results.items():
            # Floats are shown with four decimals, everything else as-is.
            if isinstance(value, float):
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")

        # Save the model held by the trainer together with the tokenizer.
        trainer.save_model("./best_model")
        tokenizer.save_pretrained("./best_model")
        print("‚úÖ Model ./best_model qovluƒüunda saxlanƒ±ldƒ±")        # "Model saved in ./best_model"

    except Exception as e:
        # NOTE(review): any failure during training, evaluation or saving is
        # only printed; later cells still run with whatever state `model` is in.
        print(f"‚ùå Training zamanƒ± x…ôta: {e}")                      # "Error during training"

else:
    print("‚ùå Datasetl…ôr hazƒ±rlanmadƒ±, training edil…ô bilm…ôz!")     # "Datasets not ready, training cannot be performed"
    print("∆èvv…ôlki hiss…ôl…ôrd…ô datasetl…ôri d√ºzg√ºn y√ºkl…ôyin.")        # "Load the datasets correctly in the previous steps"

‚úÖ Train dataset: 72000 n√ºmun…ô
‚úÖ Validation dataset: 18000 n√ºmun…ô
‚úÖ Trainer hazƒ±rlandƒ±

üöÄ Training ba≈ülayƒ±r...
üìä Total training samples: 72000
üìä Total validation samples: 18000
üî• Device: cuda


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbahramzada[0m ([33mbahramzada-unec-business-school[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.1655,0.015308,0.998266,0.998264,0.99827,0.998266
100,0.0043,0.000867,0.999983,0.999983,0.999983,0.999983
150,0.0011,0.000642,0.999966,0.999966,0.999966,0.999966
200,0.0009,0.000385,1.0,1.0,1.0,1.0
250,0.0007,0.000341,1.0,1.0,1.0,1.0
300,0.0007,0.000326,1.0,1.0,1.0,1.0


‚úÖ Training tamamlandƒ±!

üìä Model qiym…ôtl…ôndirilir...


üéØ Evaluation n…ôtic…ôl…ôri:
eval_loss: 0.0004
eval_accuracy: 1.0000
eval_f1: 1.0000
eval_precision: 1.0000
eval_recall: 1.0000
eval_runtime: 125.3216
eval_samples_per_second: 143.6300
eval_steps_per_second: 8.9770
epoch: 0.0667
‚úÖ Model ./best_model qovluƒüunda saxlanƒ±ldƒ±


In [None]:
def predict_entities_with_model(text: str, model, tokenizer, device):
    """
    Predict one BIO label for every whitespace-separated token of ``text``.

    The tokens are encoded with ``is_split_into_words=True`` (truncated to
    ``MAX_LEN``), the model is run in eval mode without gradients, and each
    word receives the label predicted at its first sub-word piece.

    Args:
        text: Input sentence.
        model: Token-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the input tensors are moved to.

    Returns:
        ``(tokens, predicted_labels)``: two lists of equal length. Words that
        have no sub-word piece in the encoded sequence (for example because
        they were truncated away) are labelled 'O'. Empty or whitespace-only
        text yields ``([], [])``.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to classify; do not call the tokenizer on an empty word list.
        return [], []

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    )

    # word_ids() lives on the tokenizer's BatchEncoding, so it has to be read
    # BEFORE the encoding is replaced by a plain dict of device tensors below.
    # It is unavailable for non-fast tokenizers (ValueError) or when the
    # tokenizer returns a plain mapping (AttributeError).
    try:
        word_ids = encoding.word_ids(batch_index=0)
    except (AttributeError, ValueError):
        word_ids = None

    inputs = {k: v.to(device) for k, v in encoding.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Label ids of the single sequence in the batch, as plain ints.
    sequence_label_ids = predictions[0].tolist()

    def label_at(position):
        label_id = sequence_label_ids[position]
        return id2label[label_id] if label_id < len(id2label) else 'O'

    if word_ids is not None:
        # Position of the first sub-word piece of every word.
        first_piece = {}
        for position, word_id in enumerate(word_ids):
            if word_id is not None and word_id not in first_piece:
                first_piece[word_id] = position

        predicted_labels = [
            label_at(first_piece[index]) if index in first_piece else 'O'
            for index in range(len(tokens))
        ]
    else:
        # Fallback without word_ids: walk the sequence manually, advancing by
        # the number of sub-word pieces each word tokenizes to.
        predicted_labels = []
        position = 1  # start after the [CLS] token
        for token in tokens:
            if position < len(sequence_label_ids):
                predicted_labels.append(label_at(position))
                position += len(tokenizer.tokenize(token))
            else:
                predicted_labels.append('O')

    return tokens, predicted_labels

def enhanced_predict_with_validation(text: str, model, tokenizer, device):
    """
    Predict BIO labels and discard entities that fail the format rules.

    Labels come from ``predict_entities_with_model``. Each ``B-<TYPE>`` label
    plus its following ``I-<TYPE>`` labels forms a candidate entity; its
    tokens are joined without a separator and checked with
    ``validate_entity_format``. Candidates that pass keep their labels, the
    others are relabelled 'O'. All other labels are passed through unchanged.

    Args:
        text: Input sentence.
        model: Token-classification model.
        tokenizer: Tokenizer matching ``model``.
        device: Device used for inference.

    Returns:
        ``(tokens, validated_labels)``: two lists of equal length.
    """
    tokens, predicted_labels = predict_entities_with_model(text, model, tokenizer, device)

    token_count = len(tokens)
    label_count = len(predicted_labels)
    validated_labels = []

    cursor = 0
    while cursor < token_count:
        tag = predicted_labels[cursor] if cursor < label_count else 'O'

        if not tag.startswith('B-'):
            validated_labels.append(tag)
            cursor += 1
            continue

        # Extend the span over the I- labels of the same type.
        kind = tag[2:]
        continuation = f'I-{kind}'
        stop = cursor + 1
        while (stop < token_count and
               stop < label_count and
               predicted_labels[stop] == continuation):
            stop += 1

        candidate = ''.join(tokens[cursor:stop])

        if validate_entity_format(candidate, kind):
            # Valid entity: keep the predicted labels of the whole span.
            validated_labels.extend(predicted_labels[cursor:stop])
        else:
            # Invalid entity: the whole span becomes 'O'.
            validated_labels.extend(['O'] * (stop - cursor))

        cursor = stop

    # Keep the label list exactly as long as the token list.
    validated_labels.extend(['O'] * (token_count - len(validated_labels)))
    return tokens, validated_labels[:token_count]

def blur_with_model_and_validation(text: str, model, tokenizer, device):
    """
    Replace every validated entity in ``text`` with ``[BLURRED]``.

    Labels come from ``enhanced_predict_with_validation``. Each entity
    (a ``B-<TYPE>`` token plus its ``I-<TYPE>`` continuation) collapses into
    a single ``[BLURRED]`` placeholder; all other tokens are kept.

    Args:
        text: Input sentence.
        model: Token-classification model.
        tokenizer: Tokenizer matching ``model``.
        device: Device used for inference.

    Returns:
        ``(blurred_text, entities_found)``. ``blurred_text`` is the output
        tokens joined by single spaces. ``entities_found`` holds one dict per
        entity with keys ``text`` (tokens joined without a separator),
        ``type``, ``start`` and ``end`` (inclusive token indices) and
        ``tokens``.
    """
    tokens, validated_labels = enhanced_predict_with_validation(text, model, tokenizer, device)

    token_count = len(tokens)
    label_count = len(validated_labels)

    output_tokens = []
    entities_found = []

    cursor = 0
    while cursor < token_count:
        tag = validated_labels[cursor] if cursor < label_count else 'O'

        if not tag.startswith('B-'):
            output_tokens.append(tokens[cursor])
            cursor += 1
            continue

        # Extend the span over the I- labels of the same type.
        kind = tag[2:]
        continuation = f'I-{kind}'
        stop = cursor + 1
        while (stop < token_count and
               stop < label_count and
               validated_labels[stop] == continuation):
            stop += 1

        span_tokens = tokens[cursor:stop]
        entities_found.append({
            'text': ''.join(span_tokens),
            'type': kind,
            'start': cursor,
            'end': stop - 1,
            'tokens': span_tokens
        })

        # The whole entity is represented by one placeholder.
        output_tokens.append('[BLURRED]')
        cursor = stop

    return ' '.join(output_tokens), entities_found


print("‚úÖ T…ôkmill…ô≈üdirilmi≈ü inference funksiyalarƒ± hazƒ±rlandƒ±")  # "Enhanced inference functions ready"

‚úÖ T…ôkmill…ô≈üdirilmi≈ü inference funksiyalarƒ± hazƒ±rlandƒ±


In [None]:
def test_enhanced_model():
    """
    Run the blur pipeline on a fixed set of sample sentences and print results.

    For every sentence the original text, the blurred text and the entities
    found (each with its format-validation verdict) are printed. An error on
    one sentence is printed and does not stop the remaining ones.
    """
    test_sentences = [
        "M…ônim fin kodum AZEDF12 olan kartƒ±m var",          # valid FIN (7 characters)
        "FIN kod AB12345 m√∂vcuddur",                        # valid FIN (7 characters)
        "Bu 90-AB-123 n√∂mr…ôli avtomobil dostumundur",       # valid PLATE
        "≈û…ôxsiyy…ôt v…ôsiq…ô AA1234567 dir",                   # valid ID (AA + 7 digits)
        "S…ôn…ôd n√∂mr…ôsi AZE123456789 t…ôqdim edilm…ôlidir",    # valid ID (AZE + 9 digits)
        "S…ôhv format AA12345 v…ô ya AZE12345 var",           # invalid formats
        "Bu adi c√ºml…ô dir he√ß n…ô yoxdur",                   # no entities
        "Fƒ∞N MM4NS3L v…ô ma≈üƒ±n 77-KM-596 qeydiyyatƒ± var",    # valid entities
        # Long custom paragraph mixing several entities of every type.
        'Qeydiyyat prosesind…ô ≈ü…ôxsiyy…ôt v…ôsiq…ôsi n√∂mr…ôsi AZE123456789 v…ô Fƒ∞N kod MM4NS3L t…ôqdim edilm…ôlidir. Avtomobilin n√∂mr…ôsi 77-KM-596 v…ô 10-AB-123 il…ô baƒülƒ± m…ôlumatlar da sistem…ô daxil edilm…ôlidir. ∆èlav…ô olaraq, yeni qeydiyyatda t…ôl…ôb olunan ikinci ≈ü…ôxsiyy…ôt v…ôsiq…ôsi n√∂mr…ôsi AZE987654321 v…ô Fƒ∞N kodu QA9WT2K d…ô yoxlanƒ±lacaq. Qeyd: h…ôr hansƒ± bir kod s…ôhv daxil edil…ôrs…ô, sistem x…ôb…ôrdarlƒ±q ed…ôc…ôk.'
    ]

    separator = "-" * 70
    print("üß™ Model + Validation Test...\n")

    for number, sample in enumerate(test_sentences, start=1):
        print(f"üìù Test {number}:")
        print(f"Orijinal: {sample}")

        try:
            blurred_text, entities = blur_with_model_and_validation(sample, model, tokenizer, device)
            print(f"Blurred:  {blurred_text}")

            if not entities:
                print("‚ùå He√ß bir valid entity tapƒ±lmadƒ±")
            else:
                print("‚úÖ Tapƒ±lan valid entity-l…ôr:")
                for found in entities:
                    if validate_entity_format(found['text'], found['type']):
                        verdict = "‚úì Valid"
                    else:
                        verdict = "‚úó Invalid"
                    print(f"  {verdict} - '{found['text']}' ({found['type']})")

        except Exception as e:
            print(f"‚ùå X…ôta: {e}")

        print(separator)

def detailed_analysis_demo(text: str):
    """
    Print a token-by-token comparison of raw and validated model labels.

    For every token the raw model label, the label after format validation
    and a validity marker are printed. The marker is computed only for
    tokens that start an entity in the raw model output (for the whole
    entity); every other token shows "-". Any error is printed, not raised.

    Args:
        text: Sentence to analyse.
    """
    print(f"\nüîç Detailed Analiz: '{text}'")

    try:
        tokens, model_labels = predict_entities_with_model(text, model, tokenizer, device)
        _, validated_labels = enhanced_predict_with_validation(text, model, tokenizer, device)

        print("\nToken-by-token analiz:")
        print(f"{'Token':<15}{'Model':<10}{'Validated':<12}Valid?")
        print("-" * 50)

        token_count = len(tokens)
        raw_count = len(model_labels)
        checked_count = len(validated_labels)

        for index in range(token_count):
            word = tokens[index]
            raw_label = model_labels[index] if index < raw_count else 'O'
            checked_label = validated_labels[index] if index < checked_count else 'O'

            marker = "-"
            if raw_label.startswith('B-'):
                # Rebuild the full entity starting here and validate it.
                kind = raw_label[2:]
                continuation = f'I-{kind}'
                stop = index + 1
                while (stop < token_count and
                       stop < raw_count and
                       model_labels[stop] == continuation):
                    stop += 1
                candidate = ''.join(tokens[index:stop])
                marker = "‚úì" if validate_entity_format(candidate, kind) else "‚úó"

            print(f"{word:<15} {raw_label:<10} {checked_label:<12} {marker}")

    except Exception as e:
        print(f"‚ùå Analiz x…ôtasƒ±: {e}")

def interactive_blur_system() -> None:
    """
    Run an interactive read-blur-print loop on stdin/stdout.

    Prints a usage banner, then repeatedly reads a line of text, passes it
    to ``blur_with_model_and_validation`` and prints the blurred text, the
    entities that were found and the token-level table produced by
    ``detailed_analysis_demo``.

    The loop ends when the user types one of the exit commands (matched
    case-insensitively, ignoring surrounding whitespace), or when stdin is
    closed or the user interrupts the prompt. Blank input is rejected with
    a message. Errors raised while processing a line are reported and the
    loop continues.

    Returns:
        None.
    """
    exit_commands = ['exit', '√ßƒ±x', 'quit', 'q']

    print("ü§ñ Az…ôrbaycan NER Blur Sistemi")
    print("Model + Validation qaydalarƒ± istifad…ô edilir")
    print("Format qaydalarƒ±:")
    print("  ‚Ä¢ FIN: 7 simvol (h…ôrf+r…ôq…ôm)")
    print("  ‚Ä¢ ID/AA: 9 simvol (AA + 7 r…ôq…ôm)")
    print("  ‚Ä¢ ID/AZE: 12 simvol (AZE + 9 r…ôq…ôm)")
    print("  ‚Ä¢ Avtomobil: XX-YY-ZZZ")
    print("\n√áƒ±xmaq √º√ß√ºn 'exit' yazƒ±n\n")

    while True:
        try:
            user_input = input("üìù M…ôtn daxil edin: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt is treated as a normal exit
            # rather than ending the session with a traceback.
            print()
            print("üëã G√∂r√º≈ü…ôn…ôd…ôk!")
            break

        # Compare the trimmed text so that e.g. "exit " still quits.
        command = user_input.strip()
        if command.lower() in exit_commands:
            print("üëã G√∂r√º≈ü…ôn…ôd…ôk!")
            break

        if not command:
            print("‚ùå Bo≈ü m…ôtn daxil etdiniz\n")
            continue

        try:
            # The text is passed on exactly as typed, untrimmed.
            blurred_text, entities = blur_with_model_and_validation(user_input, model, tokenizer, device)
            print(f"üîí Blurred: {blurred_text}")

            if entities:
                print("üìç Tapƒ±lan entity-l…ôr:")
                for entity in entities:
                    print(f"  ‚Ä¢ '{entity['text']}' ‚Üí {entity['type']}")
            else:
                print("üìç He√ß bir entity tapƒ±lmadƒ±")

            # Follow up with the token-level breakdown of the same input.
            detailed_analysis_demo(user_input)
            print()

        except Exception as e:
            # Keep the session alive; one bad input must not end the loop.
            print(f"‚ùå X…ôta: {e}\n")

# Run the demo only when a `model` object already exists in this namespace
# (i.e. the training / loading cells have been executed).
if 'model' in locals():
    print("üéØ Model test edilir...")
    test_enhanced_model()

    print("\n" + "="*70)
    print("ƒ∞nteraktiv sistem…ô ke√ßm…ôk ist…ôyirsinizmi? (y/n)")
    # The question above offers a choice, so actually read the answer and
    # start the interactive loop only on an explicit "y"/"yes".
    try:
        answer = input().strip().lower()
    except EOFError:
        # No stdin available (non-interactive run): treat as "no".
        answer = ''
    if answer in ('y', 'yes'):
        interactive_blur_system()
else:
    print("‚ùå Model …ôvv…ôlc…ô √∂yr…ôdilm…ôlidir!")  # EN: Model must be trained first!

üéØ Model test edilir...
üß™ Model + Validation Test...

üìù Test 1:
Orijinal: M…ônim fin kodum AZEDF12 olan kartƒ±m var
Blurred:  M…ônim fin kodum [BLURRED] olan kartƒ±m var
‚úÖ Tapƒ±lan valid entity-l…ôr:
  ‚úì Valid - 'AZEDF12' (FIN)
----------------------------------------------------------------------
üìù Test 2:
Orijinal: FIN kod AB12345 m√∂vcuddur
Blurred:  FIN kod [BLURRED] m√∂vcuddur
‚úÖ Tapƒ±lan valid entity-l…ôr:
  ‚úì Valid - 'AB12345' (FIN)
----------------------------------------------------------------------
üìù Test 3:
Orijinal: Bu 90-AB-123 n√∂mr…ôli avtomobil dostumundur
Blurred:  Bu [BLURRED] n√∂mr…ôli avtomobil dostumundur
‚úÖ Tapƒ±lan valid entity-l…ôr:
  ‚úì Valid - '90-AB-123' (PLATE)
----------------------------------------------------------------------
üìù Test 4:
Orijinal: ≈û…ôxsiyy…ôt v…ôsiq…ô AA1234567 dir
Blurred:  ≈û…ôxsiyy…ôt v…ôsiq…ô [BLURRED] dir
‚úÖ Tapƒ±lan valid entity-l…ôr:
  ‚úì Valid - 'AA1234567' (ID)
---------------------------------------

In [None]:
# IPython shell escape: recursively pack the /content/best_model directory
# into /content/best_model.zip (presumably to download the saved model - confirm).
!zip -r /content/best_model.zip /content/best_model

  adding: content/best_model/ (stored 0%)
  adding: content/best_model/model.safetensors (deflated 7%)
  adding: content/best_model/config.json (deflated 55%)
  adding: content/best_model/training_args.bin (deflated 53%)
  adding: content/best_model/vocab.txt (deflated 45%)
  adding: content/best_model/special_tokens_map.json (deflated 42%)
  adding: content/best_model/tokenizer.json (deflated 67%)
  adding: content/best_model/tokenizer_config.json (deflated 75%)
