In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
# Select PyTorch's "high" float32 matmul precision setting before any model
# is run. NOTE(review): see the torch.set_float32_matmul_precision docs for
# the exact speed/accuracy trade-off this implies on the target hardware.
torch.set_float32_matmul_precision("high") 

In [8]:
# Fill-mask prompts; each one contains a single [MASK] placeholder.
sentences = [
    # "[MASK] is the first capital of the Ottoman Empire."
    "[MASK], Osmanlı İmparatorluğu'nun ilk başkentidir.",
    # "[MASK] is the capital of the Republic of Türkiye."
    "[MASK], Türkiye Cumhuriyeti'nin başkentidir.",
    # "The largest planet in the [MASK] System is Jupiter."
    "[MASK] Sistemi'ndeki en büyük gezegen Jüpiter'dir.",
    # English prompt, with the mask at the end of the sentence.
    "The capital of France is [MASK]."
]

In [6]:
# TabiBERT: checkpoint identifier handed to both from_pretrained() calls.
model_path = "boun-tabilab/TabiBERT"

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and masked-LM model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Move the weights to the chosen device. Kept as the cell's last expression
# so the notebook also displays the module structure.
model.to(device)

ModernBertForMaskedLM(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50176, 768, padding_idx=0)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      )
      (1-21): 21

In [9]:
# For every prompt, print the sentence and the token the model scores highest
# at the first [MASK] position.
for text in sentences:
    print(f"Sentence: {text}")
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: disable gradient tracking so the forward pass does not
    # build an autograd graph or keep activations alive.
    with torch.no_grad():
        outputs = model(**inputs)

    # Index of the first mask token in the (single) encoded sequence;
    # list.index raises ValueError if the prompt contains no mask.
    masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
    # Highest-scoring vocabulary id at the masked position.
    predicted_id = outputs.logits[0, masked_index].argmax(dim=-1)
    print("Predicted token:", tokenizer.decode(predicted_id))

Sentence: [MASK], Osmanlı İmparatorluğu'nun ilk başkentidir.
Predicted token: İstanbul
Sentence: [MASK], Türkiye Cumhuriyeti'nin başkentidir.
Predicted token: Ankara
Sentence: [MASK] Sistemi'ndeki en büyük gezegen Jüpiter'dir.
Predicted token: Güneş
Sentence: The capital of France is [MASK].
Predicted token:  Paris
