In [None]:
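#!pip install transformers torch  # shell install; may target a different Python than the running kernel
#%pip install transformers torch  # preferred on any OS: installs into the running kernel's environment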


Note: you may need to restart the kernel to use updated packages.


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "asafaya/bert-base-arabic"

# Instantiate the tokenizer first, then the encoder, from the same checkpoint
# so that the vocabulary and the weights are guaranteed to match.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModel)
)

def get_phrase_embedding(text, max_length=512):
    """
    Extracts a phrase embedding from Arabic text using BERT.

    The embedding is the attention-mask-weighted mean of the final-layer
    token vectors, so padding positions never contribute to the result.
    For a single string (no padding) this equals a plain mean over all tokens.

    Parameters:
      text (str or list of str): The Arabic text (or batch of texts) to encode.
      max_length (int): Maximum token length to use (truncation is applied if needed).

    Returns:
      numpy.ndarray: One embedding row per input text. A single string gives
      one row, e.g. shape (1, 768) with the checkpoint loaded in this notebook.
    """
    # Tokenize with truncation; padding only has an effect for batched input,
    # where shorter texts are padded up to the longest one in the batch.
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Keep inputs on the same device as the model (a no-op when both are on CPU).
    tokens = tokens.to(model.device)

    # Disable gradient calculation for inference
    with torch.no_grad():
        output = model(**tokens)

    hidden = output.last_hidden_state

    # Masked mean pooling: zero out padded positions, then divide by the number
    # of real tokens per sequence instead of by the padded sequence length.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embedding = summed / token_counts

    # .cpu() makes the NumPy conversion valid when the model runs on an accelerator.
    return embedding.cpu().numpy()

# Example usage: embed one short Arabic sentence and inspect the result.
if __name__ == "__main__":
    text = "الذكاء الاصطناعي هو المستقبل"
    embedding = get_phrase_embedding(text)
    # A single input string yields a single row; expected shape: (1, 768)
    print("Embedding shape:", embedding.shape)
    # Prints the raw embedding values (the full array, not just its shape)
    print("Embedding      :", embedding)


Embedding shape: (1, 768)
Embedding      : [[ 2.39913344e-01 -6.96258366e-01 -4.52943772e-01 -2.63912946e-01
  -2.27475241e-01  5.07016890e-02 -5.71887910e-01  7.99060985e-02
  -4.23441261e-01  6.09665334e-01  7.47164711e-02 -1.02992141e+00
   8.87437344e-01 -3.38536739e-01 -3.79198343e-01  1.20933962e+00
   7.96513796e-01 -2.72334278e-01 -2.33510256e-01  5.47210395e-01
  -1.54994354e-01  1.97327793e-01 -6.00636184e-01 -3.97663087e-01
   1.71893910e-01 -8.14190865e-01 -1.81105688e-01 -8.30575049e-01
  -7.82404244e-01  3.92520428e-01 -1.00008953e+00 -5.39949775e-01
  -5.60316928e-02  5.41875958e-02 -4.46842164e-01  1.76629107e-02
   1.65091619e-01  7.82049894e-01  1.58529207e-01  1.34620190e-01
   3.77940506e-01 -1.62520573e-01  6.27383709e-01 -5.07663310e-01
  -7.59734929e-01  8.34939241e-01  2.84937266e-02  8.36442947e-01
   9.91766155e-01 -1.03866793e-01 -6.12487972e-01 -4.83972818e-01
  -2.27286652e-01 -3.00865173e-02  5.76592028e-01  8.99099186e-02
   1.26165748e-01  1.55742764e-01