In [1]:
import numpy as np
import pandas as pd
import os
import sys
from pathlib import Path

In [2]:
work_dir = '/home/nlp/achimoa/workspace/hebrew_text_retrieval/'
os.chdir(work_dir)

src_dir = os.path.join(work_dir, 'src')
if sys.path[0] not in src_dir:
    sys.path.insert(0, src_dir)

In [15]:
split = 'train'  # Change to 'validation' if needed
df_queries = pd.read_csv(f'outputs/translation/rajpurkar_squad_v2/{split}/gemini-2.0-flash-lite/queries_translated.csv')
df_documents = pd.read_csv(f'outputs/translation/rajpurkar_squad_v2/{split}/gemini-2.0-flash-lite/documents_translated.csv')

In [4]:
print(list(df_queries.keys()))
print(df_queries.shape)
print(list(df_documents.keys()))
print(df_documents.shape)

['id', 'title', 'context', 'question', 'answers', 'context_hash', 'text', 'context_id', 'context_text', 'dataset_name', 'tokenizer', 'text_key', 'translation_key', 'context_key', 'translation_query_key', 'translation_document_key', 'system_prompt', 'user_prompt', 'prompt_prefix', 'prompt_file_name', 'translation', 'input_tokens', 'output_tokens', 'model_name', 'model_time', 'translation_time', 'timestamp', 'batch_idx', 'batch_size', 'batch_datetime', 'translation_datetime']
(130319, 31)
['id', 'title', 'context', 'question', 'answers', 'context_hash', 'text', 'segment_id', 'segment_text', 'dataset_name', 'tokenizer', 'text_key', 'translation_key', 'context_key', 'translation_query_key', 'translation_document_key', 'system_prompt', 'user_prompt', 'prompt_prefix', 'prompt_file_name', 'translation', 'input_tokens', 'output_tokens', 'model_name', 'model_time', 'translation_time', 'timestamp', 'batch_idx', 'batch_size', 'batch_datetime', 'translation_datetime']
(19029, 31)


In [5]:
# Get the set of GUIDs from df_documents
document_guids = set(df_documents['context_hash'])

# Check which df_queries['guid'] values exist in df_documents
mask = df_queries['context_hash'].isin(document_guids)

# Count the number of matching records
matching_count = mask.sum()

print(f"Number of matching GUIDs: {matching_count}")


Number of matching GUIDs: 130319


In [6]:
df_documents[['translation']].head()

Unnamed: 0,translation
0,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...
1,"בעקבות התפרקות דסטיניז צ'יילד ביוני 2005, היא ..."
2,"ביונסה, המתארת את עצמה כ""פמיניסטית מודרנית"", י..."
3,"ביונסה ג'יזל נואלס נולדה ביוסטון, טקסס, ל-Cele..."
4,ביונסה למדה בבית הספר היסודי סנט מרי בפדרסבורג...


In [7]:
df_documents[['id', 'context_hash', 'text', 'segment_id']]

Unnamed: 0,id,context_hash,text,segment_id
0,5566f9c0998385b8a8a2c94aa64aa980,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0
1,aad018fe3b6d1cb98a948e676d7c8036,aad018fe3b6d1cb98a948e676d7c8036,Following the disbandment of Destiny's Child i...,0
2,0b6ba39a81e6060caf33482bb8f79117,0b6ba39a81e6060caf33482bb8f79117,"A self-described ""modern-day feminist"", Beyonc...",0
3,9416bece77bd0d876cb5ac8df8bd4ad5,9416bece77bd0d876cb5ac8df8bd4ad5,"Beyoncé Giselle Knowles was born in Houston, T...",0
4,1a11be03640af35d4723f569f9d42dbf,1a11be03640af35d4723f569f9d42dbf,Beyoncé attended St. Mary's Elementary School ...,0
...,...,...,...,...
19024,79bd2dfb9624397f5730d992d7007549,79bd2dfb9624397f5730d992d7007549,Isaac Newton (1643–1727) inherited Descartes' ...,0
19025,2e90d7f5f88b129c1900db260468da11,2e90d7f5f88b129c1900db260468da11,"There is an entire literature concerning the ""...",0
19026,3775c0b780f8891a77976ebcfc4e3b7d,3775c0b780f8891a77976ebcfc4e3b7d,In the late 19th century with the discovery of...,0
19027,57dabe0131de7aed80e210cafc7c1380,57dabe0131de7aed80e210cafc7c1380,These quarks and leptons interact through four...,0


In [None]:
df_documents2 = df_documents.groupby(['id', 'context_hash', 'text'], as_index=False).agg({
    'translation': lambda texts: ' '.join([str(t) for t in texts])
})

# Step 2: Rename columns as needed
df_documents2 = df_documents2.rename(columns={
    'text': 'context_english',
    'translation': 'context_hebrew'
})

df_documents2['_source'] = 'rajpurkar_squad_v2'
print(df_documents2.columns)

# Step 3: Select required columns
df_documents2 = df_documents2[['id', 'context_hash', 'context_english', 'context_hebrew', '_source']]

# Step 4: Save the DataFrame to a JSON file
output_file = f'data/squad_v2/{split}/documents.jsonl'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_documents2.to_json(output_file, orient='records', lines=True, force_ascii=False)

Index(['id', 'context_hash', 'context_english', 'context_hebrew', '_source'], dtype='object')


In [10]:
df_documents2.head(100)

Unnamed: 0,id,context_hash,context_english,context_hebrew,_source
0,000038abe02149e4abc7537083844eae,000038abe02149e4abc7537083844eae,This period of renewed assertiveness came to a...,תקופה זו של תקיפות מחודשת הגיעה לסיומה הקטסטרו...,rajpurkar_squad_v2
1,00030ca16729eadf67fe4c79aeef7664,00030ca16729eadf67fe4c79aeef7664,The interceptor aircraft (or simply intercepto...,מטוס יירוט (או בפשטות מיירט) הוא סוג של מטוס ק...,rajpurkar_squad_v2
2,0003afaccf271bd7a18d896314c531ce,0003afaccf271bd7a18d896314c531ce,"On runways, green lights indicate the beginnin...","על מסלולי המראה, אורות ירוקים מציינים את תחילת...",rajpurkar_squad_v2
3,000a6d9d751f7e955046e4c09efb2fbb,000a6d9d751f7e955046e4c09efb2fbb,The light energy captured by chlorophyll a is ...,אנרגיית האור הלוכדת על ידי כלורופיל a היא בתחי...,rajpurkar_squad_v2
4,0011fa1930606bf608665430563fa017,0011fa1930606bf608665430563fa017,Chengdu Economic and Technological Development...,אזור הפיתוח הכלכלי והטכנולוגי של צ'נגדו (בסיני...,rajpurkar_squad_v2
...,...,...,...,...,...
95,015926d7c589bddf6af8362fc6e524c0,015926d7c589bddf6af8362fc6e524c0,Although there were a number of department sto...,למרות שהיו מספר חנויות כלבו באוסטרליה במשך רוב...,rajpurkar_squad_v2
96,0159f0deefaec5678ec0d3dbed680304,0159f0deefaec5678ec0d3dbed680304,"Guests ascending to the 67th, 69th, and 70th l...","אורחים העולים למפלסי התצפית ה-67, ה-69 וה-70 (...",rajpurkar_squad_v2
97,015a019a0db5344d017b18fea41dcb26,015a019a0db5344d017b18fea41dcb26,North Carolina provides a large range of recre...,צפון קרוליינה מספקת מגוון רחב של פעילויות פנאי...,rajpurkar_squad_v2
98,0160134abebe04f5b7c4d268a7ad87f2,0160134abebe04f5b7c4d268a7ad87f2,The change is evident in the reports of influe...,השינוי ניכר בדיווחיהם של מטיילים בריטים משפיעי...,rajpurkar_squad_v2


In [11]:
df_documents2.shape

(19029, 5)

In [12]:
df_queries.head()

Unnamed: 0,id,title,context,question,answers,context_hash,text,context_id,context_text,dataset_name,...,input_tokens,output_tokens,model_name,model_time,translation_time,timestamp,batch_idx,batch_size,batch_datetime,translation_datetime
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'...",5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce start becoming popular?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,246.0,36.0,gemini-2.0-flash-lite,2.587168,2.587183,2025-07-08 13:24:52.976154,0,1,2025-07-08 13:24:50.388942,2025-07-08 13:24:52.976160
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star...",5566f9c0998385b8a8a2c94aa64aa980,What areas did Beyonce compete in when she was...,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,251.0,46.0,gemini-2.0-flash-lite,2.492141,2.492155,2025-07-08 13:24:52.881498,1,1,2025-07-08 13:24:50.389315,2025-07-08 13:24:52.881503
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}",5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce leave Destiny's Child and bec...,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,253.0,59.0,gemini-2.0-flash-lite,2.61493,2.614943,2025-07-08 13:24:53.082593,2,1,2025-07-08 13:24:50.467624,2025-07-08 13:24:53.082598
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [...",5566f9c0998385b8a8a2c94aa64aa980,In what city and state did Beyonce grow up?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,251.0,40.0,gemini-2.0-flash-lite,2.492606,2.492623,2025-07-08 13:24:52.964039,3,1,2025-07-08 13:24:50.471388,2025-07-08 13:24:52.964047
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}",5566f9c0998385b8a8a2c94aa64aa980,In which decade did Beyonce become famous?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,247.0,36.0,gemini-2.0-flash-lite,2.455187,2.455204,2025-07-08 13:24:52.928191,4,1,2025-07-08 13:24:50.472951,2025-07-08 13:24:52.928199


In [13]:
df_queries.columns

Index(['id', 'title', 'context', 'question', 'answers', 'context_hash', 'text',
       'context_id', 'context_text', 'dataset_name', 'tokenizer',
       'text_key', 'translation_key', 'context_key', 'translation_query_key',
       'translation_document_key', 'system_prompt', 'user_prompt', 'prompt_prefix',
       'prompt_file_name', 'translation', 'input_tokens', 'output_tokens',
       'model_name', 'model_time', 'translation_time', 'timestamp',
       'batch_idx', 'batch_size', 'batch_datetime', 'translation_datetime'],
      dtype='object')

In [14]:
guid_to_context_hebrew = dict(zip(df_documents2['context_hash'], df_documents2['context_hebrew']))
guid_to_context_english = dict(zip(df_documents2['context_hash'], df_documents2['context_english']))

df_queries2 = df_queries.copy()
df_queries2 = df_queries2.rename(columns={
    'text': 'question_english', 
    'translation': 'question_hebrew'
})

df_queries2["context_english"] = df_queries2["context_id"].map(guid_to_context_english)
df_queries2["context_hebrew"] = df_queries2["context_id"].map(guid_to_context_hebrew)

df_queries2 = df_queries2[['id', 'context_id', 'question_english', 'question_hebrew', 'context_english', 'context_hebrew']]
df_queries2['_source'] = 'rajpurkar_squad_v2'
df_queries2.head()

Unnamed: 0,id,context_id,question_english,question_hebrew,context_english,context_hebrew,_source
0,56be85543aeaaa14008c9063,5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce start becoming popular?,מתי ביונסה התחילה להיות פופולרית?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...,rajpurkar_squad_v2
1,56be85543aeaaa14008c9065,5566f9c0998385b8a8a2c94aa64aa980,What areas did Beyonce compete in when she was...,באיזה תחומים ביונסה התחרתה כשהייתה צעירה?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...,rajpurkar_squad_v2
2,56be85543aeaaa14008c9066,5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce leave Destiny's Child and bec...,מתי ביונסה עזבה את דסטיניז צ'יילד והפכה לזמרת ...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...,rajpurkar_squad_v2
3,56bf6b0f3aeaaa14008c9601,5566f9c0998385b8a8a2c94aa64aa980,In what city and state did Beyonce grow up?,באיזו עיר ומדינה גדלה ביונסה?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...,rajpurkar_squad_v2
4,56bf6b0f3aeaaa14008c9602,5566f9c0998385b8a8a2c94aa64aa980,In which decade did Beyonce become famous?,באיזה עשור ביונסה התפרסמה?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,ביונסה ג'יזל נואלס-קרטר (/biːˈjɒnseɪ/ bee-YON-...,rajpurkar_squad_v2


In [16]:
# Save the DataFrame to a JSON file
output_file = f'data/squad_v2/{split}/queries.jsonl'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_queries2.to_json(output_file, orient='records', lines=True, force_ascii=False)

In [20]:
output_file

'data/squad_v2/train/queries.jsonl'

In [17]:
df_queries.head()

Unnamed: 0,id,title,context,question,answers,context_hash,text,context_id,context_text,dataset_name,...,input_tokens,output_tokens,model_name,model_time,translation_time,timestamp,batch_idx,batch_size,batch_datetime,translation_datetime
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'...",5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce start becoming popular?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,246.0,36.0,gemini-2.0-flash-lite,2.587168,2.587183,2025-07-08 13:24:52.976154,0,1,2025-07-08 13:24:50.388942,2025-07-08 13:24:52.976160
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star...",5566f9c0998385b8a8a2c94aa64aa980,What areas did Beyonce compete in when she was...,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,251.0,46.0,gemini-2.0-flash-lite,2.492141,2.492155,2025-07-08 13:24:52.881498,1,1,2025-07-08 13:24:50.389315,2025-07-08 13:24:52.881503
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}",5566f9c0998385b8a8a2c94aa64aa980,When did Beyonce leave Destiny's Child and bec...,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,253.0,59.0,gemini-2.0-flash-lite,2.61493,2.614943,2025-07-08 13:24:53.082593,2,1,2025-07-08 13:24:50.467624,2025-07-08 13:24:53.082598
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [...",5566f9c0998385b8a8a2c94aa64aa980,In what city and state did Beyonce grow up?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,251.0,40.0,gemini-2.0-flash-lite,2.492606,2.492623,2025-07-08 13:24:52.964039,3,1,2025-07-08 13:24:50.471388,2025-07-08 13:24:52.964047
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}",5566f9c0998385b8a8a2c94aa64aa980,In which decade did Beyonce become famous?,5566f9c0998385b8a8a2c94aa64aa980,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,rajpurkar/squad_v2,...,247.0,36.0,gemini-2.0-flash-lite,2.455187,2.455204,2025-07-08 13:24:52.928191,4,1,2025-07-08 13:24:50.472951,2025-07-08 13:24:52.928199


In [19]:
from tqdm import tqdm
from datasets import load_dataset

data_files = {
        "queries": {
            "test": f'data/retrieval/squad_v2/{split}/queries.jsonl'
        },
        "documents": {
            "test": f'data/retrieval/squad_v2/{split}/documents.jsonl'
        }
    }

query_text_field = "question_english"
query_context_field = "context_hebrew"
document_text_field = "context_hebrew"
document_source_field = "_source"
main_source = "documents.jsonl"

queries_dataset = load_dataset("json", data_files=data_files["queries"], split="test")
documents_dataset = load_dataset("json", data_files=data_files["documents"], split="test")

questions = [item[query_text_field] for item in tqdm(queries_dataset, desc="Loading queries")]
gold = [item[query_context_field] for item in tqdm(queries_dataset, desc="Loading gold documents")]  # list of lists

unique_contexts = set()
deduped_contexts = []
contexts0 = [item for item in tqdm(documents_dataset, desc="Loading documents")]
# Deduplicate contexts
contexts0.sort(key=lambda x: 0 if x[document_source_field] == main_source else 1)
for i, context in tqdm(enumerate(contexts0), desc="Deduplicating contexts"):
    if context["guid"] not in unique_contexts:
        unique_contexts.add(context["guid"])
        deduped_contexts.append(context)
contexts = [c[document_text_field] for c in deduped_contexts]
print(len(deduped_contexts), "unique contexts found.")

# Find how many gold contexts are in the documents
found_gold_contexts = [c for c in set(gold) if c in contexts]
print(f"Found {len(found_gold_contexts)} gold contexts in the documents ({len(set(gold))}).")

FileNotFoundError: Unable to find '/home/nlp/achimoa/workspace/hebrew_text_retrieval/data/retrieval/squad_v2/train/queries.jsonl'

In [67]:
data_files2 = {
        "queries": {
            "test": 'data/retrieval/squad_v2/validation/queries.jsonl'
        },
        "documents": {
            "test": 'data/retrieval/squad_v2/validation/documents2.jsonl'
        }
}    

queries_dataset2 = load_dataset("json", data_files=data_files2["queries"], split="test")
documents_dataset2 = load_dataset("json", data_files=data_files2["documents"], split="test")
len(set([item["context_hebrew"] for item in tqdm(documents_dataset2, desc="Loading documents") if item["text"] is not None]))

Loading documents: 100%|██████████| 276204/276204 [00:20<00:00, 13439.70it/s]


1205

In [73]:
documents_dataset2[125000]

{'id': None,
 'context_hash': None,
 'context_english': None,
 'context_hebrew': None,
 '_source': 'wikipedia_hebrew_AllOfNewHebrewWikipediaWithArticles-Oct29-2023.forgpt.jsonl',
 'guid': 'ab2b029a96889c5149c7943a8e8452e5b206fcbaa2c401c5b8535f2d7f9c0c7e',
 'text': 'טוקיו דום\nטוקיו דום\nטוקיו דום (ביפנית: <foreign>; בבורסה:9681) הוא אצטדיון בייסבול הממוקם ברובע בונקיו שבטוקיו. הוא מכיל כ-55,000 מושבים והוא אצטדיון הבייסבול המקורה הגדול בעולם.\nהאצטדיון נפתח ב-17 במרץ 1988. הוא נבנה באתר בו שכן ולודרום ואצטדיון "קורקואן". כמו קורקואן לפניו, האצטדיון מארח מדי שנה את Toei Superheroes, מופע גיבורי על עם מסורת ארוכת שנים.\nהטוקיו דום מכונה "הביצה הגדולה" או "הביצה הגדולה של טוקיו" הודות לצורת הכיפה של גג האצטדיון. הגג הוא קרום גמיש שצורתו נשמרת על ידי הפעלת לחץ אוויר מועט אך רציף מצדו הפנימי של האצטדיון.\nהטוקיו דום הוא ביתה של קבוצת הבייסבול יומיורו ג\'יינטס, אך הוא מארח גם אירועים אחרים: משחקי כדורסל, פוטבול, כדורגל, אירועי היאבקות מקצועית, אמנויות לחימה משולבות, קיקבוקסינג, ומרוצי Monste

In [None]:
len(questions), len(gold), len(contexts)

(1156, 1156, 0)

In [37]:
queries_dataset[0]

{'id': '56ddde6b9a695914005b9629',
 'context_id': '4792c0101c483fd5cb5191f0d39d4426',
 'question_english': 'When were the Normans in Normandy?',
 'question_hebrew': 'מתי הנורמנים היו בנורמנדי?',
 'context_english': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'context_hebrew': 'הנורמנים (נורמנית: 

In [41]:
unique_contexts

set()