# Data preparation

**The dataset full PHI dataset can be obtained from [I.PHI dataset](https://github.com/sommerschield/iphi).**

In [1]:
import json
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import re
import random
from tqdm import tqdm
import os

## Rebuild the original dataset

In [2]:
dialectal_info = json.load(open('data/iphi-archaic-dialect.json', 'r'))
iphi = json.load(open('data/iphi.json', 'r'))

In [17]:
from tqdm import tqdm

In [20]:
dataset_raw = []
for k, v in tqdm(dialectal_info.items()):
    current_inscr = None
    for inscr in iphi:
        if int(k) == inscr['id']:
            current_inscr = inscr
            break
    dataset_raw.append({**current_inscr, 'dialects': v})

100%|██████████| 3233/3233 [00:57<00:00, 55.82it/s]


## Dataset

Loading and splitting the dataset containing the dialectal information.

Due to Ithaca's requirments, the inscription $x$ must be:

 $50 <= |x| <= 750$

In [11]:
#dataset_raw = json.load(open('data/iphi-archaic_plusDialect_26oct23_fix.json'))

dataset_filtered = []
for x in dataset_raw:
    text = x['text']
    text = text.replace('[', '').replace(']', '')
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    if 'dialect' in x.keys():
        x['dialect'] = x['dialect'].split('.')[2]
    elif 'dialect_main_id' in x.keys():
        x['dialect'] = x['dialect_main_id'].split('.')[2]
        del x['dialect_main_id']
    else:
        continue

    text = text.replace(' .', '.')
        
    if 50 <= len(text) <= 750: # Ithaca requirement
        x['text'] = text
        dataset_filtered.append(x)
    
print("Raw dataset: ", len(dataset_raw))
print("Filtered:    ", len(dataset_raw) - len(dataset_filtered))
print("Remaining:    ", len(dataset_filtered))

Raw dataset:  3281
Filtered:     2351
Remaining:     930


Split in Train and Test set following Ithaca splitting rule:

```Last digit 3 -> test, 4 -> valid, the rest are the training set``` (see [dataloader.py](https://github.com/google-deepmind/ithaca/blob/ced13193aaa52e49a2388c9ace0244e9a24e6d42/train/dataloader.py#L307))

In [12]:
archaic_train_data = [x for x in dataset_filtered if x['id'] % 10 != 3 and x['id'] % 10 != 4]
archaic_validation_data = [x for x in dataset_filtered if x['id'] % 10 == 4]
archaic_test_data = [x for x in dataset_filtered if x['id'] % 10 == 3]

print(f"train: {len(archaic_train_data)}\nvalidation:   {len(archaic_validation_data)}\ntest:   {len(archaic_test_data)}")

train: 746
validation:   89
test:   95


In [13]:
os.makedirs("data/archaic", exist_ok=True)

with open('data/archaic/train.json', 'w') as f:
        json.dump(archaic_train_data, f)

with open('data/archaic/validation.json', 'w') as f:
        json.dump(archaic_validation_data, f)

with open('data/archaic/test.json', 'w') as f:
        json.dump(archaic_test_data, f)

## Testset masking

Using Ithaca masking code for generating mask between 1 and 10 characters.

### Utils

[text.py](https://github.com/google-deepmind/ithaca/blob/main/ithaca/util/text.py)

In [14]:
# Copyright 2021 the Ithaca Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

def text_to_idx(t, alphabet):
  """Converts a string to character indices."""
  return np.array([alphabet.char2idx[c] for c in t], dtype=np.int32)

def text_to_word_idx(t, alphabet):
  """Converts a string to word indices."""
  out = np.full(len(t), alphabet.word2idx[alphabet.unk], dtype=np.int32)
  for m in re.finditer(r'\w+', t):
    if m.group() in alphabet.word2idx:
      out[m.start():m.end()] = alphabet.word2idx[m.group()]
  return out

def random_mask_span(t, geometric_p=0.2, limit_chars=None):
  """Masks a span of sequential words."""

  # Obtain span indexes (indlusive)
  span_idx = [(ele.start(), ele.end()) for ele in re.finditer(r'[\w\s]+', t)]
  if not span_idx:
    return []

  # Select a span to mask
  span_start, span_end = random.choice(span_idx)

  # Sample a random span length using a geomteric distribution
  if geometric_p and limit_chars:
    span_len = np.clip(
        np.random.geometric(geometric_p),
        1, min(limit_chars, span_end - span_start))
  elif geometric_p:
    span_len = np.clip(
        np.random.geometric(geometric_p),
        1, span_end - span_start)
  elif limit_chars:
    span_len = min(limit_chars, span_end - span_start)
  else:
    raise ValueError('geometric_p or limit_chars should be set.')

  # Pick a random start index
  span_start = np.random.randint(span_start, span_end - span_len + 1)
  assert span_start + span_len <= span_end

  # Clip to limit chars
  if limit_chars is not None and span_len >= limit_chars:
    span_len = limit_chars

  # Create mask indices
  mask_idx = list(range(span_start, span_start + span_len))

  return mask_idx

[alphabet.py](https://github.com/google-deepmind/ithaca/blob/main/ithaca/util/alphabet.py)

In [15]:
# Copyright 2021 the Ithaca Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Alphabet classes."""

class Alphabet:
  """Generic alphabet class."""

  def __init__(self,
               alphabet,
               numerals='0',
               punctuation='.',
               space=' ',
               missing='-',
               pad='#',
               unk='^',
               sos='<',
               sog='[',
               eog=']',
               wordlist_file=None,
               wordlist_size=100000):
    self.alphabet = list(alphabet)  # alph
    self.numerals = list(numerals)  # num
    self.punctuation = list(punctuation)  # punt
    self.space = space  # spacing
    self.missing = missing  # missing char
    self.pad = pad  # padding (spaces to right of string)
    self.unk = unk  # unknown char
    self.sos = sos  # start of sentence
    self.sog = sog  # start of guess
    self.eog = eog  # end of guess

    # Define wordlist mapping
    idx2word = [self.pad, self.sos, self.unk]
    if wordlist_file:
      idx2word += [
          w_c.split(';')[0]
          for w_c in wordlist_file.read().strip().split('\n')[:wordlist_size]
      ]
    self.idx2word = np.array(idx2word)
    self.word2idx = {self.idx2word[i]: i for i in range(len(self.idx2word))}

    # Define vocab mapping
    self.idx2char = np.array(
        [self.pad, self.sos, self.unk, self.space, self.missing] +
        self.alphabet + self.numerals + self.punctuation)
    self.char2idx = {self.idx2char[i]: i for i in range(len(self.idx2char))}

    # Define special character indices
    self.pad_idx = self.char2idx[pad]
    self.sos_idx = self.char2idx[sos]
    self.unk_idx = self.char2idx[unk]
    self.alphabet_start_idx = self.char2idx[self.alphabet[0]]
    self.alphabet_end_idx = self.char2idx[self.numerals[-1]]

  def filter(self, t):
    return t

  def size_char(self):
    return len(self.idx2char)

  def size_word(self):
    return len(self.idx2word)


class GreekAlphabet(Alphabet):
  """Greek alphabet class."""

  def __init__(self, wordlist_file=None, wordlist_size=100000):
    greek_alphabet = 'αβγδεζηθικλμνξοπρςστυφχψωϙϛ'

    super().__init__(
        alphabet=greek_alphabet,
        wordlist_file=wordlist_file,
        wordlist_size=wordlist_size)
    self.tonos_to_oxia = {
        # tonos  : #oxia
        u'\u0386': u'\u1FBB',  # capital letter alpha
        u'\u0388': u'\u1FC9',  # capital letter epsilon
        u'\u0389': u'\u1FCB',  # capital letter eta
        u'\u038C': u'\u1FF9',  # capital letter omicron
        u'\u038A': u'\u1FDB',  # capital letter iota
        u'\u038E': u'\u1FF9',  # capital letter upsilon
        u'\u038F': u'\u1FFB',  # capital letter omega
        u'\u03AC': u'\u1F71',  # small letter alpha
        u'\u03AD': u'\u1F73',  # small letter epsilon
        u'\u03AE': u'\u1F75',  # small letter eta
        u'\u0390': u'\u1FD3',  # small letter iota with dialytika and tonos/oxia
        u'\u03AF': u'\u1F77',  # small letter iota
        u'\u03CC': u'\u1F79',  # small letter omicron
        u'\u03B0': u'\u1FE3',
        # small letter upsilon with dialytika and tonos/oxia
        u'\u03CD': u'\u1F7B',  # small letter upsilon
        u'\u03CE': u'\u1F7D'  # small letter omega
    }
    self.oxia_to_tonos = {v: k for k, v in self.tonos_to_oxia.items()}

  def filter(self, t):  # override previous filter function
    # lowercase
    t = t.lower()

    # replace dot below
    t = t.replace(u'\u0323', '')

    # replace perispomeni
    t = t.replace(u'\u0342', '')
    t = t.replace(u'\u02C9', '')

    # replace ending sigma
    t = re.sub(r'([\w\[\]])σ(?![\[\]])(\b)', r'\1ς\2', t)

    # replace oxia with tonos
    for oxia, tonos in self.oxia_to_tonos.items():
      t = t.replace(oxia, tonos)

    # replace h
    h_patterns = {
        # input: #target
        'ε': 'ἑ',
        'ὲ': 'ἓ',
        'έ': 'ἕ',
        'α': 'ἁ',
        'ὰ': 'ἃ',
        'ά': 'ἅ',
        'ᾶ': 'ἇ',
        'ι': 'ἱ',
        'ὶ': 'ἳ',
        'ί': 'ἵ',
        'ῖ': 'ἷ',
        'ο': 'ὁ',
        'ό': 'ὅ',
        'ὸ': 'ὃ',
        'υ': 'ὑ',
        'ὺ': 'ὓ',
        'ύ': 'ὕ',
        'ῦ': 'ὗ',
        'ὴ': 'ἣ',
        'η': 'ἡ',
        'ή': 'ἥ',
        'ῆ': 'ἧ',
        'ὼ': 'ὣ',
        'ώ': 'ὥ',
        'ω': 'ὡ',
        'ῶ': 'ὧ'
    }

    # iterate by keys
    for h_in, h_tar in h_patterns.items():
      # look up and replace h[ and h]
      t = re.sub(r'ℎ(\[?){}'.format(h_in), r'\1{}'.format(h_tar), t)
      t = re.sub(r'ℎ(\]?){}'.format(h_in), r'{}\1'.format(h_tar), t)

    # any h left is an ἡ
    t = re.sub(r'(\[?)ℎ(\]?)', r'\1ἡ\2', t)

    return t

In [16]:
# Computer start for prepending start of sentence character
def mask_text(text, alphabet):
  prepend_sos = 1
  char_mask_rate_min = 0.
  char_mask_rate_max = 0.5
  span_mask_geometric_p=0.1
  span_mask_eval_len=10
  span_mask_ratio=0.15
  mode = 'valid'

  start_sample_idx = int(prepend_sos)

  # Mask text
  text_mask = np.zeros(len(text), dtype=bool)
  if mode in ['train', 'valid']:
    text_list = list(text)

    # Non missing idx (avoid removing start of sentence character)
    non_missing_idx = []
    for i in range(start_sample_idx, len(text_list)):
      if text_list[i] not in [alphabet.missing] + alphabet.punctuation:
        non_missing_idx.append(i)

    # Skip sample if there are no usable characters
    if not non_missing_idx:
      print("Something is wrong")

    char_mask_idx = []
    if char_mask_rate_max > 0.:
      # Compute rate
      char_mask_rate = np.random.uniform(char_mask_rate_min,
                                          char_mask_rate_max)

      # Fix masking in valid mode for comparing experiments
      span_mask_geometric_p = span_mask_geometric_p
      mask_num_total = int(char_mask_rate * len(non_missing_idx))
      mask_num_span = int(mask_num_total * span_mask_ratio)
      
      if mode == 'valid' and span_mask_eval_len > 0:
        span_mask_geometric_p = None
        mask_num_total = min(span_mask_eval_len, len(non_missing_idx))
        mask_num_span = mask_num_total
      mask_num_char = mask_num_total - mask_num_span

      # Mask random indices
      if mask_num_char > 0:
        char_mask_idx = np.random.choice(
            non_missing_idx, mask_num_char, replace=False).tolist()

      # Mask random spans
      if mask_num_span > 0:
        count_span = 0
        span_mask_idx = []
        while (len(span_mask_idx) < mask_num_span and count_span < 1):
          span_mask_idx.extend(
              random_mask_span(
                  text,
                  geometric_p=span_mask_geometric_p,
                  limit_chars=random.randint(1,10)))
          count_span += 1
        char_mask_idx.extend(span_mask_idx)

    # Mask text
    for idx in set(char_mask_idx):
      text_mask[idx] = True
      text_list[idx] = '?'
    text = ''.join(text_list)

  return text

### Masking

In [60]:
# random.seed(42)

# test_data_masked = []
# alphabet = GreekAlphabet()

# for inscription in tqdm(archaic_test_data):
#     text = inscription['text']
#     inscription['masked'] = mask_text(text, alphabet)
#     test_data_masked.append(inscription)
            
# json.dump(test_data_masked, open('data/archaic/test_masked.json','w'))

100%|██████████| 95/95 [00:00<00:00, 13256.33it/s]


## Preparing the (archaic) knowledge base

In [17]:
with open('data/archaic/train.txt', 'w') as f:
    for inscription in archaic_train_data:
        f.write(f"{inscription['text']}\n")

with open('data/archaic/validation.txt', 'w') as f:
    for inscription in archaic_validation_data:
        f.write(f"{inscription['text']}\n")

with open('data/archaic/test.txt', 'w') as f:
    for inscription in archaic_test_data:
        f.write(f"{inscription['text']}\n")

## Full i.PHI dataset

In [18]:
dataset_tmp = {int(d['id']): d for d in json.load(open('data/iphi.json'))}
assert len(dataset_tmp) == 178551

In [19]:
rev_dataset = {}

black_list=set([
                      # 2334, 10, 293931, 14, 293752, 15, 293753, 16, 11,
                      # 294468, 229647, 12, 291324, 291317, 17, 232697, 293754,
                      # 1682, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 291118,
                      # 291320, 291319, 292366, 34, 291960, 35, 32, 346490, 27,
                      # 292187, 291318, 19, 18, 37, 291321, 292189, 293756, 42,
                      # 46, 232710, 39, 40, 41, 291322, 293757, 293327, 28,
                      # 292194, 293326, 21, 293755, 291319, 291117, 38, 291959,
                      # 31, 232705
                  ])

if black_list:
    print(black_list)

for key in sorted(dataset_tmp.keys()):
    value = dataset_tmp[key]
    rev_dataset.setdefault(value['text'], set()).add(key)
    if len(rev_dataset[value['text']]) > 1:
        black_list.add(int(value['id']))
del rev_dataset
print(len(black_list))

#assert len(black_list) == 9441

# Create deduplicated dataset
dataset = []
for d in dataset_tmp.values():
    if int(d['id']) not in black_list:
        dataset.append(d)
del dataset_tmp
del black_list

9441


In [20]:
def load_region_maps(region_file):
  """Extracts creates a map from PHI region id to a continuous region id."""
  region_ids = []  # Used mainly for eval
  region_ids_inv = {}  # Used in data loader
  region_names_inv = {}  # Used in eval
  for l in region_file.read().strip().split('\n'):
    tok_name_id, _ = l.strip().split(';')  # second field is frequency, unused
    region_name, region_id = tok_name_id.split('_')
    region_name = region_name.strip()
    region_id = int(region_id)
    # Ignore unknown regions:
    if ((region_name == 'Unknown Provenances' and region_id == 884) or
        (region_name == 'unspecified subregion' and region_id == 885) or
        (region_name == 'unspecified subregion' and region_id == 1439)):
      continue
    region_ids.append(region_id)
    region_ids_inv[region_id] = len(region_ids_inv)
    region_names_inv[len(region_names_inv)] = region_name

  return {
      'ids': region_ids,
      'ids_inv': region_ids_inv,
      'names_inv': region_names_inv
  }

region_map = dict()
region_map['main'] = load_region_maps(open('data/iphi-region-main.txt', 'r'))
region_map['sub'] = load_region_maps(open('data/iphi-region-sub.txt', 'r'))
alphabet = GreekAlphabet(wordlist_file=open('data/iphi-wordlist.txt', 'r'), wordlist_size=35884)

In [21]:
final_data = []
char_use_guess=True
context_char_min=50

for sample in dataset:

  new_sample = sample.copy()

  # Skip if region does not exist in map
  if (int(sample['region_main_id']) not in region_map['main']['ids_inv'] or
    int(sample['region_sub_id']) not in region_map['sub']['ids_inv']):
    continue

  # Replace guess signs with missing chars
  if not char_use_guess:
    sample['text'] = re.sub(r'\[(.*?)\]', lambda m: '-' * len(m.group(1)),
                            sample['text'])
  sample['text'] = sample['text'].replace(alphabet.sog,
                                          '').replace(alphabet.eog, '')
  
  sample['text'].replace('[', '').replace(']', '')
  sample['text'] = sample['text'].strip()
  sample['text'] = re.sub(r'\s+', ' ', sample['text'])

  # Filter by text length
  if len(sample['text'].replace(alphabet.missing,
                                '')) < context_char_min:
    continue
  
  sample['text'] = sample['text'].replace(' .', '.')
    
  if '  ' in sample['text']:
    print(sample['text'])
  final_data.append(sample)
print(len(final_data))

78591


In [22]:
iphi_train_data = [x for x in final_data if x['id'] % 10 != 3 and x['id'] % 10 != 4]
iphi_validation_data = [x for x in final_data if x['id'] % 10 == 4]
iphi_test_data = [x for x in final_data if x['id'] % 10 == 3]

print(f"train: {len(iphi_train_data)}\nvalidation:   {len(iphi_validation_data)}\ntest:   {len(iphi_test_data)}")

train: 63002
validation:   7780
test:   7809


In [23]:
os.makedirs("data/iphi", exist_ok=True)

with open('data/iphi/train.json', 'w') as f:
    json.dump(iphi_train_data, f)

with open('data/iphi/validation.json', 'w') as f:
        json.dump(iphi_validation_data, f)

with open('data/iphi/test.json', 'w') as f:
        json.dump(iphi_test_data, f)

In [24]:
with open('data/iphi/train.txt', 'w') as f:
    for inscription in iphi_train_data:
        f.write(f"{inscription['text']}\n")

with open('data/iphi/validation.txt', 'w') as f:
    for inscription in iphi_validation_data:
        f.write(f"{inscription['text']}\n")

with open('data/iphi/test.txt', 'w') as f:
    for inscription in iphi_test_data:
        f.write(f"{inscription['text']}\n")

# Same masking for agBERT and Ithaca

In [25]:
with open('data/archaic/test.txt') as f:
   test = f.readlines()


# test_json = json.load(open('data/archaic/test_masked.json'))
test_json = json.load(open('data/archaic/test.json'))

random.seed(0)

common_gaps = []

for inscription in tqdm(test_json):

    sentence = inscription['text']
    sentence_split = sentence.split()
    sentence_ithaca = sentence_split.copy()

    rand_int = random.randint(0, len(sentence_split)-1)
    ground_truth = sentence_split[rand_int]
    while '-' in ground_truth:
        rand_int = random.randint(0, len(sentence_split)-1)
        ground_truth = sentence_split[rand_int]
    if '.' in ground_truth:
        sentence_split[rand_int] = '[MASK].'
        sentence_ithaca[rand_int] = '?'*(len(ground_truth)-1) + '.'
        ground_truth = ground_truth.replace('.', '')
    else:
        sentence_split[rand_int] = '[MASK]'
        sentence_ithaca[rand_int] = '?'*len(ground_truth)

    masked = ' '.join(sentence_split)
    sentence_ithaca = ' '.join(sentence_ithaca)

    inscription['masked_ag'] = masked

    inscription['masked_ithaca'] = sentence_ithaca

    inscription['masked_gt'] = ground_truth
    
    common_gaps.append(inscription)

with open('data/archaic/test_common.json', 'w') as f:
    json.dump(common_gaps, f)

100%|██████████| 95/95 [00:00<00:00, 157743.02it/s]


In [26]:
archaic_validation_data = json.load(open('data/archaic/validation.json'))
random.seed(0)

common_gaps = []

for inscription in tqdm(archaic_validation_data):
    tolerance = 1000    
    sentence = inscription['text']
    sentence_split = sentence.split()
    sentence_ithaca = sentence_split.copy()

    rand_int = random.randint(0, len(sentence_split)-1)
    ground_truth = sentence_split[rand_int]
    while '-' in ground_truth:
        rand_int = random.randint(0, len(sentence_split)-1)
        ground_truth = sentence_split[rand_int]
        tolerance -= 1
        if tolerance == 0:
            break
    if tolerance == 0:
        continue
    if '.' in ground_truth:
        sentence_split[rand_int] = '[MASK].'
        sentence_ithaca[rand_int] = '?'*(len(ground_truth)-1) + '.'
        ground_truth = ground_truth.replace('.', '')
    else:
        sentence_split[rand_int] = '[MASK]'
        sentence_ithaca[rand_int] = '?'*len(ground_truth)

    masked = ' '.join(sentence_split)
    sentence_ithaca = ' '.join(sentence_ithaca)

    inscription['masked_ag'] = masked

    inscription['masked_ithaca'] = sentence_ithaca

    inscription['masked_gt'] = ground_truth
    
    common_gaps.append(inscription)

with open('data/archaic/validation_common.json', 'w') as f:
    json.dump(common_gaps, f)

100%|██████████| 89/89 [00:00<00:00, 70406.08it/s]


In [27]:
for x in common_gaps:
    if '-' in x['masked_gt']:
        print(x['text'])
        print(x['masked_ag'])
        print(x['masked_ithaca'])
        print(x['masked_gt'])
        print()

In [28]:
for aa in common_gaps:
    if '.' in aa['masked_gt']:
        print(aa['text'])
        print(aa['masked_ithaca'])
        print(aa['masked_ag'])
        print(aa['masked_gt'])
        assert False

In [29]:
for x in common_gaps:
    if x['id'] == 290213:
        print(x['text'])
        print(x['masked_ithaca'])
    if x['masked_ithaca'][-1] == '?':
        print(x['text'])
        print(x['masked'])
        print()


# RAG

In [3]:
with open('data/archaic/iphi_archaic_train.txt') as f:
   test = f.readlines()


test_json = json.load(open('data/archaic/iphi_archaic_train.json'))
random.seed(0)

rag_gaps = []

os.makedirs("data/rag", exist_ok=True)

for inscription in tqdm(test_json):

    sentence = inscription['text']
    sentence = sentence.strip()
    sentence_split = sentence.split()
    sentence_ithaca = sentence_split.copy()

    rand_int = random.randint(0, len(sentence_split)-1)
    ground_truth = sentence_split[rand_int]

    sentence_split[rand_int] = '[MASK]'
    
    if '.' in ground_truth:
        sentence_split[rand_int] = '[MASK].'
        sentence_ithaca[rand_int] = '?'*(len(ground_truth)-1) + '.'
        ground_truth = ground_truth.replace('.', '')
    else:
        sentence_split[rand_int] = '[MASK]'
        sentence_ithaca[rand_int] = '?'*len(ground_truth)

    masked = ' '.join(sentence_split)
    sentence_ithaca = ' '.join(sentence_ithaca)

    inscription['masked_ag'] = masked

    inscription['masked_ithaca'] = sentence_ithaca

    inscription['masked_gt'] = ground_truth
    
    rag_gaps.append(inscription)

with open('data/rag/iphi_train.json', 'w') as f:
    json.dump(rag_gaps, f)

100%|██████████| 808/808 [00:00<00:00, 212011.11it/s]


In [49]:
with open('data/iphi/test.txt') as f:
   test = f.readlines()


test_json = json.load(open('data/iphi/test.json'))
random.seed(0)

rag_gaps = []

os.makedirs("data/rag", exist_ok=True)

for inscription in tqdm(test_json):

    sentence = inscription['text']
    sentence = sentence.strip()
    sentence_split = sentence.split()
    sentence_ithaca = sentence_split.copy()

    rand_int = random.randint(0, len(sentence_split)-1)
    ground_truth = sentence_split[rand_int]

    sentence_split[rand_int] = '[MASK]'
    
    if '.' in ground_truth:
        sentence_split[rand_int] = '[MASK].'
        sentence_ithaca[rand_int] = '?'*(len(ground_truth)-1) + '.'
        ground_truth = ground_truth.replace('.', '')
    else:
        sentence_split[rand_int] = '[MASK]'
        sentence_ithaca[rand_int] = '?'*len(ground_truth)

    masked = ' '.join(sentence_split)
    sentence_ithaca = ' '.join(sentence_ithaca)

    inscription['masked_ag'] = masked

    inscription['masked_ithaca'] = sentence_ithaca

    inscription['masked_gt'] = ground_truth
    
    rag_gaps.append(inscription)

with open('data/rag/iphi_test_rag.json', 'w') as f:
    json.dump(rag_gaps, f)

100%|██████████| 7809/7809 [00:00<00:00, 126173.76it/s]


# Training Archaic

750-450 BCE

In [31]:
iphi_train = json.load(open('data/iphi/train.json'))
iphi_validation = json.load(open('data/iphi/validation.json'))
iphi_test = json.load(open('data/iphi/test.json'))

In [32]:
iphi_archaic_train = []
for inscription in iphi_train:
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    date_min = int(inscription['date_min'])
    date_max = int(inscription['date_max'])
    if not 50 <= len(inscription['text']) <= 750:
        continue
    if -750 <= date_min <= -450 and -750 <= date_max <= -450:
        iphi_archaic_train.append(inscription)

iphi_archaic_validation = []
for inscription in iphi_validation:
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    date_min = int(inscription['date_min'])
    date_max = int(inscription['date_max'])
    if not 50 <= len(inscription['text']) <= 750:
        continue
    if -750 <= date_min <= -450 and -750 <= date_max <= -450:
        iphi_archaic_validation.append(inscription)

iphi_archaic_test = []
for inscription in iphi_test:
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    if inscription['date_min'] == None or inscription['date_max'] == None:
        continue
    date_min = int(inscription['date_min'])
    date_max = int(inscription['date_max'])
    if not 50 <= len(inscription['text']) <= 750:
        continue
    if -750 <= date_min <= -450 and -750 <= date_max <= -450:
        iphi_archaic_test.append(inscription)
    

In [33]:
os.makedirs("data/archaic", exist_ok=True)

with open('data/archaic/iphi_archaic_train.json', 'w') as f:
    json.dump(iphi_archaic_train, f)

with open('data/archaic/iphi_archaic_validation.json', 'w') as f:
    json.dump(iphi_archaic_test, f)

with open('data/archaic/iphi_archaic_test.json', 'w') as f:
    json.dump(iphi_archaic_test, f)

In [34]:
with open('data/archaic/iphi_archaic_train.txt', 'w') as f:
    for inscription in iphi_archaic_train:
        f.write(f"{inscription['text']}\n")

with open('data/archaic/iphi_archaic_validation.txt', 'w') as f:
    for inscription in iphi_archaic_test:
        f.write(f"{inscription['text']}\n")

with open('data/archaic/iphi_archaic_test.txt', 'w') as f:
    for inscription in iphi_archaic_test:
        f.write(f"{inscription['text']}\n")