# CombineSampleTexts

This notebook builds a file called `ces.training_text` containing the concatenated content of all Old Assyrian sample texts that are available in our collection.

In [1]:
from numpy.random import default_rng
import re

In [2]:
# Mount Google Drive at /content/drive so that files stored there can be
# read and written from this notebook (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch the working directory to the tesstrain folder on the mounted Drive
# and list its contents; the relative file names used in the cells below
# are resolved against this directory.
%cd /content/drive/MyDrive/AWCA/Colab_notebooks/OCR/pDonovan/awca-ocr/tesstrain/
!ls

/content/drive/.shortcut-targets-by-id/1W2EROe2FItlaK99U-WY_qaBOc2UD_LI0/AWCA/Colab_notebooks/OCR/pDonovan/awca-ocr/tesstrain
cdliatf_unblocked.atf	  jsonzip	      OATP_unpublished.txt
CombineSampleTexts.ipynb  oatp_processed.txt  output
fontlist-ugly.txt	  OATP_published.txt  README.md


## Processing the OATP Dataset

In [7]:
def random_substring(text: str, length: int, rng = default_rng()) -> str:
  """Returns a substring of length `length` selected uniformly at random from
  `text`. (This is uniform in the sense that all possible start and end points
  are selected with equal probability, not in the sense that all text is
  contained in the returned substring with equal probability.)
  :param text: an arbitrary string
  :param length: the length of the desired substring (must not be negative)
  :param rng: a numpy Generator instance. The default generator is created
    once, when this function is defined, and is shared by every call that
    omits `rng`; pass a seeded generator to get reproducible output.
  :return: a slice of `text` that is exactly `length` characters long
  :raises AssertionError: if `length` is negative or exceeds `len(text)`
  """
  assert length >= 0, "length must not be negative."
  assert len(text) >= length, ("text is not long enough to contain a substring "
                              "of the desired length.")
  # `Generator.integers` excludes its upper bound by default, so the bound has
  # to be one past the last valid start index. Without the `+ 1` the final
  # start position could never be drawn, and `len(text) == length` raised a
  # ValueError instead of returning the whole text.
  start = rng.integers(0, len(text) - length + 1)
  return text[start:start+length]

In [5]:
def process_oatp(oatp: str) -> str:
  """Cleans up raw OATP text so that it is better suited to OCR training.

  The steps, applied in order, are: a tab enclosed in square brackets becomes
  two spaces, every remaining tab becomes a line break, `@`-prefixed and
  backslash-prefixed annotation tokens are removed, and runs of newlines
  (together with any whitespace that follows them) collapse to one newline.
  :param oatp: the raw text to clean
  :return: the cleaned text
  """
  # Break on tabs so that huge stretches of text do not end up on one line;
  # the bracketed tab is handled first so that it is not split.
  cleaned = oatp.replace('[\t]', '[  ]')
  cleaned = cleaned.replace('\t', '\n')
  # (pattern, replacement) pairs; order matters because the last rule tidies
  # up the blank lines that the earlier ones leave behind.
  substitutions = (
    (r'@\S*', ''),           # annotations such as @END_FILE
    (r'(^|\s)\\\S+', '\n'),  # annotations such as \documentstyle
    (r'\n+\s*', '\n'),       # redundant newlines
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [8]:
# Read both OATP files and join them into one string. The corpus is full of
# non-ASCII transliteration characters, so decode explicitly as UTF-8 rather
# than relying on whatever the platform's default encoding happens to be.
with open('OATP_published.txt', 'r', encoding='utf-8') as published, \
     open('OATP_unpublished.txt', 'r', encoding='utf-8') as unpublished:
  oatp = published.read() + unpublished.read()
# Spot-check the cleaning step on a random 1000-character excerpt.
print(random_substring(process_oatp(oatp), 1000))

na [ra]-mì-ni-a aš-qúl 
3 5/6 ma-na 3 1/4 GÍN AN.NA 
ša ni-ip-lá-tim i-li-bi-šu-ma 10
1 ma-na 15 GÍN KÙ.BABBAR 
ù 5 TÚG ku-ta-ni 
[A-m]u-ra DUMU Am-ri-a ip-qí-da-šu lo.e.
[x] ma-na 5 GÍN KÙ.BABBAR rev.
ša Puzur4-A-šùr áš-qúl 15
[x ma]-na KÙ.BABBAR 
[
] URUDU 
ša a-lim.ki a-na ša ki-ma 
Pu-šu-ke-en6 / ah-bu-ul 
mì-ma a-nim i-li-bi 20
Pu-šu-ke-en6 / i-šu 
(1 Z. unbeschr.) 
9 ma-na 3 GÍN AN.NA 
ni-ip-la-tim ša a-na-kam 
ù 2/3 ma-na 2 GÍN AN.NA (Ras.) 
ša Tí-me-el-ki-a Dan-A-šur DUMU Ša-lim-a-hi-im 25
5 ma-na 19 GIN u.e.
i-li-bi Bu-ur-Sú-en6 le.e.
4 1/3 ma-na 1/2 GÍN AN.N[A] 
i-li-bi Šu?-x-[x] 
DUMU Ás-qú!-du[m?
] 
x     TC 3, 168 #3 AO 8630 ##  1
2 ma-na AN.NA 
KI I-dí-Ištar 
3 1/2 ma-na AN.NA 
KI En-nam-A-šur sà-ku-ku 5
15 ma-na URUDU SIG5 
iš-tí A-šur-mu-ta-bi4-il5 
6 ma-na LÁ 10 GÍN 
URUDU SIG5 iš-tí lo.e.
A-šur-na-da rev. 10
5 1(3 GÍN AN.NA 
a-šu-mì kà-ri-im 
a-na nu-a-ri-im 
a-dí-in 2 GÚ 46 ma-na 
SÍG.HI.A 10 GÍN AN.NA 15
a-na A-gi5-a e-zi-ib 
5 2/3 ma-na URUDU SIG5 
i-li-bi4 u.e.
I-

In [12]:
# Number of newline characters in the raw corpus, divided by 3000.
# NOTE(review): the meaning of the constant 3000 is not stated in this
# notebook -- confirm it and give it a name.
oatp.count('\n') / 3000

5.8693333333333335

In [136]:
# Save the cleaned corpus. The text contains non-ASCII characters, so the
# encoding is given explicitly instead of depending on the platform default.
with open('oatp_processed.txt', 'w', encoding='utf-8') as out:
  out.write(process_oatp(oatp))

## Accessing the ORACC Dataset

In [None]:
# Download cdliatf_unblocked.atf from the cdli-gh/data GitHub repository
# into the current working directory.
!wget https://raw.githubusercontent.com/cdli-gh/data/master/cdliatf_unblocked.atf

--2021-07-18 01:41:47--  https://raw.githubusercontent.com/cdli-gh/data/master/cdliatf_unblocked.atf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 83581625 (80M) [text/plain]
Saving to: ‘cdliatf_unblocked.atf’


2021-07-18 01:41:50 (38.0 MB/s) - ‘cdliatf_unblocked.atf’ saved [83581625/83581625]



In [None]:
# List the working directory to confirm that the download is present.
!ls

cdliatf_unblocked.atf	  fontlist-ugly.txt   OATP_unpublished.txt
CombineSampleTexts.ipynb  OATP_published.txt  README.md


In [None]:
# Create the `jsonzip` and `output` directories if they do not exist yet.
import os
for directory in ('jsonzip', 'output'):
    os.makedirs(directory, exist_ok=True)

# import necessary libraries
import pandas as pd
from tqdm.auto import tqdm

# import libraries for this section
import requests
import zipfile
import json

In [None]:
# Download the JSON zip of one project into the jsonzip/ directory,
# streaming it to disk in small chunks and showing a progress bar.
project = 'epsd2/admin/ur3' #define project to download.

CHUNK = 1024 #define chunk size

# The zip file is named after the project path with '/' replaced by '-'.
proj = project.replace('/', '-')
url = "http://build-oracc.museum.upenn.edu/json/%s.zip" % proj
file_name = "jsonzip/%s.zip" % proj
with requests.get(url, stream=True) as request:
    if request.status_code == 200:
        #if file is accessible, download
        tqdm.write('Saving ' + url + ' as ' + file_name)
        # Falls back to 0 (bar without a known total) when the server sends
        # no content-length header.
        total_size = int(request.headers.get('content-length', 0))
        # Use the progress bar as a context manager so that it is always
        # closed, even if the download is interrupted part-way through.
        with tqdm(total=total_size, unit='B', unit_scale=True, desc = project) as tqdm_handler, \
             open(file_name, 'wb') as zip_file:
            #use tqdm to show download speed
            for chunk in request.iter_content(chunk_size=CHUNK):
                tqdm_handler.update(len(chunk))
                zip_file.write(chunk)
    else:
        # NOTE(review): any non-200 status lands here, not only "not found".
        tqdm.write("WARNING: %s does not exist." % url)

Saving http://build-oracc.museum.upenn.edu/json/epsd2-admin-ur3.zip as jsonzip/epsd2-admin-ur3.zip


HBox(children=(FloatProgress(value=0.0, description='epsd2/admin/ur3', max=630772632.0, style=ProgressStyle(de…

In [None]:
# Request the project list from oracc.org; the bare `result` on the last
# line makes the cell display the response object (and thus its status).
result = requests.get('http://oracc.org/projects.json')
result

<Response [200]>

In [None]:
# Decode the response body as JSON and print the whole structure.
print(result.json())

{'type': 'projects', 'public': ['adsd', 'adsd/adart1', 'adsd/adart2', 'adsd/adart3', 'adsd/adart6', 'aemw', 'aemw/alalakh/idrimi', 'aemw/amarna', 'aemw/ugarit', 'akklove', 'amgg', 'ario', 'armep', 'arrim', 'asbp', 'asbp/ninmed', 'asbp/rlasb', 'atae', 'atae/assur', 'atae/burmarina', 'atae/durkatlimmu', 'atae/guzana', 'atae/huzirina', 'atae/imgurenlil', 'atae/kalhu', 'atae/mallanate', 'atae/marqasu', 'atae/nineveh', 'atae/samal', 'atae/szibaniba', 'atae/tilbarsip', 'blms', 'btto', 'cams', 'cams/akno', 'cams/anzu', 'cams/barutu', 'cams/etana', 'cams/gkab', 'cams/ludlul', 'cams/selbi', 'cams/tlab', 'caspo', 'caspo/akkpm', 'ccpo', 'cdli', 'ckst', 'cmawro', 'cmawro/cmawr1', 'cmawro/cmawr2', 'cmawro/cmawr3', 'cmawro/maqlu', 'contrib', 'contrib/amarna', 'contrib/lambert', 'ctij', 'dcclt', 'dcclt/ebla', 'dcclt/jena', 'dcclt/nineveh', 'dcclt/signlists', 'dccmt', 'doc', 'dsst', 'ecut', 'epsd2/issl', 'etcsri', 'glass', 'hbtin', 'issl', 'lacost', 'lovelyrics', 'neo', 'nimrud', 'obmc', 'obta', 'ogsl

In [None]:
# Request corpus.json under the `rimanum` path on oracc.org and keep the
# response for inspection in the next cell.
rimanum = requests.get('http://oracc.org/rimanum/corpus.json')

In [None]:
# Show the raw bytes of the response body.
rimanum.content

b''