#Translations Extraction

Notebook by Melinee Her

Exports translated texts from ORACC projects.

Here is an example of a translated text and its corresponding url:

http://oracc.iaas.upenn.edu/adsd/adart5/X500001/html

# Mount Google Drive folder + imports + steps

In [1]:
#mounts Google Drive inside the Colab runtime at /content/drive;
#every path used later in this notebook starts with that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#any necessary imports
import pandas as pd
import zipfile
from zipfile import ZipFile
import json
import requests
from tqdm import tqdm
import os
import errno
import re
import random
import numpy as np
import sys
import copy
import networkx as nx
from pathlib import Path
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

#Set folder for remote drive
#(the commented-out line below is an alternative location that is no longer used)
#folder = '/content/drive/My Drive/FactGrid Cuneiform (AWCA)/people/Melinee'
#NOTE: the trailing slash matters - later cells build paths with plain string
#concatenation (folder + 'subdir/...')
folder = '/content/drive/MyDrive/Melinee/'

#importing utils for the method which downloads the current text json files
#the working directory is switched first, presumably so that the local
#module utils.py can be found by the import below - TODO confirm
os.chdir(folder + 'network/utils/')
from utils import oracc_download

# This is a user defined module that searches through the texts to find the entities in the text that
# are people and places, to be imported as nodes into the network
# (same pattern as above: change directory, then import the local module)
os.chdir(folder + 'network/')
import rank_parser4 as rp

The steps:
1. retrieve the megacatalogue
2. access texts with a url in a similar form to: 'http://oracc.iaas.upenn.edu/[project]/[id_text]/html'
3. analyze the text and make sure it is non empty
4. export every text that has a translation to ORACC_TRANSLATIONS as a txt file.

        expected to be fine:
        'aemw/amarna', 'adsd/adart1', 'adsd/adart2','adsd/adart3',
        'aemw/alalakh/idrimi', 'dcclt/nineveh', 'dcclt/signlists',
        'blms','btto', 'eisl', 'dcclt','cams/barutu', 'dccmt',
        'asbp/ninmed', 'glass', 'adsd/adart6', 'akklove',
        'adsd/adart5', 'asbp/rlasb', 'ario', 'dcclt/ebla', 'obmc',
        'tcma/assur', 'ckst', 'cams/gkab', 'tcma/kulishinas',
        'tcma/miscellaneous','tcma/nippur', 'tcma/laws', 'tcma/kartn',
        'tcma/hatti','tcma/bazmusian', 'tcma/tsh1', 'tcma/billa',
        'tcma/rimah','tcma/giricano', 'tcma/fekheriye', 'tcma/nuzi',
        'tcma/chuera','tcma/hana', 'riao', 'etcsri', 'rinap/rinap4',
        'rinap/rinap1', 'rinap/rinap3', 'rinap/rinap5', 'ribo/babylon10',
        'ribo/babylon7', 'suhu', 'ribo/babylon2', 'ribo/babylon3',
        'ribo/babylon4', 'ribo/babylon5', 'rinap/rinap2',  'ribo/babylon6',
        'ecut', 'saao/saa03', 'tcma/ali1', 'tcma/suri', 'tcma/haradum',
        'tcma/qitar', 'tcma/barri', 'tcma/emar', 'tcma/taban', 'tcma/tsa1',
        'saao', 'hbtin', 'dsst', 'ctij', 'obta', 'rimanum',
        'saao/saa04', 'tcma/ugarit', 'tcma/amarna','tcma/nineveh'

        likely no print text or eng translation:
        'asbp','rinap/sources', 'cams/ludlul', 'cams/etana',
        'cdli', 'ribo/sources', 'cams', 'cams/absumu',
        'epsd2/admin/ed3a', 'epsd2/admin/ed12', 'epsd2/royal',
        'epsd2/admin/ed3b', 'epsd2/earlylit',
        'epsd2/praxis', 'epsd2/admin/oakk', 'epsd2/admin/ur3',
        'epsd2/admin/lagash2', 'epsd2/admin/oldbab', 'epsd2/praxis/varia',
        'epsd2/literary', 'epsd2/praxis/liturgy', 'rinap/rinap5p1',
        'qcat', 'xcat'

        multiproject (difficult to scrape):
        'aremp', 'cmawro'

        complicated translations:
        for 'ccpo' the form is: https://ccp.yale.edu/P461274
        for 'caspo' the form is: http://oracc.museum.upenn.edu/caspo/P413957/html
            within class='tabcontent' under pre style
        for 'caspo/akkpm':  http://oracc.museum.upenn.edu/caspo/akkpm/P269974/html
        ````within class='tabcontent' under pre style
        for 'obel' the form is:
        http://oracc.museum.upenn.edu/obel//P342882/html
        ````(different url and double forward slash before id)
        for 'babcity' the form is:
        http://oracc.museum.upenn.edu/babcity//P531001/html
        ````(different url and double forward slash before id)
        for 'borsippa' the form is:
        http://oracc.iaas.upenn.edu/borsippa//P521512/html
        ````(same url but double forward slash before id)
        for 'lacost':
          translations not formatted as table
        for 'nere' the form is:
        http://oracc.museum.upenn.edu/nere//Q009326/html
        ````(different url and double forward slash before id)

        fix for atae/:
        'tilbarsip','assurmisc', 'stat2', 'stat1',
        'wvdog152', 'stat3', 'rfdn17','saab0509', 'tuszhan',
        'szibaniba', 'marqasu', 'guzana','imgurenlil',
        'durkatlimmu', 'mallanate', 'huzirina', 'burmarina',
        'ctn2', 'ctn3', 'ctn1',  'ctn6', 'edubba10', 'samal'

        fix for saao/:
        'saa03','saa20', 'saas2', 'saa12', 'saa13', 'saa02',
        'saa14', 'saa18', 'saa21', 'saa08', 'saa17', 'saa04',
        'saa10', 'saa15','saa06', 'saa11', 'saa16', 'saa07',
        'saa01', 'saa05', 'saa09','saa19'

Retrieving the megacatalogue from drive

In [3]:
#location of the catalogue csv on the mounted drive
path = '/content/drive/MyDrive/Melinee/ORACC_DFS/megacatalogue.csv'
#low_memory=False makes pandas read the file in one pass instead of in chunks;
#index_col=False stops pandas from using the first column as the index
megacatalogue = pd.read_csv(path, low_memory=False, index_col=False)

In [None]:
#preview the first three rows to check that the catalogue loaded
megacatalogue.head(3)

Unnamed: 0.1,Unnamed: 0,id_text,langs,project,id_text.1,primary_publication,provenience,pleiades_id,pleiades_coord,excavation_no,...,Q_places,parallels,new_q,subseries,subseries_section,description,oracc_id,sec1,chap,sec2
0,0,P522592,0x08000000,tilbarsip,P522592,Til-Barsip 01,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 01,...,,,,,,,,,,
1,1,P522593,0x08000000,tilbarsip,P522593,Til-Barsip 02,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 02,...,,,,,,,,,,
2,2,P522594,0x08000000,tilbarsip,P522594,Til-Barsip 03,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 03,...,,,,,,,,,,


In [4]:
#These two lines of code are for viewing unique projects and their corresponding texts
#megacatalogue['project'].unique()
#megacatalogue[megacatalogue['project'] == 'tcma/nineveh']['id_text'].unique()

#Sub-projects that the catalogue lists by their bare name. The text urls built
#later in this notebook need them in the form 'atae/<name>' or 'saao/<name>'.
fix_atae = ['tilbarsip','assurmisc', 'stat2', 'stat1', 'wvdog152', 'stat3', 'rfdn17','saab0509', 'tuszhan',
            'szibaniba', 'marqasu', 'guzana','imgurenlil', 'durkatlimmu', 'mallanate', 'huzirina', 'burmarina',
            'ctn2', 'ctn3', 'ctn1',  'ctn6', 'edubba10', 'samal']

fix_saao = ['saa03','saa20', 'saas2', 'saa12', 'saa13', 'saa02', 'saa14', 'saa18', 'saa21', 'saa08', 'saa17', 'saa04',
            'saa10', 'saa15','saa06', 'saa11', 'saa16', 'saa07', 'saa01', 'saa05', 'saa09','saa19']

#bare name -> main+sub, e.g. 'tilbarsip' -> 'atae/tilbarsip'
project_prefixes = {name: 'atae/' + name for name in fix_atae}
project_prefixes.update({name: 'saao/' + name for name in fix_saao})

#add new column 'project_name': a copy of 'project' in which every bare name from
#the two lists is replaced by its prefixed form. Only whole-value matches are
#replaced, so entries that already carry a prefix (e.g. 'saao/saa03') are left
#alone. One replace call with the full mapping does this in a single pass over
#the column instead of one pass per name.
megacatalogue['project_name'] = megacatalogue['project'].replace(project_prefixes)

megacatalogue.shape

(171145, 406)

##Automate Extraction of Texts

In [None]:
#5-6 hour program
#For every row of the megacatalogue: download the html page of the text, join
#the text of all <p class="tr"> elements (the translation) and save it as
#'<project>-<id_text>.txt'. Texts whose page has no such element get no file.
#Texts whose page cannot be downloaded are reported and skipped, so that one
#bad url does not abort the whole run.
from http.client import HTTPException

#changes directory to English Translations folder
os.chdir(folder + 'ORACC_TRANSLATIONS/ENG_TRANSLATIONS')

df = megacatalogue

#the two ORACC host names used in this notebook, tried in this order
oracc_hosts = ['http://oracc.iaas.upenn.edu/', 'http://oracc.museum.upenn.edu/']

#zip walks both columns row by row and does not depend on the index labels
for proj, id_txt in zip(df['project_name'], df['id_text']):
  #a missing value (NaN) in either column cannot be turned into a url
  if not isinstance(proj, str) or not isinstance(id_txt, str):
    continue

  dash_proj = proj.replace("/", "-")
  print(dash_proj, id_txt)  #to keep track of progress

  html = None
  last_error = None
  for host in oracc_hosts:
    try:
      #the timeout keeps one stalled connection from hanging the whole run;
      #'with' closes the connection as soon as the page has been read
      with urlopen(host + proj + '/' + id_txt + '/html', timeout=60) as url:
        html = url.read()
      break
    except (OSError, HTTPException) as err:
      #HTTP errors (e.g. 404), unreachable hosts and timeouts are OSError
      #subclasses; HTTPException covers broken or incomplete responses
      last_error = err

  if html is None:
    print('  skipped, page could not be read:', last_error)
    continue

  soup = BeautifulSoup(html, 'html.parser')

  #translation
  transtext = ''.join(p.get_text() for p in soup.find_all('p', class_ = 'tr'))

  if transtext:
    #utf-8 is given explicitly because translations contain non-ASCII characters
    with open(dash_proj + '-' + id_txt + '.txt', "w", encoding="utf-8") as txt:
      txt.write(transtext)

atae-tilbarsip P522592
atae-tilbarsip P522593
atae-tilbarsip P522594
atae-tilbarsip P522595
atae-tilbarsip P522596
atae-tilbarsip P522597
atae-tilbarsip P522598
atae-tilbarsip P522599
atae-tilbarsip P522600
atae-tilbarsip P522601
atae-tilbarsip P522602
atae-tilbarsip P522603
atae-tilbarsip P522604
atae-tilbarsip P522605
atae-tilbarsip P522606
atae-tilbarsip P522607
atae-tilbarsip P522608
atae-tilbarsip P522609
atae-tilbarsip P522610
atae-tilbarsip P522611
atae-tilbarsip P522612
atae-tilbarsip P522613
aemw-amarna P270838
aemw-amarna P270839
aemw-amarna P270840
aemw-amarna P270841
aemw-amarna P270842
aemw-amarna P270843
aemw-amarna P270844
aemw-amarna P270845
aemw-amarna P270846
aemw-amarna P270847
aemw-amarna P270848
aemw-amarna P270849
aemw-amarna P270850
aemw-amarna P270851
aemw-amarna P270853
aemw-amarna P270854
aemw-amarna P270855
aemw-amarna P270856
aemw-amarna P270857
aemw-amarna P270858
aemw-amarna P270859
aemw-amarna P270861
aemw-amarna P270862
aemw-amarna P270863
aemw-amarna P2

##EXAMPLE: Get the English Translation of a Single Text

In [10]:
#changes directory to the translations folder (the example file written in the
#next cell goes here, not into ENG_TRANSLATIONS)
os.chdir(folder + 'ORACC_TRANSLATIONS')

#given a url, parse through and create a soup of html text
#'with' closes the connection once the page has been read
with urlopen('http://oracc.iaas.upenn.edu/adsd/adart5/X500001/html') as url:
  html = url.read()
soup = BeautifulSoup(html, 'html.parser')

#create a variable 'soupy' that contains all elements of the p class translations
soupy = soup.find_all('p', class_ = 'tr')

#get_text() only works on a single element, so it is applied to each one in turn:
#text_list keeps one string per translation line, text is all of them joined
text_list = [string.get_text() for string in soupy]
text = ''.join(text_list)

text_list

['(o 0) At the command of Bēl and Bēltiya may it go well.',
 '(o 1) 1.40. Accession year [of ...]',
 '(o 2) Month XII, (after) 5 months, the 14th, morning watch, it made (an eclipse)? ... [...]',
 '(o 3) 2.10. Year 1. Month VI, [the] ⸢15th?⸣, it made (an eclipse)?. It began in the north [...]',
 '(o 4) [...] the south wind blew. It set eclipsed. Month VI was in[tercalary.]',
 '(o 5) [Month XI, the 1]4th, it made (an eclipse)?. 1.40° re[mained?] to clearing.',
 '(o 6) [Year 2. Month] V, the 14th, it made a total (eclipse).',
 '(o 7) [Month XI,] omitted.',
 '(o 8) [Year 3. Month V, omitt]ed. Month VI was intercalary.',
 '(o 9) [...] total? [...]',
 '(o 10) [...] ... [...]']

In [None]:
#uploads the text into google drive as a txt file
#'with' closes the file even if the write fails; utf-8 is given explicitly
#because the translation contains non-ASCII characters (e.g. ē, ⸢)
with open("adsd_adart5_X500001.txt", "w", encoding="utf-8") as txt:
  txt.write(text)

## Just for ETCSRI


In [None]:
#Same extraction as for the whole catalogue, restricted to the 'etcsri' project:
#download each text page, join the text of all <p class="tr"> elements and save
#it as 'etcsri-<id_text>.txt'. Pages that cannot be downloaded are reported and
#skipped; texts without a translation get no file.
from http.client import HTTPException

#changes directory to English Translations folder
os.chdir(folder + 'ORACC_TRANSLATIONS/ENG_TRANSLATIONS')

project = megacatalogue[megacatalogue['project_name'] == 'etcsri']

#the same for every text, so set once before the loop
proj = 'etcsri'
dash_proj = proj.replace("/", "-")

for i in project['id_text']:
  #a missing id (NaN) cannot be turned into a url
  if not isinstance(i, str):
    continue

  #only the download is inside the try, and only download errors are caught,
  #so the cell can still be interrupted and other bugs are not hidden
  try:
    with urlopen('http://oracc.iaas.upenn.edu/' + proj + '/' + i + '/html') as url:
      html = url.read()
  except (OSError, HTTPException) as err:
    #HTTP errors (e.g. 404) and unreachable hosts are OSError subclasses
    print('skipped', i, '-', err)
    continue

  soup = BeautifulSoup(html, 'html.parser')
  soupy = soup.find_all('p', class_ = 'tr')
  text = ''.join(string.get_text() for string in soupy)

  if text:
    #utf-8 is given explicitly because translations contain non-ASCII characters
    with open(dash_proj + '-' + i + '.txt', "w", encoding="utf-8") as txt:
      txt.write(text)

Q000376 etcsri
Q000377 etcsri
Q000613 etcsri
Q000834 etcsri
Q000835 etcsri
Q000842 etcsri
Q000844 etcsri
Q000871 etcsri
Q000872 etcsri
Q000873 etcsri
Q000874 etcsri
Q000876 etcsri
Q000877 etcsri
Q000879 etcsri
Q000881 etcsri
Q000882 etcsri
Q000883 etcsri
Q000884 etcsri
Q000885 etcsri
Q000886 etcsri
Q000887 etcsri
Q000888 etcsri
Q000889 etcsri
Q000890 etcsri
Q000891 etcsri
Q000892 etcsri
Q000893 etcsri
Q000894 etcsri
Q000895 etcsri
Q000896 etcsri
Q000897 etcsri
Q000898 etcsri
Q000899 etcsri
Q000900 etcsri
Q000901 etcsri
Q000902 etcsri
Q000903 etcsri
Q000904 etcsri
Q000905 etcsri
Q000906 etcsri
Q000907 etcsri
Q000908 etcsri
Q000909 etcsri
Q000910 etcsri
Q000911 etcsri
Q000912 etcsri
Q000913 etcsri
Q000914 etcsri
Q000915 etcsri
Q000916 etcsri
Q000917 etcsri
Q000918 etcsri
Q000919 etcsri
Q000920 etcsri
Q000921 etcsri
Q000922 etcsri
Q000923 etcsri
Q000924 etcsri
Q000925 etcsri
Q000926 etcsri
Q000927 etcsri
Q000928 etcsri
Q000929 etcsri
Q000930 etcsri
Q000931 etcsri
Q000932 etcsri
Q000933 et