# Megacatalogue

Notebook by Melinee Her

Creates a large DataFrame of all project catalogues. Adds a column with the umbrella folder name to keep track of origin.


# Mount Google Drive folder + imports + steps

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; every path used
# later in this notebook is rooted under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#any necessary imports
import pandas as pd
import zipfile
from zipfile import ZipFile
import json
import requests
from tqdm import tqdm
import os
import errno
import re
import random
import numpy as np
import sys
import copy
import networkx as nx
from pathlib import Path

#Set folder for remote drive
#(the commented-out line below is an alternative location of the same project folder)
#folder = '/content/drive/My Drive/FactGrid Cuneiform (AWCA)/people/Melinee'
#NOTE: keep the trailing slash - paths below are built by plain string concatenation (folder + '...')
folder = '/content/drive/MyDrive/Melinee/'

#importing utils for the method which downloads the current text json files
#the working directory is changed first, presumably so that the local module can be found by `import`
os.chdir(folder + 'network/utils/')
from utils import oracc_download

# This is a user defined module that searches through the texts to find the entities in the text that
# are people and places, to be imported as nodes into the network
# NOTE: after this cell the working directory stays at folder + 'network/'
os.chdir(folder + 'network/')
import rank_parser4 as rp

The steps:
1. Use the folder of previously created project DataFrames (ORACC_PROJECT_DFS; in this notebook the path `ORACC_DFS/PROJECT_DFS/`) to access the catalogue DataFrame (`df_cat`) of every project
2. Merge all dataframes into a large dataframe
3. Sort by projects

## List of all Projects

In [None]:
#from COLAB "DataFrame Extraction", a list of all projects in the ORACC_PROJECT_DFS folder
#each entry is a project path; sub-projects are written as 'parent/child'
#the projects with some empty files: ario, ctij, cmawro/cmawr3, ecut, lacost, saao/saa04, saao/saa07
#projects alphabetical, starts with 'a'
p1 = ['adsd','adsd/adart1','adsd/adart2','adsd/adart3','adsd/adart5','adsd/adart6','aemw/alalakh/idrimi','aemw/amarna','akklove', 'ario',
      'armep','asbp','asbp/ninmed','asbp/rlasb','atae','atae/assur','atae/burmarina','atae/durkatlimmu',
      'atae/guzana','atae/huzirina','atae/imgurenlil','atae/kalhu','atae/mallanate','atae/marqasu',
      'atae/nineveh','atae/samal','atae/szibaniba','atae/tilbarsip','atae/tuszhan']


#projects alphabetical, starts with 'b' through 'e'
p2 = ['babcity','blms','borsippa','btmao','btto','cams','cams/akno','cams/anzu','cams/barutu','cams/etana','cams/ludlul',
      'cams/selbi','cams/tlab','ckst','cmawro','cmawro/cmawr1','cmawro/cmawr2','cmawro/cmawr3', 'cmawro/maqlu','contrib/amarna', 'ctij',
      'dcclt','dcclt/ebla','dcclt/jena','dcclt/nineveh','dcclt/signlists','dccmt','dsst', 'ecut', 'eisl','epsd2','etcsri',]


#projects alphabetical, 'g' through 'r'
p3 = ['glass','hbtin','lacost','nere','obel','obmc','obta','oimea','qcat','riao',
      'ribo','ribo/bab7scores','ribo/babylon10','ribo/babylon2','ribo/babylon3','ribo/babylon4','ribo/babylon5','ribo/babylon6',
      'ribo/babylon7','ribo/babylon8','ribo/sources','rimanum','rinap','rinap/rinap1','rinap/rinap2','rinap/rinap3',
      'rinap/rinap4','rinap/rinap5','rinap/rinap5p1','rinap/scores','rinap/sources',]


#projects alphabetical, 's' through 'x'
p4 = ['saao','saao/aebp','saao/knpp','saao/saa01','saao/saa02','saao/saa03','saao/saa04','saao/saa05','saao/saa06', 'saao/saa07','saao/saa08',
      'saao/saa09','saao/saa10','saao/saa11','saao/saa12','saao/saa13','saao/saa14','saao/saa15','saao/saa16','saao/saa17',
      'saao/saa18','saao/saa19','saao/saa20','saao/saa21','saao/saas2','suhu','tcma','tsae','xcat']

#the buried projects (all of these are sub-projects of epsd2)
p5 = ['epsd2/earlylit', 'epsd2/literary', 'epsd2/praxis', 'epsd2/praxis/liturgy','epsd2/admin/ed12', 'epsd2/admin/ed3b', 'epsd2/admin/lagash2',
      'epsd2/admin/oakk', 'epsd2/admin/oldbab', 'epsd2/admin/ur3']

#sub-projects of tcma
p6 =  ["tcma/ali1","tcma/amarna","tcma/assur","tcma/barri","tcma/bazmusian","tcma/billa", "tcma/brak","tcma/chuera","tcma/emar",
      "tcma/fekheriye","tcma/giricano","tcma/hana","tcma/haradum","tcma/hatti","tcma/kalhu","tcma/kartn","tcma/kulishinas",
      "tcma/miscellaneous","tcma/nineveh","tcma/nippur","tcma/nuzi","tcma/qitar","tcma/rimah","tcma/suri",
      "tcma/taban","tcma/tsa1","tcma/tsh1","tcma/ugarit"]

#complete project list; the order of this list is the order in which the catalogues are read and merged
pfinal = p1+p2+p3+p4+p5+p6

# Final DataFrame


In [None]:
#folder holding one catalogue CSV per project, and the file-name suffix of those CSVs
general_path = '/content/drive/MyDrive/Melinee/ORACC_DFS/PROJECT_DFS/'
pluscsv = '-df-cat.csv'

#this function is also in colab 3-Large DF of Unique Words
#however, this version adds a 'project name' column recording which catalogue file each row came from
def createlargedf(lst, base_path=None, suffix=None):
  """Merge the catalogue CSVs of the given projects into one DataFrame.

  For every project path in `lst` (e.g. 'adsd/adart1') the file
  `<base_path>/<project with '/' replaced by '-'><suffix>` is read, a
  'project name' column holding the project path is attached, and all
  frames are concatenated. Rows are then de-duplicated on 'id_text',
  keeping the first occurrence, so a text is attributed to the project
  that appears earliest in `lst`.

  Args:
    lst: project paths, in the order they should be read.
    base_path: folder containing the CSV files; defaults to the
      module-level `general_path`.
    suffix: file-name suffix of the CSV files; defaults to the
      module-level `pluscsv`.

  Returns:
    The merged DataFrame with a fresh 0..n-1 index, or an empty
    DataFrame when `lst` is empty.

  Raises:
    FileNotFoundError: if a project's CSV file does not exist.
    KeyError: if the merged data has no 'id_text' column.
  """
  base_path = general_path if base_path is None else base_path
  suffix = pluscsv if suffix is None else suffix

  frames = []
  for proj in lst:
    print(proj)                     #progress indicator; comment out to silence
    path = os.path.join(base_path, proj.replace("/", "-") + suffix)
    df = pd.read_csv(path, low_memory=False, index_col=False)
    #tag every row with the catalogue it was read from, to keep track of origin
    df['project name'] = proj
    frames.append(df)

  if not frames:
    return pd.DataFrame()

  #concatenate once and de-duplicate once: re-concatenating the growing frame on
  #every iteration copies all previous rows each time and is what makes the merge slow
  merged = pd.concat(frames, ignore_index=True)
  return merged.drop_duplicates(subset=['id_text']).reset_index(drop=True)

In [None]:
#concatenates the catalogues of every project in pfinal into one DataFrame
#(took ~16 minutes when this notebook was originally run; progress is printed per project)
megacatalogue = createlargedf(pfinal)

adsd
adsd/adart1
adsd/adart2
adsd/adart3
adsd/adart5
adsd/adart6
aemw/alalakh/idrimi
aemw/amarna
akklove
ario
armep
asbp
asbp/ninmed
asbp/rlasb
atae
atae/assur
atae/burmarina
atae/durkatlimmu
atae/guzana
atae/huzirina
atae/imgurenlil
atae/kalhu
atae/mallanate
atae/marqasu
atae/nineveh
atae/samal
atae/szibaniba
atae/tilbarsip
atae/tuszhan
babcity
blms
borsippa
btmao
btto
cams
cams/akno
cams/anzu
cams/barutu
cams/etana
cams/ludlul
cams/selbi
cams/tlab
ckst
cmawro
cmawro/cmawr1
cmawro/cmawr2
cmawro/cmawr3
cmawro/maqlu
contrib/amarna
ctij
dcclt
dcclt/ebla
dcclt/jena
dcclt/nineveh
dcclt/signlists
dccmt
dsst
ecut
eisl
epsd2
etcsri
glass
hbtin
lacost
nere
obel
obmc
obta
oimea
qcat
riao
ribo
ribo/bab7scores
ribo/babylon10
ribo/babylon2
ribo/babylon3
ribo/babylon4
ribo/babylon5
ribo/babylon6
ribo/babylon7
ribo/babylon8
ribo/sources
rimanum
rinap
rinap/rinap1
rinap/rinap2
rinap/rinap3
rinap/rinap4
rinap/rinap5
rinap/rinap5p1
rinap/scores
rinap/sources
saao
saao/aebp
saao/knpp
saao/saa01
saao/saa

In [None]:
#preview the first three rows of the merged catalogue
megacatalogue.head(3)

Unnamed: 0,id_text,langs,project,id_text.1,primary_publication,provenience,pleiades_id,pleiades_coord,excavation_no,archive,...,Q_places,parallels,new_q,subseries,subseries_section,description,oracc_id,sec1,chap,sec2
0,P522592,0x08000000,tilbarsip,P522592,Til-Barsip 01,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 01,001 - Hanni Archive (House C1),...,,,,,,,,,,
1,P522593,0x08000000,tilbarsip,P522593,Til-Barsip 02,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 02,001 - Hanni Archive (House C1),...,,,,,,,,,,
2,P522594,0x08000000,tilbarsip,P522594,Til-Barsip 03,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 03,001 - Hanni Archive (House C1),...,,,,,,,,,,


In [None]:
#scratch queries kept for reference:
#megacatalogue[megacatalogue['project'] == 'tilbarsip']
#megacatalogue['project'].unique(), megacatalogue['project name'].unique()

#spot-check: for rows whose catalogue 'project' field is 'saab0509', show the
#origin-tracking 'project name' column next to the catalogue's own 'project' value
megacatalogue[megacatalogue['project'] == 'saab0509'][['project name', 'project', 'id_text']]
#other values worth checking: atae/tilbarsip, saao/saa03,

Unnamed: 0,project name,project,id_text
1493,adsd,saab0509,P528224
1494,adsd,saab0509,P528225
1495,adsd,saab0509,P528226
1496,adsd,saab0509,P528227
1497,adsd,saab0509,P528228
...,...,...,...
1632,adsd,saab0509,P528363
1633,adsd,saab0509,P528364
1634,adsd,saab0509,P528365
30032,adsd,saab0509,P528366


In [None]:
#saves the megacatalogue as megacatalogue.csv in the ORACC_DFS folder on Drive
#NOTE(review): the DataFrame index is written as an unnamed first column (to_csv default);
#it will come back as 'Unnamed: 0' when the file is re-read - confirm this is wanted
megacatalogue.to_csv(folder + 'ORACC_DFS/megacatalogue.csv')

## This workflow accomplished:

The creation of a large dataframe of ORACC catalogue data.

### Next Steps:

1. Resolving duplicate fields
  * lowercase all fields and merge duplicates (as long as there are no merge conflicts)
2. Resolving date fields
  * cross-validate & harmonize the date fields between ORACC projects:
  1. `ruler`
  2. `date_of_origin`
  3. `date`
  4. `long_date`
  5. `date_gen`
  6. `day`
  7. `long_date_gen`
  8. `month`
  9. `year`
  10. `eponym`
  11. `regnal_dates`
  12. `ancient_year`
  13. `date_bce`
  14. `months_recorded`
  15. `date_comments`
  16. `proposed_date`
  17. `eponym_title`
  18. `astron_date`
  19. `Reg_year`
  20. `Reg_no`
  21. `Ruler`
  22. `Day`
  23. `Month`
  24. `Year`
  25. `dynastic_seat`
  26. `date remarks`
  27. `year_name_eponym`
  28. `ancient_date`
  29. `century`
  30. `modern_converted_date`
  31. `accounting_period`
  * We can use this Chronology notebook to check against the CDLI dates: https://colab.research.google.com/drive/1ZYWIapSC6za-WJd6EOA7xjSm6o6CPKPu?usp=sharing

3. Harmonizing with the CDLI catalogue
  1. [GitHub repo cdli_cat.csv](https://github.com/cdli-gh/data/blob/master/cdli_cat.csv)
  * [Zenodo](https://zenodo.org/record/6975724) (should be same as above)
  2. Processed & cleaned CDLI catalogue subset: https://github.com/ancient-world-citation-analysis/CDLI2LoD

4. Formatting for LOD in FactGrid
  * Example from ORACC: http://oracc.museum.upenn.edu/epsd2/admin/ur3/P123456
    * Dates Referenced: SH44 - 01 - 26
    * SH = Šulgi

|id_text|Ruler|Year_number|Month|Day|Earliest_P41|Latest_P43|
|--|--|--|--|--|--|--|
|P123456|Šulgi|44|01|26|Earliest|Latest|

