# Separating ORACC Projects by their Language
Notebook by Melinee Her

This notebook is split into two different parts:

1) Splitting the megacatalogue into separate language dataframes and identifying the projects that fall into these languages.

2) Splitting the finaldf (final wordsdf) into separate language dataframes. This will capture all of the words identified for a specific language/dialect.

The goal of this notebook is to categorize the ORACC projects by their different languages to assist the machine learning translation project.

# Mount Google Drive folder

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# Every file path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#any necessary imports
import pandas as pd
import zipfile
from zipfile import ZipFile
import json
import requests
from tqdm import tqdm
import os
import errno
import re
import random
import numpy as np
import sys
import copy
import networkx as nx
from pathlib import Path
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

#Set folder for remote drive (root of this project's files on the mounted Drive)
#folder = '/content/drive/My Drive/FactGrid Cuneiform (AWCA)/people/Melinee'
folder = '/content/drive/MyDrive/Melinee/'

#importing utils for the method which downloads the current text json files
# os.chdir changes the working directory of the whole process; presumably this is
# what lets the local `utils` module be found by the import below -- TODO confirm.
os.chdir(folder + 'network/utils/')
from utils import oracc_download

# This is a user defined module that searches through the texts to find the entities in the text that
# are people and places, to be imported as nodes into the network
# NOTE(review): the working directory stays at folder + 'network/' after this cell.
# Neither `oracc_download` nor `rp` is used in the cells visible in this notebook.
os.chdir(folder + 'network/')
import rank_parser4 as rp

# Splitting the Megacatalogue by Language and Text

Retrieving the megacatalogue from drive

In [None]:
# Read the megacatalogue CSV from the mounted Drive folder into a dataframe.
path = '/content/drive/MyDrive/Melinee/ORACC_DFS/megacatalogue.csv'
megacatalogue = pd.read_csv(
    path,
    index_col=False,   # do not use the first column as the index
    low_memory=False,  # parse the file in one pass instead of in chunks
)

In [None]:
# Sub-projects of atae and saao are listed in the 'project' column under their
# bare sub-project name (e.g. 'tilbarsip'). The new 'project_name' column stores
# them as '<main>/<sub>' (e.g. 'atae/tilbarsip'); all other values are copied as-is.

fix_atae = ['tilbarsip','assurmisc', 'stat2', 'stat1', 'wvdog152', 'stat3', 'rfdn17','saab0509', 'tuszhan',
            'szibaniba', 'marqasu', 'guzana','imgurenlil', 'durkatlimmu', 'mallanate', 'huzirina', 'burmarina',
            'ctn2', 'ctn3', 'ctn1',  'ctn6', 'edubba10', 'samal']

fix_saao = ['saa03','saa20', 'saas2', 'saa12', 'saa13', 'saa02', 'saa14', 'saa18', 'saa21', 'saa08', 'saa17', 'saa04',
            'saa10', 'saa15','saa06', 'saa11', 'saa16', 'saa07', 'saa01', 'saa05', 'saa09','saa19']

# Start from a copy of 'project' so the original column stays untouched.
megacatalogue['project_name'] = megacatalogue['project']

# Rewrite every exact match of a sub-project name as main+sub.
for prefix, subprojects in (('atae/', fix_atae), ('saao/', fix_saao)):
  for sub in subprojects:
    megacatalogue['project_name'] = megacatalogue['project_name'].replace(sub, prefix + sub)

In [None]:
# Drop columns that are entirely null, then the leftover 'Unnamed: 0' column
# (presumably the row index written out when the CSV was saved -- TODO confirm).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'Unnamed: 0' has already been removed.
megacatalogue = megacatalogue.dropna(axis='columns', how='all')
megacatalogue = megacatalogue.drop(columns=['Unnamed: 0'], errors='ignore')
megacatalogue.shape

(171145, 339)

In [None]:
# Preview the first three rows; 'project_name' appears as the last column.
megacatalogue.head(3)

Unnamed: 0,id_text,langs,project,id_text.1,primary_publication,provenience,pleiades_id,pleiades_coord,excavation_no,archive,...,deity,museum_URL,Delnero_remarks,Cohen_balag,external_URL_name,external_URL,google_earth_provenience,alternative_years,oracc_id,project_name
0,P522592,0x08000000,tilbarsip,P522592,Til-Barsip 01,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 01,001 - Hanni Archive (House C1),...,,,,,,,,,,atae/tilbarsip
1,P522593,0x08000000,tilbarsip,P522593,Til-Barsip 02,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 02,001 - Hanni Archive (House C1),...,,,,,,,,,,atae/tilbarsip
2,P522594,0x08000000,tilbarsip,P522594,Til-Barsip 03,Tell Ahmar (Til Barsip),658410.0,"[38.1191944, 36.6749623]",T 03,001 - Hanni Archive (House C1),...,,,,,,,,,,atae/tilbarsip


In [None]:
#The list of unique documented languages in the megacatalogue
# These are the raw 'Language' values, so missing values and spelling/casing
# variants of the same label show up as separate entries.
languages = megacatalogue['Language'].unique()
languages

array([nan, 'Akkadian', 'Bilingual', 'bilingual', 'Bilingual?',
       'Sumerian', ';', 'Sumerian; Akkadian', 'Assyrian;Assyrian',
       'Aramaic', 'Sumerian?', 'Luwian', 'Akkadian;Aramaic',
       'Sumerian;Akkadian', ';;'], dtype=object)

In [None]:
# Build one raw dataframe per distinct value of the 'Language' column
# (15 in total, counting missing values). Variants of the same label are
# combined into larger categories in a later cell.
lang_col = megacatalogue['Language']

# Missing or placeholder values
nans = megacatalogue[lang_col.isna()]
semicolon = megacatalogue[lang_col == ';']
# This line used to read `semicolon2 = aramaic = ...`, which also bound
# `aramaic` to the ';;' rows until it was reassigned further down.
semicolon2 = megacatalogue[lang_col == ';;']

# Bilingual, in its three spellings
bil = megacatalogue[lang_col == 'Bilingual']
bil2 = megacatalogue[lang_col == 'bilingual']
bil_q = megacatalogue[lang_col == 'Bilingual?']

# Sumerian (certain and uncertain)
sumer = megacatalogue[lang_col == 'Sumerian']
sumer_q = megacatalogue[lang_col == 'Sumerian?']

# Sumerian + Akkadian, with and without a space after the semicolon
sumer_akk = megacatalogue[lang_col == 'Sumerian; Akkadian']
sumer_akk2 = megacatalogue[lang_col == 'Sumerian;Akkadian']

# Akkadian
akk = megacatalogue[lang_col == 'Akkadian']

# Remaining labels
akk_aramaic = megacatalogue[lang_col == 'Akkadian;Aramaic']
aramaic = megacatalogue[lang_col == 'Aramaic']
assyrian_assyrian = megacatalogue[lang_col == 'Assyrian;Assyrian']
# NOTE: the variable name `luwain` (sic) is kept because a later cell refers to it.
luwain = megacatalogue[lang_col == 'Luwian']



In [None]:
#uncomment any line to display any of the raw dfs
#nans
#semicolon
#semicolon2
#bil
#bil2
#bil_q
#sumer
#sumer_q
#sumer_akk
#sumer_akk2
#akk
#akk_aramaic
#aramaic
#assyrian_assyrian
#luwain

## Merging the Raw dfs into Larger Language dfs
I have separated the megacatalogue into the following categories:

1) Unspecified: [nan, ';' , ';;']

2) Bilingual: ['Bilingual', 'bilingual', 'Bilingual?']

3) Sumerian: ['Sumerian', 'Sumerian?']

4) Sumerian_Akkadian: ['Sumerian; Akkadian', 'Sumerian;Akkadian']

5) Akkadian: ['Akkadian']

6) Others: ['Assyrian;Assyrian', 'Aramaic', 'Luwian', 'Akkadian;Aramaic']

The following code will concatenate the associated dataframes into their category and create a new column 'Category' in each df with a label for their category. The original 'Language' column is left untouched, so each row keeps its original language specification.

Projects in multiple categories means that the texts in that project have different languages and/or have missing language specification. We can specify which texts by probing into the dataframe and identifying the unique "id_text"s.

In [None]:
# Combine the raw per-label frames into six category frames and tag every row
# with its category in a new 'Category' column. The 'Language' column is not
# modified, so the original label of each row is still available.
#
# NOTE: `akk` here must be the megacatalogue subset. A later cell rebinds `akk`
# to rows of the word-level dataframe, so re-running this cell after that one
# would build `akkadian` from the wrong data.
unspecified = pd.concat([nans, semicolon, semicolon2])
unspecified['Category'] = 'unspecified'

bilingual = pd.concat([bil, bil2, bil_q])
bilingual['Category'] = 'Bilingual'

sumerian = pd.concat([sumer, sumer_q])
sumerian['Category'] = 'Sumerian'

sumerian_akkadian = pd.concat([sumer_akk, sumer_akk2])
# Label corrected from the misspelled 'Summerian_Akkadian'.
sumerian_akkadian['Category'] = 'Sumerian_Akkadian'

# Copy so that adding 'Category' does not write into a slice of megacatalogue.
akkadian = akk.copy()
akkadian['Category'] = 'Akkadian'

others = pd.concat([assyrian_assyrian, aramaic, luwain, akk_aramaic])
others['Category'] = 'others'

In [None]:
#The projects associated in each category can be seen at a glance here
print(unspecified['project_name'].unique())
print(bilingual['project_name'].unique())
print(sumerian['project_name'].unique())
print(sumerian_akkadian['project_name'].unique())
print(akkadian['project_name'].unique())
print(others['project_name'].unique())

['atae/tilbarsip' 'aemw/amarna' 'adsd/adart1' 'adsd/adart2' 'adsd/adart3'
 'aemw/alalakh/idrimi' 'saao/saa03' 'atae/assurmisc' 'saao/saa20'
 'saao/saas2' 'saao/saa12' 'saao/saa13' 'saao/saa02' 'atae/stat2'
 'atae/stat1' 'atae/wvdog152' 'atae/stat3' 'atae/rfdn17' 'atae/saab0509'
 'atae/tuszhan' 'saao/saa14' 'saao/saa18' 'saao/saa21' 'saao/saa08'
 'saao/saa17' 'saao/saa04' 'saao/saa10' 'saao/saa15' 'saao/saa06'
 'saao/saa11' 'saao/saa16' 'saao/saa07' 'saao/saa01' 'saao/saa05'
 'saao/saa09' 'asbp' 'dcclt/nineveh' 'caspo' 'dcclt/signlists' 'btto'
 'blms' 'eisl' 'ccpo' 'cams/barutu' 'dccmt' 'dcclt' 'asbp/ninmed' 'glass'
 'cdli' 'rinap/sources' 'cams/ludlul' 'cams/etana' 'caspo/akkpm'
 'adsd/adart6' 'atae/szibaniba' 'atae/marqasu' 'akklove' 'atae/ctn2'
 'atae/ctn3' 'atae/guzana' 'adsd/adart5' 'atae/imgurenlil'
 'atae/durkatlimmu' 'saao/saa19' 'atae/ctn1' 'atae/mallanate' 'atae/ctn6'
 'atae/huzirina' 'atae/edubba10' 'atae/burmarina' 'atae/samal'
 'asbp/rlasb' 'ario' 'armep' 'dcclt/ebla' 'obmc

# Splitting the Finaldf by Language and Word

Note that the finaldf does not include the name of the project a word or text came from. Alongside this, it is likely that every project/subproject has texts in more than one language. Therefore, whether it would be useful to label a project with its languages can be contested.

In [None]:
# Read the word-level dataframe (finaldf) from the mounted Drive folder.
path = '/content/drive/MyDrive/Melinee/ORACC_DFS/finaldf.csv'
final = pd.read_csv(
    path,
    index_col=False,   # do not use the first column as the index
    low_memory=False,  # parse the file in one pass instead of in chunks
)

In [None]:
# Drop columns that are entirely null, then the leftover 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'Unnamed: 0' has already been removed.
final = final.dropna(axis='columns', how='all')
final = final.drop(columns=['Unnamed: 0'], errors='ignore')
print(final.shape)
final.head(3)

(7271149, 22)


Unnamed: 0,lang,form,id_word,label,id_text,gdl,pos,cf,gw,sense,...,headform,contrefs,norm0,base,morph,stem,cont,syntax_ub-after,morph2,aform
0,arc,mmxx,P522613.2.1,o 1,P522613,,,,,,...,,,,,,,,,,
1,arc,t,P522613.2.2,o 1,P522613,,,,,,...,,,,,,,,,,
2,arc,rmyt,P522613.2.3,o 1,P522613,,,,,,...,,,,,,,,,,


In [None]:
# Every distinct language code in the word-level dataframe, in sorted order.
languages = np.sort(final['lang'].unique())
languages

array(['akk', 'akk-935', 'akk-936', 'akk-949', 'akk-x-earakk',
       'akk-x-ltebab', 'akk-x-mbperi', 'akk-x-mbperi-949', 'akk-x-midass',
       'akk-x-midbab', 'akk-x-neoass', 'akk-x-neobab', 'akk-x-neobab-949',
       'akk-x-oldakk', 'akk-x-oldass', 'akk-x-oldbab', 'akk-x-oldbab-949',
       'akk-x-stdbab', 'akk-x-stdbab-949', 'arc', 'arc-949', 'egy-020',
       'elx', 'grc', 'hit', 'hit-946', 'hlu', 'peo', 'qca', 'qcu',
       'qcu-949', 'qeb', 'qpc', 'qur', 'sux', 'sux-947', 'sux-x-emesal',
       'sux-x-gloss', 'sux-x-syll', 'uga', 'uga-040', 'xhu', 'xhu-946',
       'xur', 'xur-944', 'xur-946'], dtype=object)

In [None]:
# Build one dataframe per language code found in final['lang'].
# Each variable is named after its code with '-' replaced by '_'; the filter
# string itself must be the code exactly as it appears in the data (with hyphens).
#
# NOTE: `akk` is rebound here to rows of `final`; before this cell it referred
# to the Akkadian subset of the megacatalogue.
word_lang = final['lang']

# Akkadian and its dialect/period codes
akk = final[word_lang == 'akk']
akk_935 = final[word_lang == 'akk-935']
akk_936 = final[word_lang == 'akk-936']
akk_949 = final[word_lang == 'akk-949']
akk_x_earakk = final[word_lang == 'akk-x-earakk']
akk_x_ltebab = final[word_lang == 'akk-x-ltebab']
akk_x_mbperi = final[word_lang == 'akk-x-mbperi']
akk_x_mbperi_949 = final[word_lang == 'akk-x-mbperi-949']
akk_x_midass = final[word_lang == 'akk-x-midass']
akk_x_midbab = final[word_lang == 'akk-x-midbab']
akk_x_neoass = final[word_lang == 'akk-x-neoass']
akk_x_neobab = final[word_lang == 'akk-x-neobab']
akk_x_neobab_949 = final[word_lang == 'akk-x-neobab-949']
# Fixed: this used to filter on 'akk-x-oldass', which duplicated the Old
# Assyrian rows and left the 'akk-x-oldakk' rows out entirely.
akk_x_oldakk = final[word_lang == 'akk-x-oldakk']
akk_x_oldass = final[word_lang == 'akk-x-oldass']
akk_x_oldbab = final[word_lang == 'akk-x-oldbab']
akk_x_oldbab_949 = final[word_lang == 'akk-x-oldbab-949']
akk_x_stdbab = final[word_lang == 'akk-x-stdbab']
akk_x_stdbab_949 = final[word_lang == 'akk-x-stdbab-949']

arc = final[word_lang == 'arc']
arc_949 = final[word_lang == 'arc-949']

egy_020 = final[word_lang == 'egy-020']
elx = final[word_lang == 'elx']
grc = final[word_lang == 'grc']

hit = final[word_lang == 'hit']
hit_946 = final[word_lang == 'hit-946']

hlu = final[word_lang == 'hlu']
peo = final[word_lang == 'peo']
qca = final[word_lang == 'qca']

qcu = final[word_lang == 'qcu']
qcu_949 = final[word_lang == 'qcu-949']

qeb = final[word_lang == 'qeb']
qpc = final[word_lang == 'qpc']
qur = final[word_lang == 'qur']

# Sumerian and its variant codes
sux = final[word_lang == 'sux']
sux_947 = final[word_lang == 'sux-947']
sux_x_emesal = final[word_lang == 'sux-x-emesal']
sux_x_gloss = final[word_lang == 'sux-x-gloss']
sux_x_syll = final[word_lang == 'sux-x-syll']

uga = final[word_lang == 'uga']
uga_040 = final[word_lang == 'uga-040']

xhu = final[word_lang == 'xhu']
xhu_946 = final[word_lang == 'xhu-946']

xur = final[word_lang == 'xur']
# Fixed: these two used 'xur_944' / 'xur_946' (underscores), which match no
# rows; the codes in the data are hyphenated.
xur_944 = final[word_lang == 'xur-944']
xur_946 = final[word_lang == 'xur-946']

In [None]:
# Inspect the rows whose 'lang' value is exactly 'akk' (dialect codes excluded)
akk

Unnamed: 0,lang,form,id_word,label,id_text,gdl,pos,cf,gw,sense,...,headform,contrefs,norm0,base,morph,stem,cont,syntax_ub-after,morph2,aform
19374,akk,x,X201711.2.1,o 1',X201711,"[{'x': 'ellipsis', 'id': 'X201711.2.1.0', 'bre...",u,,,,...,,,,,,,,,,
19375,akk,x,X201711.2.2,o 1',X201711,"[{'x': 'ellipsis', 'id': 'X201711.2.2.0', 'bre...",u,,,,...,,,,,,,,,,
19376,akk,x,X201711.3.1,o 2',X201711,"[{'x': 'ellipsis', 'id': 'X201711.3.1.0', 'bre...",u,,,,...,,,,,,,,,,
19377,akk,DIR,X201711.3.2,o 2',X201711,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,erpetu,cloud,cloud,...,,,,,,,,,,
19378,akk,AN,X201711.3.3,o 2',X201711,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,šamû,sky,sky,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7127369,akk,ni-nu-u₂-a,P503119.8.2,o 6,P503119,"[{'v': 'ni', 'id': 'P503119.8.2.0', 'break': '...",X,,,,...,,,,,,,,,,
7130316,akk,a-na,P504259.8.2,o 6,P504259,"[{'v': 'a', 'id': 'P504259.8.2.0', 'delim': '-...",X,,,,...,,,,,,,,,,
7130317,akk,erin₂,P504259.8.3,o 6,P504259,"[{'v': 'erin₂', 'id': 'P504259.8.3.0'}]",X,,,,...,,,,,,,,,,
7130318,akk,dam-qum,P504259.8.4,o 6,P504259,"[{'v': 'dam', 'id': 'P504259.8.4.0', 'delim': ...",X,,,,...,,,,,,,,,,


Next steps include:

1) merging some of these dfs into bigger dfs (e.g. all Akkadian related texts in a df called Akkadian)

2) exporting these dfs as csv files.

In [None]:
#merge all akkadian texts
# Stack every Akkadian-coded word dataframe (plain 'akk' first, then the
# numbered and dialect variants) into a single dataframe with a fresh 0..n-1 index.
akk_dfs = [
    akk,
    akk_935,
    akk_936,
    akk_949,
    akk_x_earakk,
    akk_x_ltebab,
    akk_x_mbperi,
    akk_x_mbperi_949,
    akk_x_midass,
    akk_x_midbab,
    akk_x_neoass,
    akk_x_neobab,
    akk_x_neobab_949,
    akk_x_oldakk,
    akk_x_oldass,
    akk_x_oldbab,
    akk_x_oldbab_949,
    akk_x_stdbab,
    akk_x_stdbab_949,
]

akkadians = pd.concat(akk_dfs, ignore_index=True)
akkadians

Unnamed: 0,lang,form,id_word,label,id_text,gdl,pos,cf,gw,sense,...,headform,contrefs,norm0,base,morph,stem,cont,syntax_ub-after,morph2,aform
0,akk,x,X201711.2.1,o 1',X201711,"[{'x': 'ellipsis', 'id': 'X201711.2.1.0', 'bre...",u,,,,...,,,,,,,,,,
1,akk,x,X201711.2.2,o 1',X201711,"[{'x': 'ellipsis', 'id': 'X201711.2.2.0', 'bre...",u,,,,...,,,,,,,,,,
2,akk,x,X201711.3.1,o 2',X201711,"[{'x': 'ellipsis', 'id': 'X201711.3.1.0', 'bre...",u,,,,...,,,,,,,,,,
3,akk,DIR,X201711.3.2,o 2',X201711,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,erpetu,cloud,cloud,...,,,,,,,,,,
4,akk,AN,X201711.3.3,o 2',X201711,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,šamû,sky,sky,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2201918,akk-x-stdbab-949,*,P247829.11.2,o 7',P247829,,N,kipkippu,(a bird of prey),(a bird of prey),...,,,,,,,,,,
2201919,akk-x-stdbab-949,*,P247829.12.2,o 8',P247829,,N,kurkurru,(a bird),(a bird),...,,,,,,,,,,
2201920,akk-x-stdbab-949,*,P247829.13.2,o 9',P247829,,N,terterru,(a bird),(a bird),...,,,,,,,,,,
2201921,akk-x-stdbab-949,*,P247829.14.2,o 10',P247829,,N,dubdubbu,(a bird),(a bird),...,,,,,,,,,,


In [None]:
#merge all sumerian texts
# Stack every Sumerian-coded word dataframe into a single dataframe with a
# fresh 0..n-1 index.
sux_dfs = [
    sux,
    sux_947,
    sux_x_emesal,
    sux_x_gloss,
    sux_x_syll,
]
sumerians = pd.concat(sux_dfs, ignore_index=True)
sumerians

Unnamed: 0,lang,form,id_word,label,id_text,gdl,pos,cf,gw,sense,...,headform,contrefs,norm0,base,morph,stem,cont,syntax_ub-after,morph2,aform
0,sux,na.re.eš,P395047.14.2,o i 9,P395047,"[{'gg': 'group', 'gdl_type': 'group', 'group':...",X,,,,...,,,,,,,,,,
1,sux,mi.in.de.eš,P395047.14.3,o i 9,P395047,"[{'gg': 'group', 'gdl_type': 'group', 'group':...",X,,,,...,,,,,,,,,,
2,sux,na.re.eš,P395047.14.4,o i 9,P395047,"[{'gg': 'group', 'gdl_type': 'group', 'group':...",X,,,,...,,,,,,,,,,
3,sux,hu.ul.pa.ak,P395047.14.5,o i 9,P395047,"[{'gg': 'group', 'gdl_type': 'group', 'group':...",X,,,,...,,,,,,,,,,
4,sux,x,P395047.14.6,o i 9,P395047,"[{'v': 'x', 'id': 'P395047.14.6.0', 'breakStar...",u,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5007881,sux-x-syll,zu-ʾa₃-ar-ša,P240975.63.1,o v 4,P240975,"[{'v': 'zu', 'id': 'P240975.63.1.0', 'delim': ...",,,,,...,,,,,,,,,,
5007882,sux-x-syll,ga-a-gi,P240975.68.1,o v 9,P240975,"[{'v': 'ga', 'id': 'P240975.68.1.0', 'delim': ...",,,,,...,,,,,,,,,,
5007883,sux-x-syll,šu-gur-šu-gur,P240975.71.1,o v 12,P240975,"[{'v': 'šu', 'id': 'P240975.71.1.0', 'delim': ...",,,,,...,,,,,,,,,,
5007884,sux-x-syll,ga-us₂,P240975.74.1,o v 15,P240975,"[{'v': 'ga', 'id': 'P240975.74.1.0', 'delim': ...",,,,,...,,,,,,,,,,


In [None]:
#exports akkadian words to the folder LANGUAGE_CSVS
# Export the merged language dataframes to the LANGUAGE_CSVS folder on Drive.
# to_csv does not create missing directories, so make sure the folder exists first.
export_dir = folder + 'LANGUAGE_CSVS/'
os.makedirs(export_dir, exist_ok=True)

# NOTE: the dataframe index is written as an unnamed first column (pandas default).

#exports akkadian words to the folder LANGUAGE_CSVS
akkadians.to_csv(export_dir + 'akkadians.csv')

#exports sumerian words to the folder LANGUAGE_CSVS
sumerians.to_csv(export_dir + 'sumerians.csv')