In [1]:
import json
import codecs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
#pd.options.display.max_rows = 50
#pd.reset_option('display')

<h2>Gathering the Data</h2>
<p>In this initial section, we will gather all of the cuneiform sign transliterations from the JSON files in our dataset. Then we will consolidate them into a data frame and add other columns to help our computations.</p>

<p>First, we will load a map from sign value to sign name to use on the signs in our texts. The OGSL is... (website...)</p>

In [2]:
file_ogsl = codecs.open('ogsl-sl.json','r','utf-8')
ogsl = json.load(file_ogsl)
sign_index = ogsl['index']

<h3>Collect the Text Signs</h3>
<p>The following code parses the JSON files of the ORACC texts and collects each sign transliteration. Since different signs have different types of reading, they are rendered differently in the JSON file and we must take care to recognize each sign reading type in the JSON file</p>
The types of signs and their representation in the JSON Files:
<ol>
    <li>Syllable - The reading of a sign as a syllable is rendered with a 'v' key</li>
    <li>Logogram - The reading of a sign as a logogram, i.e. one represents a word in itself or as part of a complex of signs that represents a single word is written in capital letters and with a 's' key</li>
    <li>Numerical - A sign representing a number (or personal name determinative) has an extra key called 'sexified'. This gives information on the number sign's wedge structure.</li>
</ol>

In addition, a modified sign can be any of the three types above, but written with a nonstandard paleography (e.g. a diagonal wedge is incised in the clay instead of a horizontal). These are the signs we want to examine. They have extra data given under the 'mods' key.

In [3]:
def process_signs(sign_data):
    sign_info = {}
    if 'v' in sign_data: #This is the label for a standard syllable
        sign_info['b'] = sign_data['v']
    if 's' in sign_data: #This is the label for elements of a logogram
        sign_info['b'] = sign_data['s']
    if 'n' in sign_data:
        sign_info['b'] = sign_data.get('sexified',sign_data.get('form','noform?'))
    if 'mods' in sign_data:
        for m in sign_data['mods']:
            for d in m:
                sign_info[d] = m[d]
    if 'break' in sign_data:
        sign_info['break'] = sign_data['break']
    sign_info['sign_loc_id'] = sign_data.get('id','no-id')
    return sign_info    

In [4]:
types = set()
all_signs = []
all_words = []
for fname in os.listdir('sargonletters/corpusjson'):
    f = codecs.open('sargonletters/corpusjson/'+fname,'r','utf-8')
    try:
        j = json.load(f)
    except ValueError:
        print('Could not load: ' + fname)
        continue
    text_id = j['textid']
    for a in j['cdl'][0]['cdl']:
        if a.get('type','') == 'discourse':
            for b in a['cdl']:
                if b.get('type','') == 'sentence':
                    line_label = ''                    
                    for c in b['cdl']:
                        if c.get('node','') == 'd': #This is the label for the line e.g. "o ii 3"
                            line_label = c.get('label','nolabel')
                        if c.get('node','') == 'l': #This is the label for a regular word in a line
                            if c.get('tail-sig','') != '': #An extra word??
                                continue
                            form = c['f']['form']
                            frag = c['frag']
                            ref  = c['ref']
                            cf   = c['f'].get('cf','no-cf')
                            gw   = c['f'].get('gw','no-gw')
                            pos  = c['f']['pos']
                            sense = c['f'].get('sense','no-sense')
                            norm = c['f'].get('norm','no-norm')
                            epos = c['f'].get('epos','no-epos')
                            word_sign_tot = len(c['f']['gdl'])
                            word_info = {'file':fname,'line_label':line_label,'form': form,'frag': frag, 'text_id': text_id, 'ref': ref,'cf': cf,'gw': gw,'pos': pos,'epos':epos,'sense':sense,'word_sign_tot':word_sign_tot,'norm':norm}
                            all_words.append(word_info)
                            for sign_data in c['f']['gdl']:
                                if sign_data.get('det','') == 'semantic':
                                    for sd in sign_data['seq']:
                                        if sd.get('gg','') == 'logo':
                                            for g in sd['group']:
                                                sign_info = process_signs(g)
                                                sign_info.update(word_info)
                                                all_signs.append(sign_info)
                                        else:
                                            sign_info = process_signs(sd)
                                            sign_info.update(word_info)
                                            all_signs.append(sign_info)
                                elif sign_data.get('gg','') == 'logo':
                                    for g in sign_data['group']:
                                        if g.get('det','') == 'semantic':
                                            for sd in g['seq']:
                                                if sd.get('gg','') == 'logo':
                                                    for gg in sd['group']:
                                                        sign_info = process_signs(gg)
                                                        sign_info.update(word_info)
                                                        all_signs.append(sign_info)                                       
                                                else:
                                                    sign_info = process_signs(sd)
                                                    sign_info.update(word_info)
                                                    all_signs.append(sign_info)                                       
                                        else:
                                            sign_info = process_signs(g)
                                            sign_info.update(word_info)
                                            all_signs.append(sign_info)                                        
                                else:
                                    sign_info = process_signs(sign_data)
                                    sign_info.update(word_info)
                                    all_signs.append(sign_info)
                        '''
                        if c.get('node','') == 'c': #This is the label for a phrase. This seems to no longer be used 
                            for d in c['cdl']:
                                if d.get('node','') == 'l':
                                    form = d['f']['form']
                                    for sign_data in d['f']['gdl']:
                                        if sign_data.get('det','') == 'semantic':
                                            for sd in sign_data['seq']:
                                                sign_info = process_signs(sd)
                                                sign_info.update({'file':fname,'line_label':line_label,'form': form})
                                                all_signs.append(sign_info)
                                        else:
                                            sign_info = process_signs(sign_data)
                                            sign_info.update({'file':fname,'line_label':line_label,'form': form})
                                            all_signs.append(sign_info)
                        '''
                        #types.add(c.get('type','no type'))
                        
print('done')

Could not load: P314095.json
done


Now, we form our Data Frame where each row contains information on every sign in the corpus. Further limitations on which signs are significant to our purposes will be made later, but for now we will eliminate all of the signs which are labelled as "missing," (i.e. reconstructed) because any information based on their paleography or orthography cannot be ascertained.

In [5]:
df = pd.DataFrame(all_signs)
df = df.fillna('')
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,line_label,m,norm,pos,ref,sense,sign_loc_id,text_id,word_sign_tot
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.0,P224485,2
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.1,P224485,2
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,o 1,,šarri,N,P224485.2.2,king,P224485.2.2.0,P224485,1
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.0,P224485,2
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.1,P224485,2
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.0,P224485,4
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.1,P224485,4
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.2,P224485,4
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.3,P224485,4
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.4,P224485,4


<h3>Setting Up the Data for Clustering</h3>
<p>The general goal is to assign a vector to each text that reflects the usage of variant orthography and paleography.</p>
<ol>
    <li>Paleography - Any one set of wedges that we classify as a sign can be impressed on the clay in different ways. For example, a wedge can be missing or one can be added. Also, the tilt of a wedge can variable. These are the features we want to examine in order to see if one text prefers one sign writing or another.</li>
    <li>Orthography - Due to the homophony of the cuneiform writing system, one syllable can be written with many signs. For example, 'li' can be written with the LI-sign but also with the NI-sign, in which case it would be transliterated as li<sub>2</sub></li>
    <li>Orthography (Words) - In addition to syllables being written by variant sign forms, a word (with appropriate inflection) can be written in different ways similar to how "color" is written "colour" in British English. These two variant orthographies indicate the same word but their variation might indicate something about the author.</li>
</ol>

<p>This section therefore contains three subsections. One groups the diagnostic signs with or without modifications per text to. The next discovers the homophonous signs used throughout the corpus and groups different usages per text. The third groups the word forms in the corpus.</p>

First of all, let's create more columns in the data frame to aid us
<ol>
<li>sign_form - Using the OGSL mapping we created earlier, we can assign each reading of a sign to its sign form. This will help us combine different readings under the same sign form to help us mark variations in paleography</li>
<li>mods_str - Since the data contains three columns currently with information on variable paleography, it would help us to consolidate them into one column</li>
<li>combined - This column combines sign_form and mods_str into one string similar to how the <i>b</i> column is a combination of str_part and num_part</li>
<li>str_part and num_part - In order to determine which signs share a syllabic value, it will be useful to separate the transliterated readings into their string components and numerical components. Once we do this, we can group rows with the same str_part and count up the different usages of homophonous signs</li>
</ol>

In [6]:
file_names = df['file'].unique()
df['sign_form'] = df['b'].apply(lambda x: sign_index.get(x.lower(),'?'))
df['mods_str'] = df['a'] + '.' + df['f']  + '.' + df['m']

import re
def get_num_part(s):
    try:
        n = re.findall(r'[₀₁₂₃₄₅₆₇₈₉]+',s)[0]
        n = n.replace('₀','0').replace('₁','1').replace('₂','2').replace('₃','3').replace('₄','4')
        n = n.replace('₅','5').replace('₆','6').replace('₇','7').replace('₈','8').replace('₉','9')
    except:
        n = 1
    return n
def get_str_part(s):
    try:
        n = re.findall(r'[a-zA-ZšŠṣṢṭṬʾ \(\)0-9]+',s)[0]
    except:
        n = s
    return n
        
df['str_part'] = df['b'].apply(lambda x: get_str_part(x))
df['num_part'] = df['b'].apply(lambda x: get_num_part(x))
df['combined'] = df['sign_form'] + ':' + df['mods_str']
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.0,P224485,2,A,..,a,1,A:..
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.1,P224485,2,BAD,..,bat,1,BAD:..
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,...,P224485.2.2,king,P224485.2.2.0,P224485,1,LUGAL,..,LUGAL,1,LUGAL:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.1,P224485,2,,.t.,na,1,NA:.t.
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.0,P224485,4,DIŠ,..,1(diš),1,DIŠ:..
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.1,P224485,4,AŠ,..,aš,1,AŠ:..
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.2,P224485,4,SUR,..,šur,1,SUR:..
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.3,P224485,4,|U.U|,..,MAN,1,|U.U|:..
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.4,P224485,4,PAP,..,PAB,1,PAP:..


<h3>Paleography Setup</h3>
<p>The Data Frame we have contains the entire collection of signs in the corpus. However, not every sign has variants in paleography (at least according to Parpola's data input). We only want to look at the signs which have these variants, which we will term diagnostic. In the data, they are the signs that include any type of modification</p>

In [7]:
df2 = df[~(df['mods_str'] == '..')]
list_mod_signs = sorted(list(df2['sign_form'].unique()))

df_paleo = df[df['sign_form'].isin(list_mod_signs)]

#remove damaged signs too
df_paleo = df_paleo[df_paleo['break'] != 'damaged']
df_paleo

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.1,P224485,2,BAD,..,bat,1,BAD:..
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,...,P224485.2.2,king,P224485.2.2.0,P224485,1,LUGAL,..,LUGAL,1,LUGAL:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.1,P224485,2,,.t.,na,1,NA:.t.
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.2,P224485,4,SUR,..,šur,1,SUR:..
11,,mu,,šulmu,N,,P224485.json,šul-mu,šul⸣-mu,completeness,...,P224485.2.5,health,P224485.2.5.1,P224485,2,MU,..,mu,1,MU:..
12,,ia,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.0,P224485,3,|I.A|,..,ia,1,|I.A|:..
14,,ši,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.2,P224485,3,IGI,..,ši,1,IGI:..
16,,mu,,šulmu,N,,P224485.json,šul-mu,⸢šul⸣-mu,completeness,...,P224485.3.1,health,P224485.3.1.1,P224485,2,MU,..,mu,1,MU:..
17,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.3.2,to,P224485.3.2.0,P224485,2,A,..,a,1,A:..


Create the text matrix with raw counts

In [8]:
df_paleo_str = pd.DataFrame(df_paleo.groupby(['text_id']).apply(lambda x: ' '.join(x['combined'])))
df_paleo_str.columns = ['paleo_str']
df_paleo_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_paleo_str['paleo_str']))
tm_paleo = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_paleo_str.index)
tm_paleo

Unnamed: 0_level_0,A:..,A:.d.,AB@g:..,AB@g:.m.,AB@g:.p.,AB₂:..,AB₂:.d.,AK:..,AK:.d.,AK:.dt.,...,ŠIM:..,ŠIM:.d.,ŠIM:.p.,ŠIM:.t.,ŠU:..,ŠU:.d.,ŠU:.m.,ŠU₂:..,ŠU₂:.d.,ŠU₂:.t.
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,104,0,1,0,0,1,0,1,0,0,...,3,0,0,2,4,0,0,28,0,0
P237089,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
P238649,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P313416,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
P313417,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8,0,0,0,0,0
P313419,16,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,0,0
P313420,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
P313421,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
P313422,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
P313425,47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,1,0,0,0


<h3>Orthography Syllable Setup</h3>
<p>We now limit the original data frame in different way based on orthography. First we need to figure out which syllabic readings have multiple signs that can render them. We then eliminate capital letter entries because indices on logograms indicate different words and are not relevant here. Last, we limit the data frame to only these signs.

In [9]:
df2 = pd.DataFrame(df.groupby(['str_part'])['num_part'].agg('nunique'))
list_ortho_syls = list(df2[df2[('num_part')] > 1].index)

list_ortho_syls = [h for h in list_ortho_syls if len(re.findall(r'[A-Z]',h)) == 0]
list_ortho_syls

df_ortho_signs = df[df['str_part'].isin(list_ortho_syls)]
df_ortho_signs

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.0,P224485,2,A,..,a,1,A:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.1,P224485,4,AŠ,..,aš,1,AŠ:..
12,,ia,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.0,P224485,3,|I.A|,..,ia,1,|I.A|:..
13,,a,damaged,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.1,P224485,3,A,..,a,1,A:..
17,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.3.2,to,P224485.3.2.0,P224485,2,A,..,a,1,A:..
20,,aš,,Mat-Aššur,GN,,P224485.json,KUR-aš-šur{KI},KUR-aš-šur{ki},Assyria,...,P224485.3.3,Assyria,P224485.3.3.1,P224485,4,AŠ,..,aš,1,AŠ:..
24,,ka,missing,libbu,N,,P224485.json,ŠA₃-ka,⸢ŠA₃⸣-[ka],interior,...,P224485.3.4,mood,P224485.3.4.1,P224485,2,KA,..,ka,1,KA:..
28,,ka,,ṭābu,AJ,,P224485.json,DUG₃.GA-ka,DUG₃.GA-ka,good,...,P224485.3.6,good,P224485.3.6.2,P224485,2,KA,..,ka,1,KA:..
29,,ša,damaged,ša,REL,,P224485.json,ša,⸢ša⸣,that,...,P224485.4.1,what,P224485.4.1.0,P224485,1,ŠA,..,ša,1,ŠA:..


Create the text matrix with raw counts

In [10]:
df_ortho_str = pd.DataFrame(df_ortho_signs.groupby(['text_id']).apply(lambda x: ' '.join(x['b'])))
df_ortho_str.columns = ['ortho_str']
df_ortho_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_ortho_str['ortho_str']))
tm_ortho_sign = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_ortho_str.index)
tm_ortho_sign

Unnamed: 0_level_0,a,ana,ana₃,ar,ar₂,aš,aš₂,a₂,be,be₂,...,ša₂,šu,šum,šum₂,šu₂,ṭe,ṭe₂,ṭe₃,ṭi,ṭi₂
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,103,2,0,6,0,5,8,0,7,2,...,5,4,1,0,33,0,0,1,0,0
P237089,5,0,0,0,0,0,0,0,2,0,...,0,1,0,0,0,0,0,0,0,0
P238649,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,0,0
P313416,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
P313417,36,0,0,0,0,1,2,0,0,0,...,0,8,0,0,0,1,0,1,1,0
P313419,16,0,0,0,0,0,0,0,0,0,...,1,0,0,0,2,0,0,1,0,1
P313420,14,1,0,1,0,1,0,0,6,0,...,0,0,0,0,3,0,0,1,0,0
P313421,7,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
P313422,11,1,0,2,0,1,1,0,1,1,...,0,0,0,0,4,0,0,0,0,0
P313425,51,0,0,5,0,0,1,0,5,0,...,3,3,1,1,0,0,0,0,0,0


<h3>Orthography Word Setup</h3>
<p>The setup for this section is slightly different. Instead of counting up every sign or reading, we count up every word, its normalization and its form. To help us out we will create two new columns <i>lemma_norm</i> and <i>lemma_norm_form</i>. The first combines the lemma and the normalized form. The latter attaches the transliteration to the former. We only want to include lemma/norm combinations that have multiple forms associated with them.

In [11]:
df_words = pd.DataFrame(all_words)
df_words = df_words[(df_words['cf'] != 'no-cf')]

df_words['lemma'] = df_words['cf'] + '[' + df_words['gw'] + ']' + df_words['pos']
df_words['lemma_norm'] = df_words['lemma'] + ':' + df_words['norm']
df_words['lemma_norm_form'] = df_words['lemma_norm'] + ':' + df_words['form']

df_norm_uniq = pd.DataFrame(df_words.groupby('lemma_norm')['form'].nunique())
list_ortho_words = list(df_norm_uniq[df_norm_uniq[('form')] > 1].index)

df_form_50 = pd.DataFrame(df_words.groupby('lemma_norm_form')['form'].agg('count'))
list_form_50 = list(df_form_50[df_form_50[('form')] > 50].index)

df_ortho_words = df_words[(df_words['lemma_norm'].isin(list_ortho_words)) & df_words['lemma_norm_form'].isin(list_form_50)]
df_ortho_words

Unnamed: 0,cf,epos,file,form,frag,gw,line_label,norm,pos,ref,sense,text_id,word_sign_tot,lemma,lemma_norm,lemma_norm_form
1,šarru,N,P224485.json,LUGAL,LUGAL,king,o 1,šarri,N,P224485.2.2,king,P224485,1,šarru[king]N,šarru[king]N:šarri,šarru[king]N:šarri:LUGAL
2,ana,PRP,P224485.json,a-na,a-na\t,to,o 1,ana,PRP,P224485.2.3,to,P224485,2,ana[to]PRP,ana[to]PRP:ana,ana[to]PRP:ana:a-na
4,šulmu,N,P224485.json,šul-mu,šul⸣-mu,completeness,o 1,šulmu,N,P224485.2.5,health,P224485,2,šulmu[completeness]N,šulmu[completeness]N:šulmu,šulmu[completeness]N:šulmu:šul-mu
6,šulmu,N,P224485.json,šul-mu,⸢šul⸣-mu,completeness,o 2,šulmu,N,P224485.3.1,health,P224485,2,šulmu[completeness]N,šulmu[completeness]N:šulmu,šulmu[completeness]N:šulmu:šul-mu
7,ana,PRP,P224485.json,a-na,a-na\t,to,o 2,ana,PRP,P224485.3.2,to,P224485,2,ana[to]PRP,ana[to]PRP:ana,ana[to]PRP:ana:a-na
10,lū,MOD,P224485.json,lu,⸢lu⸣,may,o 2,lū,MOD,P224485.3.5,may,P224485,1,lū[may]MOD,lū[may]MOD:lū,lū[may]MOD:lū:lu
12,ša,REL,P224485.json,ša,⸢ša⸣,that,o 3,ša,REL,P224485.4.1,what,P224485,1,ša[that]REL,ša[that]REL:ša,ša[that]REL:ša:ša
14,mā,PRP,P224485.json,ma-a,ma-a,saying,o 3,mā,PRP,P224485.4.3,saying,P224485,2,mā[saying]PRP,mā[saying]PRP:mā,mā[saying]PRP:mā:ma-a
16,ša,DET,P224485.json,ša,[ša],of,o 3,ša,DET,P224485.4.5,of,P224485,1,ša[of]DET,ša[of]DET:ša,ša[of]DET:ša:ša
19,ina,PRP,P224485.json,ina,ina,in,o 4,ina,PRP,P224485.5.2,in,P224485,1,ina[in]PRP,ina[in]PRP:ina,ina[in]PRP:ina:ina


Create the text matrix with raw counts

In [12]:
df_ortho_wordstr = pd.DataFrame(df_ortho_words.groupby(['text_id']).apply(lambda x: ' '.join(x['lemma_norm_form'])))
df_ortho_wordstr.columns = ['ortho_wordstr']
df_ortho_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_ortho_wordstr['ortho_wordstr']))
tm_ortho_word = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_ortho_wordstr.index)
tm_ortho_word.to_csv('output/tm_ortho_word.csv',encoding='utf-8',sep='\t')
tm_ortho_word

Unnamed: 0_level_0,Urarṭaya[Urarṭian]EN:Urarṭaya:{KUR}URI-a.a,adanniš[very,adi[until]PRP:adi:a-di,akī[as]PRP:akī:a-ki,alāku[go]V:ittalka:it-tal-ka,ammar[as,ana[to]PRP:ana:a-na,annûri[now]AV:annurig:an-nu-rig,anāku[I]IP:anāku:a-na-ku,ardu[slave]N:urdaka:ARAD-ka,...,šumma[if]MOD:šumma:šum₂-ma,šumma[if]MOD:šummu:šum₂-mu,šunu[they]IP:šunu:šu-nu,šū[he]IP:šû:šu-u,šū[he]IP:šû:šu-u₂,ūma[today]AV:ūmâ:u₂-ma-a,ṣābu[people]N:ṣābāni:ERIM-MEŠ,ṣābu[people]N:ṣābāni:{LU₂}ERIM-MEŠ,ṭābu[good]AJ:ṭāb:DUG₃.GA,ṭēmu[(fore)thought]N:ṭēmu:ṭe₃-e-mu
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,0,2,0,0,2,1,11,4,0,0,...,0,0,0,0,1,3,3,0,0,0
P237089,0,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,1,0
P238649,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
P313416,0,0,0,0,0,0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313417,0,0,0,0,0,1,7,0,0,1,...,0,0,0,0,0,2,1,0,0,1
P313419,0,0,0,0,0,0,5,1,0,1,...,0,0,0,0,0,0,0,0,0,1
P313420,0,0,1,0,0,0,3,0,0,1,...,0,0,0,0,0,0,1,0,0,1
P313421,0,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313422,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313425,0,1,0,0,0,0,10,0,3,1,...,1,0,1,0,1,2,0,0,0,0


<h3>Selecting the Features</h3>

In [32]:
df_paleo_count = pd.DataFrame(df_paleo.groupby(['sign_form','combined'])['cf'].agg('count'))
df_paleo_count = df_paleo_count[df_paleo_count[('cf')] > 100]

df_paleo_uniq = df_paleo_count.groupby([('sign_form')]).nunique()

df_paleo_count = df_paleo_count.loc[list(df_paleo_uniq[df_paleo_uniq[('cf')] > 1].index)]
df_paleo_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
sign_form,combined,Unnamed: 2_level_1
BA,BA:..,401
BA,BA:.t.,120
BU,BU:..,960
BU,BU:.p.,109
DA,DA:..,434
DA,DA:.d.,201
DI,DI:..,1024
DI,DI:.d.,156
LI,LI:..,465
LI,LI:.d.,147


In [36]:
df_ortho_sign_count = pd.DataFrame(df_ortho_signs.groupby(['str_part','b'])['cf'].agg('count'))
df_ortho_sign_count = df_ortho_sign_count[df_ortho_sign_count[('cf')] > 100]

df_ortho_sign_uniq = df_ortho_sign_count.groupby([('str_part')]).nunique()

df_ortho_sign_count = df_ortho_sign_count.loc[list(df_ortho_sign_uniq[df_ortho_sign_uniq[('cf')] > 1].index)]
df_ortho_sign_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
str_part,b,Unnamed: 2_level_1
aš,aš,392
aš,aš₂,427
ia,ia,1724
ia,ia₂,424
li,li,701
li,li₂,1191
tu,tu,440
tu,tu₂,533
u,u,1503
u,u₂,1843


In [39]:
df_ortho_word_count = pd.DataFrame(df_words.groupby(['lemma_norm','lemma_norm_form'])['cf'].agg('count'))
df_ortho_word_count = df_ortho_word_count[df_ortho_word_count[('cf')] > 50]

df_ortho_word_uniq = df_ortho_word_count.groupby([('lemma_norm')]).nunique()

df_ortho_word_count = df_ortho_word_count.loc[list(df_ortho_word_uniq[df_ortho_word_uniq[('cf')] > 1].index)]
df_ortho_word_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
lemma_norm,lemma_norm_form,Unnamed: 2_level_1
bēlu[lord]N:bēlī,bēlu[lord]N:bēlī:EN,182
bēlu[lord]N:bēlī,bēlu[lord]N:bēlī:be-li₂,642
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:EN-a,76
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:EN-ia,770
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:EN-ia₂,190
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:be-li₂-ia,309
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:be-li₂-ia₂,194
ištu[from]PRP:issu,ištu[from]PRP:issu:TA,113
ištu[from]PRP:issu,ištu[from]PRP:issu:TA@v,346
libbu[interior]N:libbi,libbu[interior]N:libbi:ŠA₃,217
