In [1]:
import json
import codecs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
#pd.options.display.max_rows = 50
#pd.reset_option('display')

<h2>0. Gathering the Data</h2>
<p>In this preliminary section, we will gather all of the cuneiform sign transliterations from the JSON files in our dataset. Then we will consolidate them into a data frame and match each sign value with its sign name</p>

(Do we need to discuss the basics of cuneiform transliteration or is it assumed that our audience is familiar with it?)

<h3>0.1: OGSL</h3>
<p>Now, we will load a map from sign value to sign name to use on the signs in our texts. The OGSL is... (website...)</p>

In [2]:
# Load the OGSL sign list. The `with` block guarantees the file handle is
# closed as soon as the JSON has been parsed (the original left it open).
with codecs.open('ogsl-sl.json','r','utf-8') as file_ogsl:
    ogsl = json.load(file_ogsl)
# Lookup table used below to turn a (lower-cased) sign reading into a sign name.
sign_index = ogsl['index']

<h3>0.2: Collect the Text Signs</h3>
<p>The following code parses the JSON files of the ORACC texts and collects each sign transliteration. Since different signs have different types of reading, they are rendered differently in the JSON file and we must take care to recognize each sign reading type in the JSON file</p>
The types of signs and their representation in the JSON Files:
<ol>
    <li>Syllable - The reading of a sign as a syllable is rendered with a 'v' key</li>
    <li>Logogram - The reading of a sign as a logogram, i.e. one that represents a word in itself or forms part of a complex of signs representing a single word, is written in capital letters and rendered with an 's' key</li>
    <li>Numerical - A sign representing a number (or personal name determinative) has an extra key called 'sexified'. This gives information on the number sign's wedge structure.</li>
</ol>

In addition, a modified sign can be any of the three types above, but written with a nonstandard paleography (e.g. a diagonal wedge is incised in the clay instead of a horizontal). These are the signs we want to examine. They have extra data given under the 'mods' key.

In [3]:
def process_signs(sign_data):
    """Flatten one sign node from the corpus JSON into a plain dict.

    The reading is stored under 'b'. When several reading keys are present the
    later check wins, in the order 'v' (syllable), 's' (logogram element),
    'n' (number; uses 'sexified', then 'form', then the literal 'noform?').
    Every key/value pair found in the entries of 'mods' is copied over as-is,
    'break' is copied when present, and the sign's 'id' (or 'no-id') is stored
    under 'sign_loc_id'.
    """
    sign_info = {}
    for reading_key in ('v', 's'):
        if reading_key in sign_data:
            sign_info['b'] = sign_data[reading_key]
    if 'n' in sign_data:
        fallback = sign_data.get('form', 'noform?')
        sign_info['b'] = sign_data.get('sexified', fallback)
    if 'mods' in sign_data:
        for mod in sign_data['mods']:
            # each entry is a small mapping such as {'f': 't'}
            sign_info.update(mod)
    if 'break' in sign_data:
        sign_info['break'] = sign_data['break']
    sign_info['sign_loc_id'] = sign_data.get('id', 'no-id')
    return sign_info

In [4]:
def _logo_members(node):
    """Return the members of a logogram group, or [node] when it is not a group."""
    if node.get('gg','') == 'logo':
        return node['group']
    return [node]

def _determinative_signs(det_node):
    """Return the signs inside a semantic determinative, expanding logogram groups one level."""
    signs = []
    for member in det_node['seq']:
        signs.extend(_logo_members(member))
    return signs

def _leaf_signs(sign_data):
    """Return the individual sign nodes contained in one entry of a word's 'gdl' list.

    Handles a semantic determinative, a logogram group (whose members may
    themselves be determinatives), or a plain sign. Nesting is expanded to a
    fixed depth only; anything deeper is returned unexpanded.
    """
    if sign_data.get('det','') == 'semantic':
        return _determinative_signs(sign_data)
    if sign_data.get('gg','') == 'logo':
        signs = []
        for member in sign_data['group']:
            if member.get('det','') == 'semantic':
                signs.extend(_determinative_signs(member))
            else:
                signs.append(member)
        return signs
    return [sign_data]

types = set()
all_signs = []
all_words = []
corpus_dir = 'sargonletters/corpusjson'
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# row order of the resulting data frames reproducible between runs.
for fname in sorted(os.listdir(corpus_dir)):
    # `with` closes every corpus file, including the ones that fail to parse.
    with codecs.open(os.path.join(corpus_dir, fname),'r','utf-8') as f:
        try:
            j = json.load(f)
        except ValueError:
            print('Could not load: ' + fname)
            continue
    text_id = j['textid']
    for a in j['cdl'][0]['cdl']:
        if a.get('type','') != 'discourse':
            continue
        for b in a['cdl']:
            if b.get('type','') != 'sentence':
                continue
            line_label = ''
            for c in b['cdl']:
                node = c.get('node','')
                if node == 'd': #This is the label for the line e.g. "o ii 3"
                    line_label = c.get('label','nolabel')
                if node != 'l': #Only 'l' nodes are regular words in a line
                    continue
                if c.get('tail-sig','') != '': #An extra word??
                    continue
                word = c['f']
                word_info = {
                    'file': fname,
                    'line_label': line_label,
                    'form': word['form'],
                    'frag': c['frag'],
                    'text_id': text_id,
                    'ref': c['ref'],
                    'cf': word.get('cf','no-cf'),
                    'gw': word.get('gw','no-gw'),
                    'pos': word['pos'],
                    'epos': word.get('epos','no-epos'),
                    'sense': word.get('sense','no-sense'),
                    'word_sign_tot': len(word['gdl']),
                    'norm': word.get('norm','no-norm'),
                }
                all_words.append(word_info)
                # Every sign row carries the reading info plus a copy of its word's info.
                for sign_data in word['gdl']:
                    for leaf in _leaf_signs(sign_data):
                        sign_info = process_signs(leaf)
                        sign_info.update(word_info)
                        all_signs.append(sign_info)

print('done')

Could not load: P314095.json
done


Now, we form our Data Frame, in which each row contains information on a single sign in the corpus. Further limitations on which signs are significant to our purposes will be made later, but for now we will eliminate all of the signs which are labelled as "missing" (i.e. reconstructed), because no information about their paleography or orthography can be ascertained.

In [5]:
# One row per collected sign. Keys that a given sign does not have come out
# as NaN in the frame and are replaced by the empty string.
df = pd.DataFrame(all_signs).fillna('')
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,line_label,m,norm,pos,ref,sense,sign_loc_id,text_id,word_sign_tot
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.0,P224485,2
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.1,P224485,2
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,o 1,,šarri,N,P224485.2.2,king,P224485.2.2.0,P224485,1
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.0,P224485,2
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.1,P224485,2
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.0,P224485,4
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.1,P224485,4
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.2,P224485,4
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.3,P224485,4
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.4,P224485,4


<h3>1. Setting Up the Data for Clustering</h3>
<p>The general goal is to assign a vector to each text that reflects the usage of variant orthography and paleography.</p>
<ol>
    <li>Paleography - Any one set of wedges that we classify as a sign can be impressed on the clay in different ways. For example, a wedge can be missing or one can be added. Also, the tilt of a wedge can vary. These are the features we want to examine in order to see if one text prefers one sign writing or another.</li>
    <li>Orthography - Due to the homophony of the cuneiform writing system, one syllable can be written with many signs. For example, 'li' can be written with the LI-sign but also with the NI-sign, in which case it would be transliterated as li<sub>2</sub></li>
</ol>
<p>Other variables can be applied to a text as attributes in its vector. (What are these? We talked about things like provenance, city information, scribe information. Also, if we apply different types of variables how can we use a clustering algorithm to treat these vector components as a different entity?).</p>
<p>This section therefore contains two subsections. One groups the diagnostic signs with or without modifications per text. The other discovers the homophonous signs used throughout the corpus and groups different usages per text.</p>

First of all, let's create more columns in the data frame to aid us
<ol>
<li>mods_str - Since the data contains three columns currently with information on variable paleography, it would help us to consolidate them into one column</li>
<li>str_part and num_part - In order to determine which signs share a syllabic value, it will be useful to separate the transliterated readings into their string components and numerical components. Once we do this, we can group rows with the same str_part and count up the different usages of homophonous signs</li>
</ol>

In [6]:
# Distinct source files, in order of first appearance.
file_names = pd.unique(df['file'])
# Sign name for each reading, looked up case-insensitively; '?' when unknown.
df['sign_form'] = [sign_index.get(reading.lower(),'?') for reading in df['b']]
# The three modification columns joined as 'a.f.m'; '..' means "no modification".
df['mods_str'] = df['a'].add('.').add(df['f']).add('.').add(df['m'])

import re
# Maps Unicode subscript digits to their ASCII counterparts.
_SUBSCRIPT_DIGITS = str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')

def get_num_part(s):
    """Return the subscript index of a transliterated reading.

    The first run of subscript digits in `s` is returned as a string of ASCII
    digits (e.g. 'li₂' -> '2'). When `s` has no subscript, or is not a string,
    the integer 1 is returned (note: an int, not a string).
    """
    try:
        subscript = re.findall(r'[₀₁₂₃₄₅₆₇₈₉]+',s)[0]
    except (IndexError, TypeError):
        # IndexError: no subscript digits; TypeError: `s` is not a string.
        return 1
    return subscript.translate(_SUBSCRIPT_DIGITS)
def get_str_part(s):
    """Return the leading string component of a transliterated reading.

    This is the first run of letters, digits, spaces and parentheses matched
    by the pattern below (e.g. 'li₂' -> 'li'). When nothing matches, or `s`
    is not a string, `s` itself is returned unchanged.
    """
    try:
        return re.findall(r'[a-zA-ZšŠṣṢṭṬʾ \(\)0-9]+',s)[0]
    except (IndexError, TypeError):
        # IndexError: no match; TypeError: `s` is not a string.
        return s
        
# Split every reading into its string component and its subscript index.
df['str_part'] = df['b'].apply(get_str_part)
df['num_part'] = df['b'].apply(get_num_part)
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,pos,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,N,P224485.2.1,word,P224485.2.1.0,P224485,2,A,..,a,1
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,N,P224485.2.1,word,P224485.2.1.1,P224485,2,BAD,..,bat,1
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,...,N,P224485.2.2,king,P224485.2.2.0,P224485,1,LUGAL,..,LUGAL,1
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,PRP,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,...,PRP,P224485.2.3,to,P224485.2.3.1,P224485,2,,.t.,na,1
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,PN,P224485.2.4,1,P224485.2.4.0,P224485,4,DIŠ,..,1(diš),1
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,PN,P224485.2.4,1,P224485.2.4.1,P224485,4,AŠ,..,aš,1
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,PN,P224485.2.4,1,P224485.2.4.2,P224485,4,SUR,..,šur,1
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,PN,P224485.2.4,1,P224485.2.4.3,P224485,4,|U.U|,..,MAN,1
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,PN,P224485.2.4,1,P224485.2.4.4,P224485,4,PAP,..,PAB,1


Before we go into the process let's create some preliminary output for various purposes

In [7]:
# Rows whose modification string is anything other than the empty '..'.
has_mods = df['mods_str'] != '..'
df_modonly = df.loc[has_mods]
df_modonly_file = df_modonly[[
    'sign_form','text_id','b','sign_loc_id','f','a','m','form','frag',
    'ref','break','cf','gw','pos','epos','sense','line_label',
]]
df_modonly_file.to_csv('output/sign_mods_list.csv',encoding='utf-8')
df_modonly_file

Unnamed: 0,sign_form,text_id,b,sign_loc_id,f,a,m,form,frag,ref,break,cf,gw,pos,epos,sense,line_label
4,,P224485,na,P224485.2.3.1,t,,,a-na,a-na\t,P224485.2.3,,ana,to,PRP,PRP,to,o 1
18,,P224485,na,P224485.3.2.1,t,,,a-na,a-na\t,P224485.3.2,,ana,to,PRP,PRP,to,o 2
36,LU₂,P224485,LU₂,P224485.4.4.0,,v,,{LU₂}A-šip-ri,{lu₂v}A-šip-⸢ri⸣,P224485.4.4,,māru,son,N,N,son,o 3
61,|ME.U.U.U|,P224485,MEŠ,P224485.5.7.1,m,,,ERIM-MEŠ,ERIM-MEŠ\m,P224485.5.7,,ṣābu,people,N,N,troops,o 4
73,,P224485,na,P224485.6.4.1,t,,,a-na,a-na\t,P224485.6.4,,ana,to,PRP,PRP,for,o 5
74,LU₂,P224485,LU₂,P224485.6.5.0,,v,,{LU₂}šap-ru-te,{lu₂v}šap-ru-te,P224485.6.5,,šapru,envoy,N,N,envoy,o 5
79,,P224485,na,P224485.7.1.1,t,,,a-na,a-na\t,P224485.7.1,,ana,to,PRP,PRP,to,o 6
81,URI,P224485,URI,P224485.7.2.1,d,,,{KUR}URI,{kur}URI\d,P224485.7.2,,Urarṭu,1,GN,GN,1,o 6
104,ŠIM,P224485,rig,P224485.8.3.2,t,,,an-nu-rig,an-nu-rig\t,P224485.8.3,,annûri,now,AV,AV,now,o 7
114,|ME.U.U.U|,P224485,MEŠ,P224485.9.2.1,m,,,DINGIR-MEŠ-ia,DINGIR-MEŠ\m-ia,P224485.9.2,,ilu,god,N,N,god,o 8


In [10]:
def loc_and_count(loc_id,line_label):
    """Pair each location id with its line label and count the pairs.

    Returns a two-element list: the comma-joined 'id (label)' strings and the
    number of entries in `loc_id`.
    """
    locs = [loc_id[i] + ' (' + line_label[i] + ')' for i in range(len(loc_id))]
    return [','.join(locs), len(locs)]

def _join_sign_locations(group):
    """Comma-join 'sign_loc_id (line_label)' for every row of one group."""
    return ','.join(group['sign_loc_id'] + ' (' + group['line_label'] + ')')

# One row per sign form / modification combination, listing every location.
grouped_mods = df_modonly.groupby(['sign_form','f','a','m'])
df_modsagg = pd.DataFrame(grouped_mods.apply(_join_sign_locations)).reset_index()
df_modsagg.columns = ['sign_form','f','a','m','all_locs']
# Number of locations = number of comma-separated entries.
df_modsagg['count'] = df_modsagg['all_locs'].apply(lambda x: x.count(',') + 1)
df_modsagg.to_csv('output/sign_mods_grouped.csv',encoding='utf-8')
df_modsagg.groupby('sign_form').count()

Unnamed: 0_level_0,f,a,m,all_locs,count
sign_form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,1,1,1,1
AB@g,2,2,2,2,2
AB₂,1,1,1,1,1
AK,4,4,4,4,4
AL,4,4,4,4,4
ALEPH,2,2,2,2,2
AMAR,1,1,1,1,1
ANŠE,7,7,7,7,7
APIN,1,1,1,1,1
ARAD,3,3,3,3,3


In [13]:
# Rows with an empty modification string ('..'), reduced to the export columns.
df_nomods = df.loc[df['mods_str'] == '..', [
    'sign_form','text_id','b','sign_loc_id','f','a','m','form','frag',
    'ref','break','cf','gw','pos','epos','sense','line_label',
]]
df_nomods.to_csv('output/signs_nomods_list.csv',encoding='utf-8')
df_nomods

Unnamed: 0,sign_form,text_id,b,sign_loc_id,f,a,m,form,frag,ref,break,cf,gw,pos,epos,sense,line_label
0,A,P224485,a,P224485.2.1.0,,,,a-bat,⸢a⸣-bat,P224485.2.1,damaged,awātu,word,N,N,word,o 1
1,BAD,P224485,bat,P224485.2.1.1,,,,a-bat,⸢a⸣-bat,P224485.2.1,,awātu,word,N,N,word,o 1
2,LUGAL,P224485,LUGAL,P224485.2.2.0,,,,LUGAL,LUGAL,P224485.2.2,,šarru,king,N,N,king,o 1
3,A,P224485,a,P224485.2.3.0,,,,a-na,a-na\t,P224485.2.3,,ana,to,PRP,PRP,to,o 1
5,DIŠ,P224485,1(diš),P224485.2.4.0,,,,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,P224485.2.4,,Aššur-šarru-uṣur,1,PN,PN,1,o 1
6,AŠ,P224485,aš,P224485.2.4.1,,,,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,P224485.2.4,,Aššur-šarru-uṣur,1,PN,PN,1,o 1
7,SUR,P224485,šur,P224485.2.4.2,,,,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,P224485.2.4,,Aššur-šarru-uṣur,1,PN,PN,1,o 1
8,|U.U|,P224485,MAN,P224485.2.4.3,,,,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,P224485.2.4,,Aššur-šarru-uṣur,1,PN,PN,1,o 1
9,PAP,P224485,PAB,P224485.2.4.4,,,,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,P224485.2.4,damaged,Aššur-šarru-uṣur,1,PN,PN,1,o 1
10,DUN,P224485,šul,P224485.2.5.0,,,,šul-mu,šul⸣-mu,P224485.2.5,damaged,šulmu,completeness,N,N,health,o 1


In [14]:
# One row per word collected while parsing the corpus; exported for reference.
df_words = pd.DataFrame(data=all_words)
df_words.to_csv('output/words_all.csv', encoding='utf-8')
df_words

Unnamed: 0,cf,epos,file,form,frag,gw,line_label,norm,pos,ref,sense,text_id,word_sign_tot
0,awātu,N,P224485.json,a-bat,⸢a⸣-bat,word,o 1,abat,N,P224485.2.1,word,P224485,2
1,šarru,N,P224485.json,LUGAL,LUGAL,king,o 1,šarri,N,P224485.2.2,king,P224485,1
2,ana,PRP,P224485.json,a-na,a-na\t,to,o 1,ana,PRP,P224485.2.3,to,P224485,2
3,Aššur-šarru-uṣur,PN,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485,4
4,šulmu,N,P224485.json,šul-mu,šul⸣-mu,completeness,o 1,šulmu,N,P224485.2.5,health,P224485,2
5,yâšim,IP,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,o 1,ayāši,IP,P224485.2.6,me,P224485,3
6,šulmu,N,P224485.json,šul-mu,⸢šul⸣-mu,completeness,o 2,šulmu,N,P224485.3.1,health,P224485,2
7,ana,PRP,P224485.json,a-na,a-na\t,to,o 2,ana,PRP,P224485.3.2,to,P224485,2
8,Mat-Aššur,GN,P224485.json,KUR-aš-šur{KI},KUR-aš-šur{ki},Assyria,o 2,Mat-Aššur,GN,P224485.3.3,Assyria,P224485,4
9,libbu,N,P224485.json,ŠA₃-ka,⸢ŠA₃⸣-[ka],interior,o 2,libbaka,N,P224485.3.4,mood,P224485,2


In [15]:
# Read the glossary; `with` closes the handle once the JSON is parsed.
with codecs.open('sargonletters/gloss-akk-x-neoass.json','r','utf-8') as gloss_akkx_file:
    gloss_akkx_json = json.load(gloss_akkx_file)
entries_akkx = gloss_akkx_json['entries']
instances_akk = gloss_akkx_json['instances']

# NOTE(review): entries_list and forms_list are never filled in this cell;
# kept only in case another cell expects the names to exist.
entries_list = []
forms_list = []
instances_list = []
# Flatten entry -> form -> instance into one record per instance.
for entry in entries_akkx:
    entry_info = {'headword': entry['headword'], 'xis_entry': entry['xis']}
    for form in entry['forms']:
        xis_form = form['xis']
        form_info = {'form': form['n'], 'xis_form': xis_form}
        form_info.update(entry_info)
        # The form's 'xis' id is the key into the glossary's 'instances' map.
        for inst in instances_akk[xis_form]:
            instance_info = {'instance': inst}
            instance_info.update(form_info)
            instances_list.append(instance_info)

df_instances_akkx = pd.DataFrame(instances_list)
df_instances_akkx

Unnamed: 0,form,headword,instance,xis_entry,xis_form
0,a-ba-ak,abāku[lead away]V,saao/sargonletters:P314281.18.2,akk.r00033,akk.r00034
1,a-tab-ka,abāku[lead away]V,saao/sargonletters:P313450.15.3,akk.r00033,akk.r00035
2,e-tab-ku,abāku[lead away]V,saao/sargonletters:P313753.19.1,akk.r00033,akk.r00036
3,e-tab-ku,abāku[lead away]V,saao/sargonletters:X900012.20.9,akk.r00033,akk.r00036
4,e-tab-ku-u-ni,abāku[lead away]V,saao/sargonletters:P313450.13.1,akk.r00033,akk.r00037
5,i-ta-ba-ka,abāku[lead away]V,saao/sargonletters:P334041.39.2,akk.r00033,akk.r00038
6,ta-bu-uk-šu₂-nu,abāku[lead away]V,saao/sargonletters:P334037.24.2,akk.r00033,akk.r00039
7,e-tab-la,abālu[dry (up)]V,saao/sargonletters:P313871.13.7,akk.r0003a,akk.r0003b
8,e-tab-lu,abālu[dry (up)]V,saao/sargonletters:P334316.15.3,akk.r0003a,akk.r0003c
9,a-ba-ti,abati[(meaning unknown)]N,saao/sargonletters:P313575.32.1,akk.r0000d,akk.r0000d


In [16]:
# Read the glossary; `with` closes the handle once the JSON is parsed.
with codecs.open('sargonletters/gloss-qpn.json','r','utf-8') as gloss_qpn_file:
    gloss_qpn_json = json.load(gloss_qpn_file)
entries_qpn = gloss_qpn_json['entries']
instances_qpn = gloss_qpn_json['instances']

# Flatten entry -> form -> instance into one record per instance.
entries_list = []
for entry in entries_qpn:
    entry_info = {'headword': entry['headword'], 'xis_entry': entry['xis']}
    for form in entry['forms']:
        xis_form = form['xis']
        form_info = {'form': form['n'], 'xis_form': xis_form}
        form_info.update(entry_info)
        # The form's 'xis' id is the key into the glossary's 'instances' map.
        for inst in instances_qpn[xis_form]:
            instance_info = {'instance': inst}
            instance_info.update(form_info)
            entries_list.append(instance_info)

df_instances_qpn = pd.DataFrame(entries_list)
df_instances_qpn

Unnamed: 0,form,headword,instance,xis_entry,xis_form
0,{KUR}ʾa-ta-a.a,ʾAtaya[ʾAtean]EN,saao/sargonletters:P313425.28.2,qpn.r006c5,qpn.r006c5
1,{KUR}ʾa-ta-a.a,ʾAtaya[ʾAtean]EN,saao/sargonletters:P334322.50.5,qpn.r006c5,qpn.r006c5
2,{URU}a-ba-a,Aba[1]GN,saao/sargonletters:P334350.16.1,qpn.r00000,qpn.r00001
3,{URU}a-ba-a.a,Aba[1]GN,saao/sargonletters:P334350.8.3,qpn.r00000,qpn.r00002
4,{1}a-ba-li-u₂-qu-nu,Abaliuqunu[1]PN,saao/sargonletters:P313422.9.2,qpn.r00003,qpn.r00004
5,{1}a-ba-lu-qu-nu,Abaliuqunu[1]PN,saao/sargonletters:P334257.10.1,qpn.r00003,qpn.r00005
6,{1}ab-li-uq-nu,Abaliuqunu[1]PN,saao/sargonletters:P334090.15.3,qpn.r00003,qpn.r00006
7,{1}ab-li-uq-nu,Abaliuqunu[1]PN,saao/sargonletters:P334090.17.1,qpn.r00003,qpn.r00006
8,{1}a-ba-tu₂,Abattu[1]PN,saao/sargonletters:P313425.53.4,qpn.r0000a,qpn.r0000a
9,{1}a-bat-LUGAL-PAB,Abat-šarri-uṣur[1]PN,saao/sargonletters:P334321.19.1,qpn.r00007,qpn.r00008


In [17]:
# Stack the two glossary instance tables. Each keeps its own row labels,
# so index values repeat in the combined frame and in the CSV.
instance_frames = [df_instances_akkx, df_instances_qpn]
df_instances_all = pd.concat(instance_frames)
df_instances_all.to_csv('output/instances_all.csv', encoding='utf-8')
df_instances_all

Unnamed: 0,form,headword,instance,xis_entry,xis_form
0,a-ba-ak,abāku[lead away]V,saao/sargonletters:P314281.18.2,akk.r00033,akk.r00034
1,a-tab-ka,abāku[lead away]V,saao/sargonletters:P313450.15.3,akk.r00033,akk.r00035
2,e-tab-ku,abāku[lead away]V,saao/sargonletters:P313753.19.1,akk.r00033,akk.r00036
3,e-tab-ku,abāku[lead away]V,saao/sargonletters:X900012.20.9,akk.r00033,akk.r00036
4,e-tab-ku-u-ni,abāku[lead away]V,saao/sargonletters:P313450.13.1,akk.r00033,akk.r00037
5,i-ta-ba-ka,abāku[lead away]V,saao/sargonletters:P334041.39.2,akk.r00033,akk.r00038
6,ta-bu-uk-šu₂-nu,abāku[lead away]V,saao/sargonletters:P334037.24.2,akk.r00033,akk.r00039
7,e-tab-la,abālu[dry (up)]V,saao/sargonletters:P313871.13.7,akk.r0003a,akk.r0003b
8,e-tab-lu,abālu[dry (up)]V,saao/sargonletters:P334316.15.3,akk.r0003a,akk.r0003c
9,a-ba-ti,abati[(meaning unknown)]N,saao/sargonletters:P313575.32.1,akk.r0000d,akk.r0000d


In [18]:
def _join_instances(instances):
    """Comma-join all instance references of one group."""
    return ','.join(instances)

# One row per (headword, form) with every instance reference in one string.
forms_grouped = df_instances_all.groupby(['headword','xis_entry','form','xis_form'])
df_forms_all = pd.DataFrame(forms_grouped.agg({'instance': _join_instances})).reset_index()
df_forms_all.columns = ['headword','xis_entry','form','xis_form','instances_all']
df_forms_all.to_csv('output/forms_all.csv',encoding='utf-8')
df_forms_all

Unnamed: 0,headword,xis_entry,form,xis_form,instances_all
0,Aba[1]GN,qpn.r00000,{URU}a-ba-a,qpn.r00001,saao/sargonletters:P334350.16.1
1,Aba[1]GN,qpn.r00000,{URU}a-ba-a.a,qpn.r00002,saao/sargonletters:P334350.8.3
2,Abaliuqunu[1]PN,qpn.r00003,{1}a-ba-li-u₂-qu-nu,qpn.r00004,saao/sargonletters:P313422.9.2
3,Abaliuqunu[1]PN,qpn.r00003,{1}a-ba-lu-qu-nu,qpn.r00005,saao/sargonletters:P334257.10.1
4,Abaliuqunu[1]PN,qpn.r00003,{1}ab-li-uq-nu,qpn.r00006,"saao/sargonletters:P334090.15.3,saao/sargonlet..."
5,Abat-šarri-uṣur[1]PN,qpn.r00007,{1}a-bat-LUGAL-PAB,qpn.r00008,saao/sargonletters:P334321.19.1
6,Abat-šarri-uṣur[1]PN,qpn.r00007,{1}a-bat-MAN-PAB,qpn.r00009,saao/sargonletters:P334282.3.2
7,Abattu[1]PN,qpn.r0000a,{1}a-ba-tu₂,qpn.r0000a,saao/sargonletters:P313425.53.4
8,Abdudaya[Abdudaean]EN,qpn.r0000b,{LU₂}{URU}ab-du-da-a.a,qpn.r0000b,saao/sargonletters:P334504.29.2
9,Abdudu[1]GN,qpn.r0000c,{URU}ab-du-di,qpn.r0000c,saao/sargonletters:P334104.8.2


In [19]:
def _join_instances_with_forms(group):
    """Comma-join 'instance (form)' for every row of one headword group."""
    return ','.join(group['instance'] + ' (' + group['form'] + ')')

# One row per headword, listing every instance together with its written form.
headwords_grouped = df_instances_all.groupby(['headword','xis_entry'])
df_headwords_all = pd.DataFrame(headwords_grouped.apply(_join_instances_with_forms)).reset_index()
df_headwords_all.columns = ['headword','xis_entry','instances_all']
df_headwords_all.to_csv('output/headwords_all.csv',encoding='utf-8')
df_headwords_all

Unnamed: 0,headword,xis_entry,instances_all
0,Aba[1]GN,qpn.r00000,"saao/sargonletters:P334350.16.1 ({URU}a-ba-a),..."
1,Abaliuqunu[1]PN,qpn.r00003,saao/sargonletters:P313422.9.2 ({1}a-ba-li-u₂-...
2,Abat-šarri-uṣur[1]PN,qpn.r00007,saao/sargonletters:P334321.19.1 ({1}a-bat-LUGA...
3,Abattu[1]PN,qpn.r0000a,saao/sargonletters:P313425.53.4 ({1}a-ba-tu₂)
4,Abdudaya[Abdudaean]EN,qpn.r0000b,saao/sargonletters:P334504.29.2 ({LU₂}{URU}ab-...
5,Abdudu[1]GN,qpn.r0000c,saao/sargonletters:P334104.8.2 ({URU}ab-du-di)
6,Abi-Seʾ[1]PN,qpn.r0000d,saao/sargonletters:P313421.7.6 ({1}a-bi-si)
7,Abi-ramu[1]PN,qpn.r0000e,saao/sargonletters:P334412.16.1 ({1}AD-ra-me)
8,Abi-yaqa[1]PN,qpn.r0000f,saao/sargonletters:P334187.10.3 ({1}a-bi-ia-qa-a)
9,Abile[1]PN,qpn.r00010,saao/sargonletters:P334398.14.2 ({1}a-bi-le-e)


<h3>1.1: Collection of Modified Signs</h3>
<p>The Data Frame we have contains the entire collection of signs in the corpus. However, not every sign has variants in paleography (at least according to Parpola's data input). We only want to look at the signs which have these variants, which we will term diagnostic. In the data, they are the signs that include any type of modification</p>

In [None]:
# Rows carrying at least one modification flag ('f', 'a' or 'm' non-empty).
df2 = df[(df['f'] != '') | (df['a'] != '') | (df['m'] != '')]
# Sorted list of the sign forms that occur with a modification at least once.
mod_signs = sorted(df2['sign_form'].unique())
mod_signs

We now limit our Data Frame to include ONLY these diagnostic signs.

In [None]:
# Keep only the diagnostic sign forms and drop damaged signs. The explicit
# .copy() makes df_modsigns an independent frame, so adding the 'combined'
# column below does not trigger pandas' SettingWithCopyWarning.
is_diagnostic = df['sign_form'].isin(mod_signs)
not_damaged = df['break'] != 'damaged'
df_modsigns = df[is_diagnostic & not_damaged].copy()
# 'sign_form:a.f.m' identifies one paleographic variant of a sign.
df_modsigns['combined'] = df_modsigns['sign_form'] + ':' + df_modsigns['mods_str']
df_modsigns

<b>1.1.1</b> Let's take a moment now to generate a list of diagnostic signs with their modifications, listing all their locations in the corpus

In [None]:
# Diagnostic signs that actually carry a modification ('..' = none).
df_onlymods = df_modsigns.loc[df_modsigns['mods_str'].ne('..')]
df_onlymods

Let's export a sorted version of this data frame

In [None]:
# Sort by sign form and modification flags, then keep the export columns.
df_onlymods_sorted = df_onlymods.sort_values(by=['sign_form','f','a','m'])[
    ['sign_loc_id','sign_form','f','a','m','b','frag','text_id']
]
df_onlymods_sorted.to_csv('output/sign_mods_all.csv',encoding='utf-8')
df_onlymods_sorted

<h3>1.2: Collection of Homophonous Signs</h3>
<p>We now limit the original data frame in a different way, based on orthography. First we need to figure out which syllabic readings have multiple signs that can render them.</p>

In [None]:
# Number of distinct subscript indices seen for each string component.
df2 = df.groupby('str_part')['num_part'].nunique().to_frame()
# Readings written with more than one index, i.e. with homophonous signs.
ortho_list = df2.index[df2['num_part'] > 1].tolist()
ortho_list

We need to eliminate capital letter entries because indices on logograms indicate different words and are not relevant here.

In [None]:
# Drop every entry that contains an ASCII capital letter (logogram readings).
ortho_list = [h for h in ortho_list if re.search(r'[A-Z]',h) is None]
ortho_list

Limit the dataframe to only these signs

In [None]:
# Keep only the signs whose string component has homophonous writings.
is_homophonous = df['str_part'].isin(ortho_list)
df_ortho_signs = df.loc[is_homophonous]
df_ortho_signs

<h3>2. Mixed vs. Complementary Distribution</h3>
<p>One of the goals of this project is to determine a preference for sign usage in one subgroup of the corpus versus another. To that end there is one more factor that needs to be discussed, namely the usage of these paleographic or orthographic variants within context. If the usage of these variants is context-dependent, meaning that one form or syllable is used in one context and another form or syllable in another context, it does not tell us much about the preferential usage of the signs. This is known as a complementary distribution. For example, if a scribe uses <i>li<sub>2</sub></i> only in the form of the word be-li<sub>2</sub> and the <i>li</i> sign in all other contexts, the choice of sign usage is determined not by the scribe's preference but rather by scribal convention. This convention would thus be utilized by every scribe of this corpus and would not help us to detect subgroups among these texts where scribes differ.</p>
<p>On the other hand, if sign form or syllable variants appear within the same contexts, it gives us the information we want on scribal writing preference or tendencies. For example, <i>ia</i> and <i>ia<sub>2</sub></i> both appear in forms of the word bēliya, meaning that a scribe had an option of orthography and incised one or the other. (NTS: I'm avoiding the term "choose" here because it is a very loaded term with implications that may be misleading here). The question then becomes whether certain texts group together based on their tendencies to use one variant within a mixed distribution versus another variant.</p>
<p>(paragraph about this dichotomy on the paleographic side of things. Mention TA vs. TA@v)</p>
<p>(closing paragraph summarizing the issue)</p>

<b>2.1 Paleographic Variant Distribution</b>

Now let's try to apply a quantitative method to figure out the level of mixed distribution which paleographic variants bear within word forms. The steps here are:
<ol>
    <li>Select only the particular sign forms and modifications that appear a sufficient number of times within the same forms</li>
    <li>Count the number of times these sign forms and modifications occur in each text</li>
    <li>Create a text matrix which contains the appropriate distribution for the modifications within each sign form, adding 1 to each cell to avoid divide by zero issues</li>
</ol>

In [None]:
# Minimum number of attestations a (sign form, word form, modification)
# combination needs in order to be considered; an arbitrary cut-off.
MIN_FORM_COUNT = 5

# Count attestations per sign form / word form / modification string.
df_mods_agg = pd.DataFrame(df_modsigns.groupby(['sign_form','form','mods_str'])['a'].agg('count')).reset_index()
df_mods_agg.columns = ['sign_form','form','mods_str','count']
# Drop rare combinations. .copy() gives an independent frame so that adding
# 'is_dup' below does not raise SettingWithCopyWarning.
df_mods_agg = df_mods_agg[df_mods_agg['count'] >= MIN_FORM_COUNT].copy()
# Keep only (sign_form, form) pairs that occur with more than one modification
# string; keep=False marks every member of a duplicated pair.
df_mods_agg['is_dup'] = df_mods_agg.duplicated(['sign_form','form'], keep=False)
df_mods_agg = df_mods_agg[df_mods_agg['is_dup']]
df_mods_agg

In [None]:
# Unique (sign form, modification) pairs that survived the filtering above,
# plus a 'sign_form:mods_str' key used as the feature/column name later on.
df_select_signmods = (
    df_mods_agg[['sign_form','mods_str']]
    .drop_duplicates()
    .assign(combined=lambda pairs: pairs['sign_form'] + ':' + pairs['mods_str'])
)
df_select_signmods

Create a list of the selected signs to limit the main paleography dataframe to only those sign forms

In [None]:
# Distinct sign forms present in the filtered aggregate, in order of first appearance.
select_signs = [sign for sign in df_mods_agg['sign_form'].unique()]
select_signs

In [None]:
# Per-file occurrence counts (column 'a') of every modification of the selected sign forms,
# keyed additionally by the 'sign_form:mods_str' string in 'combined'.
df_file_select_signs = (
    df_modsigns.loc[df_modsigns['sign_form'].isin(select_signs)]
    .groupby(['file','sign_form','mods_str'])['a']
    .count()
    .reset_index()
    .assign(combined=lambda counts: counts['sign_form'] + ':' + counts['mods_str'])
)
df_file_select_signs

Create the text matrix according to step 3 above

In [None]:
# Build the text matrix: one row per file, one column per selected 'sign_form:mods_str'
# variant. Each cell is the variant's add-one-smoothed count divided by the smoothed
# total of all selected variants of the same sign form in that file.
variant_keys = list(df_select_signmods['combined'])
variant_forms = list(df_select_signmods['sign_form'])

d_file_select_signs = {}
for f in file_names:
    df_onefile_select_signs = df_file_select_signs[(df_file_select_signs['file'] == f)]
    # Explicit lookup table instead of int(<Series>) guarded by except TypeError:
    # a variant missing from this file simply counts as 0 before smoothing.
    observed = dict(zip(df_onefile_select_signs['combined'], df_onefile_select_signs['a']))
    d = {}  # smoothed count per variant
    e = {}  # smoothed total per sign form
    for combined, sign_form in zip(variant_keys, variant_forms):
        n = int(observed.get(combined, 0)) + 1
        d[combined] = n
        e[sign_form] = e.get(sign_form, 0) + n

    d_file_select_signs[f] = [d[combined] / e[sign_form]
                              for combined, sign_form in zip(variant_keys, variant_forms)]

df_file_select_signs_c = pd.DataFrame(d_file_select_signs).transpose()
df_file_select_signs_c.columns = variant_keys
df_file_select_signs_c

<b>2.1.2. Clustering on Paleography Alone</b>

Try Elbow Method on Paleography Alone

In [None]:
# Elbow method on the paleography matrix: K-Means inertia for k = 1..49.
# NOTE(review): KMeans is created without random_state, so the curve varies between runs.
K = range(1,50)
distortions = []
for k in K:
    km = KMeans(n_clusters=k).fit(df_file_select_signs_c)
    distortions.append(km.inertia_)

# First difference of the inertia curve, padded with one 0 so it has one value per k.
differences = [0] + [later - earlier for earlier, later in zip(distortions, distortions[1:])]
# Second difference, padded with two 0s for the same reason.
deceleration = [0,0] + [later - earlier for earlier, later in zip(differences[1:], differences[2:])]

# Both figures share the same layout; only the plotted series, labels and file differ.
# savefig writes into the relative 'output/' directory.
elbow_figures = [
    ([distortions], 'Distortion', 'Elbow Method', 'output/elbow_paleo_reg.png'),
    ([differences, deceleration], 'Differences', 'Difference Measure', 'output/elbow_paleo_diff.png'),
]
for curves, y_label, fig_title, out_path in elbow_figures:
    plt.figure(num=None, figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
    for curve in curves:
        plt.plot(K,curve,'o-')
    plt.xticks(K)
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.title(fig_title)
    plt.savefig(out_path)
    plt.show()


Let's look now at the silhouette score

In [None]:
# Average silhouette score of K-Means on the paleography matrix for k = 2..69.
K = range(2,70)
sil_scores = []
for k in K:
    km = KMeans(n_clusters=k).fit(df_file_select_signs_c)
    sil_scores.append(silhouette_score(df_file_select_signs_c,labels=km.labels_))

fig, ax = plt.subplots(figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
ax.plot(K,sil_scores,'o-')
ax.set_xticks(list(K))
ax.set_xlabel('k')
ax.set_ylabel('Avg Silhouette')
ax.set_title('Silhouette Scores')
fig.savefig('output/silhouette_paleo_reg.png')
plt.show()

In [None]:
# Cluster the texts on paleography alone.
# NOTE(review): no random_state is set, so cluster ids and membership vary between runs.
km1 = KMeans(n_clusters=14, max_iter=1000).fit(df_file_select_signs_c)

# Map each cluster label to the texts assigned to it. Row labels are taken from the
# fitted matrix itself so that names and labels are guaranteed to line up.
labels_paleo = {}
for text_name, label in zip(df_file_select_signs_c.index, km1.labels_):
    labels_paleo.setdefault(label, []).append(text_name)

#Let's examine some test cases: three pairs of texts which we would expect to cluster together.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
npr = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return the label (as a string) of the cluster containing text `pnum`.

    `labels` maps a cluster label to the list of text names in that cluster.
    Returns None when `pnum` is not found in any cluster.
    """
    for k in labels:
        if pnum in labels[k]:
            return str(k)

print('Sennacherib clusters are: ',find_cluster(sar[0],labels_paleo),' and ',find_cluster(sar[1],labels_paleo))
# The pair is stored in `npr`; `np` is the numpy module and cannot be subscripted.
print('Nabu-pašir clusters are: ',find_cluster(npr[0],labels_paleo),' and ',find_cluster(npr[1],labels_paleo))
print('Nabu-deʾiq clusters are: ',find_cluster(nd[0],labels_paleo),' and ',find_cluster(nd[1],labels_paleo))

<b>2.2. Orthographic Variant Distribution</b>

In [None]:
# Strip subscript index digits from each word form so that variants such as
# li / li₂ yield the same form string. The vectorised string accessor is used
# so the cell does not depend on the `re` module being imported.
df_ortho_signs['form_str_part'] = df_ortho_signs['form'].str.replace(r'[₁₂₃₄₅₆₇₈₉₀]', '', regex=True)
df_ortho_signs

In [None]:
# Count how often each sign reading 'b' occurs per (syllable, index-free word form).
df_syls_agg = (
    df_ortho_signs
    .groupby(['str_part','form_str_part','b'])['a']
    .count()
    .reset_index()
    .rename(columns={'a': 'count'})
)
# Drop combinations attested fewer than 5 times (arbitrary threshold).
# .copy() makes the filtered frame independent, so adding a column below
# does not write into a slice of the unfiltered frame.
df_syls_agg = df_syls_agg.loc[df_syls_agg['count'] >= 5].copy()
# Keep only (str_part, form_str_part) pairs written with more than one reading,
# i.e. word forms in which orthographic variants actually alternate.
df_syls_agg['is_dup'] = df_syls_agg.duplicated(subset=['str_part','form_str_part'], keep=False)
df_syls_agg = df_syls_agg.loc[df_syls_agg['is_dup']]
df_syls_agg

In [None]:
# Unique (syllable, reading) pairs that survived the filtering above.
# No combined key is needed here: the reading 'b' alone serves as the feature name.
df_select_bs = df_syls_agg.loc[:, ['str_part','b']].drop_duplicates()
df_select_bs

In [None]:
# Distinct syllables present in the filtered aggregate, in order of first appearance.
select_syls = [syl for syl in df_syls_agg['str_part'].unique()]
select_syls

In [None]:
# Per-file occurrence counts (column 'a') of every reading of the selected syllables.
# As above, the reading 'b' doubles as the combined key.
df_file_select_bs = (
    df_ortho_signs.loc[df_ortho_signs['str_part'].isin(select_syls)]
    .groupby(['file','str_part','b'])['a']
    .count()
    .reset_index()
)
df_file_select_bs

In [None]:
# Build the text matrix: one row per file, one column per selected reading 'b'.
# Each cell is the reading's add-one-smoothed count divided by the smoothed total
# of all selected readings of the same syllable in that file.
reading_keys = list(df_select_bs['b'])
reading_syls = list(df_select_bs['str_part'])

d_file_select_syls = {}
for f in file_names:
    df_onefile_select_bs = df_file_select_bs[(df_file_select_bs['file'] == f)]
    # Explicit lookup table instead of int(<Series>) guarded by except TypeError:
    # a reading missing from this file simply counts as 0 before smoothing.
    observed = dict(zip(df_onefile_select_bs['b'], df_onefile_select_bs['a']))
    d = {}  # smoothed count per reading
    e = {}  # smoothed total per syllable
    for b, str_part in zip(reading_keys, reading_syls):
        n = int(observed.get(b, 0)) + 1
        d[b] = n
        e[str_part] = e.get(str_part, 0) + n

    d_file_select_syls[f] = [d[b] / e[str_part]
                             for b, str_part in zip(reading_keys, reading_syls)]

df_file_select_syls_c = pd.DataFrame(d_file_select_syls).transpose()
df_file_select_syls_c.columns = reading_keys
df_file_select_syls_c

<b>2.2.2 Cluster using K-Means</b>

Start with Elbow Method

In [None]:
# Elbow method on the orthography matrix: K-Means inertia for k = 1..49.
# NOTE(review): KMeans is created without random_state, so the curve varies between runs.
K = range(1,50)
distortions = []
for k in K:
    km = KMeans(n_clusters=k).fit(df_file_select_syls_c)
    distortions.append(km.inertia_)

# First difference of the inertia curve, padded with one 0 so it has one value per k.
differences = [0] + [later - earlier for earlier, later in zip(distortions, distortions[1:])]
# Second difference, padded with two 0s for the same reason.
deceleration = [0,0] + [later - earlier for earlier, later in zip(differences[1:], differences[2:])]

# Both figures share the same layout; only the plotted series, labels and file differ.
# savefig writes into the relative 'output/' directory.
elbow_figures = [
    ([distortions], 'Distortion', 'Elbow Method', 'output/elbow_ortho_reg.png'),
    ([differences, deceleration], 'Differences', 'Difference Measure', 'output/elbow_ortho_diff.png'),
]
for curves, y_label, fig_title, out_path in elbow_figures:
    plt.figure(num=None, figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
    for curve in curves:
        plt.plot(K,curve,'o-')
    plt.xticks(K)
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.title(fig_title)
    plt.savefig(out_path)
    plt.show()


Silhouette Again

In [None]:
# Average silhouette score of K-Means on the orthography matrix for k = 2..69.
K = range(2,70)
sil_scores = []
for k in K:
    km = KMeans(n_clusters=k).fit(df_file_select_syls_c)
    sil_scores.append(silhouette_score(df_file_select_syls_c,labels=km.labels_))

fig, ax = plt.subplots(figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
ax.plot(K,sil_scores,'o-')
ax.set_xticks(list(K))
ax.set_xlabel('k')
ax.set_ylabel('Avg Silhouette')
ax.set_title('Silhouette Scores')
fig.savefig('output/silhouette_ortho_reg.png')
plt.show()

In [None]:
# Cluster the texts on orthography alone.
# NOTE(review): no random_state is set, so cluster ids and membership vary between runs.
km1 = KMeans(n_clusters=7, max_iter=1000).fit(df_file_select_syls_c)

# Map each cluster label to the texts assigned to it. Row labels are taken from the
# fitted matrix itself so that names and labels are guaranteed to line up.
labels_ortho = {}
for text_name, label in zip(df_file_select_syls_c.index, km1.labels_):
    labels_ortho.setdefault(label, []).append(text_name)

#Let's examine some test cases. We'll select three pairs of texts, which we would expect to cluster always in the same way.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
npr = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return the label (as a string) of the cluster containing text `pnum`.

    `labels` maps a cluster label to the list of text names in that cluster.
    Returns None when `pnum` is not found in any cluster.
    """
    for k in labels:
        if pnum in labels[k]:
            return str(k)

print('Sennacherib clusters are: ',find_cluster(sar[0],labels_ortho),' and ',find_cluster(sar[1],labels_ortho))
# The pair is stored in `npr`; `np` is the numpy module and cannot be subscripted.
print('Nabu-pašir clusters are: ',find_cluster(npr[0],labels_ortho),' and ',find_cluster(npr[1],labels_ortho))
print('Nabu-deʾiq clusters are: ',find_cluster(nd[0],labels_ortho),' and ',find_cluster(nd[1],labels_ortho))

Combine Orthography and Paleography

In [None]:
# Combined text matrix: orthography columns first, then paleography columns,
# joined side by side on the shared row index. A copy is written to disk.
tm_all = pd.concat(
    [df_file_select_syls_c, df_file_select_signs_c],
    axis=1,
)
tm_all.to_csv('output/full_matrix.csv', encoding='utf-8')
tm_all

<b>Elbow Method</b>

In [None]:
# Elbow method on the combined matrix: K-Means inertia for k = 1..49.
# NOTE(review): KMeans is created without random_state, so the curve varies between runs.
K = range(1,50)
distortions = []
for k in K:
    km = KMeans(n_clusters=k).fit(tm_all)
    distortions.append(km.inertia_)

# First difference of the inertia curve, padded with one 0 so it has one value per k.
differences = [0] + [later - earlier for earlier, later in zip(distortions, distortions[1:])]
# Second difference, padded with two 0s for the same reason.
deceleration = [0,0] + [later - earlier for earlier, later in zip(differences[1:], differences[2:])]

# Both figures share the same layout; only the plotted series, labels and file differ.
# savefig writes into the relative 'output/' directory.
elbow_figures = [
    ([distortions], 'Distortion', 'Elbow Method', 'output/elbow_both_reg.png'),
    ([differences, deceleration], 'Differences', 'Difference Measure', 'output/elbow_both_diff.png'),
]
for curves, y_label, fig_title, out_path in elbow_figures:
    plt.figure(num=None, figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
    for curve in curves:
        plt.plot(K,curve,'o-')
    plt.xticks(K)
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.title(fig_title)
    plt.savefig(out_path)
    plt.show()


Silhouette Again

In [None]:
# Average silhouette score of K-Means on the combined matrix for k = 2..69.
K = range(2,70)
sil_scores = []
for k in K:
    km = KMeans(n_clusters=k).fit(tm_all)
    sil_scores.append(silhouette_score(tm_all,labels=km.labels_))

fig, ax = plt.subplots(figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
ax.plot(K,sil_scores,'o-')
ax.set_xticks(list(K))
ax.set_xlabel('k')
ax.set_ylabel('Avg Silhouette')
ax.set_title('Silhouette Scores')
fig.savefig('output/silhouette_both_reg.png')
plt.show()

In [None]:
# Cluster the texts on orthography and paleography together.
# NOTE(review): no random_state is set, so cluster ids and membership vary between runs.
km1 = KMeans(n_clusters=14, max_iter=1000).fit(tm_all)

# Map each cluster label to the texts assigned to it. Row labels are taken from the
# fitted matrix itself so that names and labels are guaranteed to line up.
labels_all = {}
for text_name, label in zip(tm_all.index, km1.labels_):
    labels_all.setdefault(label, []).append(text_name)

#Let's examine some test cases: three pairs of texts which we would expect to cluster together.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
npr = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return the label (as a string) of the cluster containing text `pnum`.

    `labels` maps a cluster label to the list of text names in that cluster.
    Returns None when `pnum` is not found in any cluster.
    """
    for k in labels:
        if pnum in labels[k]:
            return str(k)

print('Sennacherib clusters are: ',find_cluster(sar[0],labels_all),' and ',find_cluster(sar[1],labels_all))
# The pair is stored in `npr`; `np` is the numpy module and cannot be subscripted.
print('Nabu-pašir clusters are: ',find_cluster(npr[0],labels_all),' and ',find_cluster(npr[1],labels_all))
print('Nabu-deʾiq clusters are: ',find_cluster(nd[0],labels_all),' and ',find_cluster(nd[1],labels_all))

<p>The two examples from Sennacherib the prince tend to cluster together BUT letters from other places do not group together according to paleographic and orthographic preferences in those letters. Why should this be? Here are some options</p>
<ol>
    <li>Scribal usage of different paleographies and orthographies is not based on a certain preference, either consciously or unconsciously. In other words, for any given scribe, free variation reigns supreme (expand on this). On the other hand, the letters from Sennacherib do represent a particular style, perhaps due to his station.</li>
    <li>Paleographic and Orthographic variation CAN indicate scribal tendencies, BUT computational methods are insufficient to determine this because machine learning algorithms require large amounts of data and the letters simply do not provide enough data. If so, we must ask the question why it works for Sennacherib but not the others</li>
    <li>There is a problem with my methodology. Maybe I set up the text vectors incorrectly. Maybe I should include more orthographies/paleographies or perhaps less. Maybe the number of clusters selected is wrong.</li>
</ol>

<p>Something else to keep in mind here is that while I limited the number of signs to be considered in the text vectors, I did not restrict any text from being in the corpus. Perhaps I should do that. Maybe certain texts are simply too short to make any determinations about their grouping among the other texts.</p>

Visualize with MDS

In [None]:
from sklearn.manifold import MDS

# Project the combined text matrix onto two dimensions for plotting.
# NOTE(review): MDS is created without random_state, so the layout varies between runs.
mds1 = MDS(n_components=2)
texts_2d = mds1.fit_transform(tm_all)

# Row labels of the matrix, used to annotate the points; the map below is left empty here.
texts = tm_all.index
texts_2d_map = {}

In [None]:
# One colour per text, chosen by its cluster label in km1.
# A 20-entry qualitative colormap is used so that the 14 clusters fitted above each
# get their own colour; the previous 7-name list wrapped around (label % 7), making
# pairs of clusters indistinguishable, and included 'white', which does not show on
# the white figure background.
cluster_cmap = plt.get_cmap('tab20')
colors_all = [cluster_cmap(int(label) % cluster_cmap.N) for label in km1.labels_]
colors_all

In [None]:
# Scatter the 2-D MDS coordinates, coloured by cluster, and label every point with its text name.
fig, ax = plt.subplots(figsize=(16, 16), dpi=80, facecolor='w', edgecolor='k')

x_values = [point[0] for point in texts_2d]
y_values = [point[1] for point in texts_2d]
ax.scatter(x_values, y_values, c=colors_all)
for i, (x, y) in enumerate(zip(x_values, y_values)):
    ax.annotate(texts[i], (x, y))
plt.show()

<h2>Classification</h2>

We start with an initial classification assumption that letters from the same location will cluster in the same groups. We can use the catalogue.json files to get information on the sender locations as well as the senders.

In [None]:
# Load the catalogue; the context manager closes the file handle once parsing is done.
with codecs.open('sargonletters/catalogue.json','r','utf-8') as cat_file:
    cat_json = json.load(cat_file)

# One metadata record per catalogue member, indexed by the member's id_text.
# Fields missing from a member default to the empty string.
class_l = []
class_index = []
for pnum, member in cat_json['members'].items():
    class_index.append(member.get('id_text',''))
    class_l.append({
        'designation': member.get('designation',''),
        'ancient_author': member.get('ancient_author',''),
        'dossier': member.get('dossier',''),
        'senderloc': member.get('senderloc',''),
    })

df_class = pd.DataFrame(class_l,index=class_index)
df_class

In [None]:
# Distinct sender locations and ancient authors found in the catalogue metadata.
senderloc_list = df_class['senderloc'].unique()
author_list = df_class['ancient_author'].unique()
print('There are ', len(senderloc_list), ' sender locations.', sep='')
print('There are ', len(author_list), ' ancient authors', sep='')

---BREAK---

In [None]:
# Occurrence counts per modification for a hand-picked set of five sign forms.
signs_of_interest = ['NI','NA','LUGAL','MA','ŠA']
df_select_signs_tot = df.loc[df['sign_form'].isin(signs_of_interest)]
df_select_tot = pd.DataFrame(
    df_select_signs_tot.groupby(['sign_form','mods_str']).count()
)
df_select_tot

In [None]:
# Two paleographic features only: the share of the '..' variant for NA and for NI.
twofeat_columns = ['NA:..','NI:..']
df_file_twofeats = df_file_select_signs_c[twofeat_columns]
df_file_twofeats

In [None]:
# Attach the two features to the catalogue metadata, aligned on the row index.
# NOTE(review): df_class is indexed by catalogue id_text while the feature matrix is
# indexed by file name; confirm both use the same labels, otherwise rows will not align.
df_class_feats = pd.concat(
    [df_class, df_file_twofeats],
    axis=1,
)
df_class_feats

In [None]:
color_list = ['white','yellow','green','red','blue','brown','black']
marker_list = ['o','v','^','8','s','*','+','D','h']

# Every (colour, marker) combination, colours varying slowest. Sender locations are
# paired with combinations in order; locations beyond the last combination get none.
style_pairs = [(c, m) for c in color_list for m in marker_list]

sender_colors = {}
sender_markers = {}
for loc, (c, m) in zip(senderloc_list, style_pairs):
    sender_colors[loc] = c
    sender_markers[loc] = m

# Look up each text's plotting style from its sender location.
df_class_feats['color'] = df_class_feats['senderloc'].map(sender_colors)
df_class_feats['marker'] = df_class_feats['senderloc'].map(sender_markers)
df_class_feats

In [None]:
# Sender locations ranked by number of texts (counted via the non-null 'dossier' entries).
(
    df_class_feats
    .groupby(['senderloc'])['dossier']
    .count()
    .to_frame()
    .sort_values(by='dossier', ascending=False)
)

In [None]:
# Sender locations singled out for the scatter plot below.
senderlocs_top5 = [
    'Royal Court',
    'Northeastern Assyria',
    'Assyria',
    'Ashur',
    'Central or Southern Babylonia',
]

In [None]:
# Plot the NA and NI '..' shares of every text sent from one of the selected
# locations, using the per-location colour and marker assigned earlier.
plt.figure(num=None, figsize=(16, 16), dpi=120, facecolor='w', edgecolor='k')
top5_rows = df_class_feats[df_class_feats['senderloc'].isin(senderlocs_top5)]
for i, row in top5_rows.iterrows():
    plt.scatter(row['NA:..'], row['NI:..'], c=row['color'], marker=row['marker'])
plt.show()

Which sign forms and syllables work the best to group texts by their sender location?

In [None]:
# Full text matrix with the catalogue metadata appended, aligned on the row index.
# NOTE(review): tm_all is indexed by file name and df_class by id_text; confirm the labels match.
tm_all_class = pd.concat(
    [tm_all, df_class],
    axis=1,
)
tm_all_class

In [None]:
# Variance of every feature within each sender location.
# Only the feature columns that came from tm_all are aggregated: the metadata columns
# added from df_class hold strings, for which a variance is undefined.
feature_columns = list(tm_all.columns)
df_class_var = tm_all_class.groupby('senderloc')[feature_columns].var()
df_class_var

In [None]:
# Square each per-location variance, sum the squares per feature, and export the totals.
df_varsum = df_class_var ** 2
df_varsum = df_varsum.sum().to_frame()
df_varsum.to_csv('output/varsum.csv', encoding='utf-8', sep='\t')

In [None]:
# Total attestations per sign form, most frequent first.
df_mod_count = df_modsigns.groupby('sign_form')['a'].count().to_frame('count')
df_mod_count.sort_values(by='count', ascending=False)

In [None]:
# Total attestations per syllable, most frequent first.
df_ortho_count = df_ortho_signs.groupby('str_part')['a'].count().to_frame('count')
df_ortho_count.sort_values(by='count', ascending=False)

Let's attempt to see which sign forms or orthographies determine the clusters the best. We will count up the occurrences like we did before but for every sign form and syllable. We will then find the center of each class and calculate the sum of squares within each class and between classes for one sign_form or syllable

In [None]:
# One space-separated string of sign readings per text, to feed the count vectorizer.
df_ortho_str = (
    df_ortho_signs
    .groupby(['text_id'])['b']
    .apply(' '.join)
    .to_frame()
)
df_ortho_str.columns = ['ortho_str']
df_ortho_str

In [None]:
# Term matrix of raw reading counts per text. Tokens are maximal runs of non-space
# characters; CountVectorizer's default lowercasing is left on here.
cv = CountVectorizer(token_pattern='[^ ]+')
ft = cv.fit_transform(list(df_ortho_str['ortho_str']))
# get_feature_names() was removed from scikit-learn in favour of get_feature_names_out();
# fall back to the old name only on versions that predate the new one.
if hasattr(cv, 'get_feature_names_out'):
    ortho_feature_names = list(cv.get_feature_names_out())
else:
    ortho_feature_names = cv.get_feature_names()
tm_ortho = pd.DataFrame(ft.toarray(),columns=ortho_feature_names,index=df_ortho_str.index)
tm_ortho

In [None]:
# Map each syllable to the list of distinct readings attested for it.
df_ortho_map = pd.DataFrame(
    df_ortho_signs
    .groupby(['str_part'])
    .apply(lambda grp: ' '.join(grp['b'].unique()).split())
)
map_ortho = df_ortho_map[0].to_dict()
map_ortho

In [None]:
# Convert raw counts to within-syllable shares: for each text, every reading's count is
# divided by the total count of all readings of the same syllable in that text.
# A syllable that does not occur in a text yields NaN for all of its readings.
d = {}
vecs = {}
for i in tm_ortho.index:
    counts = tm_ortho.loc[i]
    shares = {}
    for variants in map_ortho.values():
        syl_sum = np.sum(counts[variants])
        for b in variants:
            shares[b] = counts[b] / syl_sum if syl_sum > 0 else np.nan
    d[i] = shares

tm_ortho_dist = pd.DataFrame(d).transpose()
tm_ortho_dist

In [None]:
# Reading shares with the catalogue metadata appended, aligned on the row index.
tm_ortho_sender = pd.concat(
    [tm_ortho_dist, df_class],
    axis=1,
)
tm_ortho_sender

In [None]:
# NaN-aware variance of every reading share within each sender location.
# Only the share columns that came from tm_ortho_dist are aggregated: the metadata
# columns added from df_class hold strings, for which a variance is undefined.
ortho_share_columns = list(tm_ortho_dist.columns)
tm_ortho_sender_var = tm_ortho_sender.groupby('senderloc')[ortho_share_columns].agg(np.nanvar)
tm_ortho_sender_var

In [None]:
# NaN-aware variance of each reading share across all texts.
tm_ortho_all_var = tm_ortho_dist.apply(np.nanvar).to_frame('var_all')

In [None]:
# Sum of the per-location variances for each reading, ignoring NaNs.
tm_ortho_sender_varsum = tm_ortho_sender_var.agg(np.nansum).to_frame('var_sender')
tm_ortho_sender_varsum

In [None]:
# Total attestations of each reading across the corpus.
df_ortho_bcount = df_ortho_signs.groupby('b')['a'].count().to_frame('bcount')
df_ortho_bcount

In [None]:
# Per reading: overall variance, summed within-location variance, and attestation count,
# side by side, exported as a tab-separated file.
ortho_var_parts = [tm_ortho_all_var, tm_ortho_sender_varsum, df_ortho_bcount]
tm_ortho_varsum = pd.concat(ortho_var_parts, axis=1)
tm_ortho_varsum.to_csv('output/ortho_vars.csv', encoding='utf-8', sep='\t')

Try the same with paleography

In [None]:
# One space-separated string of 'combined' sign-variant keys per text, to feed the count vectorizer.
df_paleo_str = (
    df_modsigns
    .groupby(['text_id'])['combined']
    .apply(' '.join)
    .to_frame()
)
df_paleo_str.columns = ['paleo_str']
df_paleo_str

In [None]:
# Term matrix of raw sign-variant counts per text. Tokens are maximal runs of non-space
# characters; lowercasing is disabled so the upper-case sign names are kept as written.
cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_paleo_str['paleo_str']))
# get_feature_names() was removed from scikit-learn in favour of get_feature_names_out();
# fall back to the old name only on versions that predate the new one.
if hasattr(cv, 'get_feature_names_out'):
    paleo_feature_names = list(cv.get_feature_names_out())
else:
    paleo_feature_names = cv.get_feature_names()
tm_paleo = pd.DataFrame(ft.toarray(),columns=paleo_feature_names,index=df_paleo_str.index)
tm_paleo

In [None]:
# Map each sign form to the list of distinct 'combined' variant keys attested for it.
df_paleo_map = pd.DataFrame(
    df_modsigns
    .groupby(['sign_form'])
    .apply(lambda grp: ' '.join(grp['combined'].unique()).split())
)
map_paleo = df_paleo_map[0].to_dict()
map_paleo

In [None]:
# Convert raw counts to within-sign-form shares: for each text, every variant's count is
# divided by the total count of all variants of the same sign form in that text.
# A sign form that does not occur in a text yields NaN for all of its variants.
d = {}
vecs = {}
for i in tm_paleo.index:
    counts = tm_paleo.loc[i]
    shares = {}
    for variants in map_paleo.values():
        form_sum = np.sum(counts[variants])
        for c in variants:
            shares[c] = counts[c] / form_sum if form_sum > 0 else np.nan
    d[i] = shares

tm_paleo_dist = pd.DataFrame(d).transpose()
tm_paleo_dist

In [None]:
# Variant shares with the catalogue metadata appended, aligned on the row index.
tm_paleo_sender = pd.concat(
    [tm_paleo_dist, df_class],
    axis=1,
)
tm_paleo_sender

In [None]:
# NaN-aware variance of every variant share within each sender location.
# Only the share columns that came from tm_paleo_dist are aggregated: the metadata
# columns added from df_class hold strings, for which a variance is undefined.
paleo_share_columns = list(tm_paleo_dist.columns)
tm_paleo_sender_var = tm_paleo_sender.groupby('senderloc')[paleo_share_columns].agg(np.nanvar)
tm_paleo_sender_var

In [None]:
# NaN-aware variance of each variant share across all texts.
tm_paleo_all_var = tm_paleo_dist.apply(np.nanvar).to_frame('var_all')

In [None]:
# Sum of the per-location variances for each variant, ignoring NaNs.
tm_paleo_sender_varsum = tm_paleo_sender_var.agg(np.nansum).to_frame('var_sender')
tm_paleo_sender_varsum

In [None]:
# Total attestations of each 'combined' sign variant across the corpus.
df_paleo_ccount = df_modsigns.groupby('combined')['a'].count().to_frame('ccount')
df_paleo_ccount

In [None]:
# Per variant: overall variance, summed within-location variance, and attestation count,
# side by side, exported as a tab-separated file.
paleo_var_parts = [tm_paleo_all_var, tm_paleo_sender_varsum, df_paleo_ccount]
tm_paleo_varsum = pd.concat(paleo_var_parts, axis=1)
tm_paleo_varsum.to_csv('output/paleo_vars.csv', encoding='utf-8', sep='\t')

Now that we've selected our syllables and signs let's try to cluster using only those. Let's try orthography first.

In [None]:
# Hand-picked syllables and the two competing readings of each.
map_ortho = {
    'ia': ['ia','ia₂'],
    'li': ['li','li₂'],
    'ša': ['ša','ša₂'],
    'šu': ['šu','šu₂'],
    'u':  ['u','u₂'],
}
# Flat list of all selected readings, in the dictionary's order.
list_ortho = [reading for readings in map_ortho.values() for reading in readings]
list_ortho

In [None]:
# Restrict the count matrix to the selected readings and add 1 to every cell (smoothing).
# NOTE(review): this overwrites tm_ortho with a function of itself, so running the cell
# a second time adds another 1 to every count.
tm_ortho = tm_ortho[list_ortho] + 1
tm_ortho

In [None]:
# Recompute the within-syllable shares on the smoothed, restricted count matrix:
# each reading's count divided by the total for its syllable in the same text.
d = {}
vecs = {}
for i in tm_ortho.index:
    counts = tm_ortho.loc[i]
    shares = {}
    for variants in map_ortho.values():
        syl_sum = np.sum(counts[variants])
        for b in variants:
            shares[b] = counts[b] / syl_sum if syl_sum > 0 else np.nan
    d[i] = shares

tm_ortho_dist = pd.DataFrame(d).transpose()
tm_ortho_dist

In [None]:
# Feature subsets to cluster on: each syllable's pair of readings on its own,
# followed by all selected readings together.
cluster_groups = [
    ['ia','ia₂'],
    ['li','li₂'],
    ['u','u₂'],
    ['ša','ša₂'],
    ['šu','šu₂'],
    list_ortho,
]

In [None]:
# For each feature subset: cluster the texts, then score how well the clusters match
# sender locations (purity) and how well separated they are (silhouette).
# NOTE(review): no random_state is set, so the scores vary between runs.
for g in cluster_groups:
    features = tm_ortho_dist[g]
    km = KMeans(n_clusters=62,max_iter=1000).fit(features)

    # Sender locations of the texts in each cluster.
    senders_clustered = {}
    for text_id, label in zip(features.index, km.labels_):
        senders_clustered.setdefault(label, []).append(df_class.loc[text_id]['senderloc'])

    # Purity: each cluster contributes the size of its most common sender location;
    # the total is divided by the number of texts that were actually clustered
    # (not by the size of the catalogue, which may contain texts absent from the matrix).
    majority_total = 0
    for c in senders_clustered:
        cnt = Counter(senders_clustered[c])
        majority_total += cnt.most_common(1)[0][1]

    purity_score = majority_total / len(km.labels_)
    print(str(g) + ': ' + str(purity_score))

    #Tack on Silhouette
    print('Silhouette: ' + str(silhouette_score(features,labels=km.labels_)))

Do same for paleography

In [None]:
# Hand-picked sign forms and the two competing paleographic variants of each.
map_paleo = {
    'BU': ['BU:..','BU:.p.'],
    'DI': ['DI:..','DI:.d.'],
    'LI': ['LI:..','LI:.d.'],
    'NA': ['NA:..','NA:.t.'],
    'NI': ['NI:..','NI:.d.'],
    'RU': ['RU:..','RU:.d.'],
    '|ME.U.U.U|': ['|ME.U.U.U|:..','|ME.U.U.U|:.m.'],
    'ŠA': ['ŠA:..','ŠA:.dm.'],
}
# Flat list of all selected variants, in the dictionary's order.
list_paleo = [variant for variants in map_paleo.values() for variant in variants]
# Feature subsets to cluster on: each sign form's pair on its own, then all variants together.
cluster_groups_paleo = list(map_paleo.values())
cluster_groups_paleo.append(list_paleo)
list_paleo

In [None]:
# Restrict the count matrix to the selected variants and add 1 to every cell (smoothing).
# NOTE(review): this overwrites tm_paleo with a function of itself, so running the cell
# a second time adds another 1 to every count.
tm_paleo = tm_paleo[list_paleo] + 1
tm_paleo

In [None]:
# Recompute the within-sign-form shares on the smoothed, restricted count matrix:
# each variant's count divided by the total for its sign form in the same text.
d = {}
vecs = {}
for i in tm_paleo.index:
    counts = tm_paleo.loc[i]
    shares = {}
    for variants in map_paleo.values():
        syl_sum = np.sum(counts[variants])
        for b in variants:
            shares[b] = counts[b] / syl_sum if syl_sum > 0 else np.nan
    d[i] = shares

tm_paleo_dist = pd.DataFrame(d).transpose()
tm_paleo_dist

In [None]:
# For each feature subset: cluster the texts, then score how well the clusters match
# sender locations (purity) and how well separated they are (silhouette).
# NOTE(review): no random_state is set, so the scores vary between runs.
for g in cluster_groups_paleo:
    features = tm_paleo_dist[g]
    km = KMeans(n_clusters=62,max_iter=1000).fit(features)

    # Sender locations of the texts in each cluster.
    senders_clustered = {}
    for text_id, label in zip(features.index, km.labels_):
        senders_clustered.setdefault(label, []).append(df_class.loc[text_id]['senderloc'])

    # Purity: each cluster contributes the size of its most common sender location;
    # the total is divided by the number of texts that were actually clustered
    # (not by the size of the catalogue, which may contain texts absent from the matrix).
    majority_total = 0
    for c in senders_clustered:
        cnt = Counter(senders_clustered[c])
        majority_total += cnt.most_common(1)[0][1]

    purity_score = majority_total / len(km.labels_)
    print(str(g) + ': ' + str(purity_score))

    #Tack on Silhouette
    print('Silhouette: ' + str(silhouette_score(features,labels=km.labels_)))

Now let's take a look at the writing of words of the same lemma and normalization and how they are written

In [None]:
# Lemma key in the form cf[gw]pos.
df_words['lemma'] = df_words['cf'] + '[' + df_words['gw'] + ']' + df_words['pos']

# Per (lemma, normalization): number of attestations and number of distinct written forms.
words_by_norm = df_words.groupby(['lemma','norm'])
df_norm_tot = words_by_norm['cf'].count().to_frame('norm_count')
df_norm_uniq = words_by_norm['form'].nunique().to_frame('norm_uniq')
df_norm_info = pd.concat([df_norm_tot, df_norm_uniq], axis=1)
df_norm_info

In [None]:
# Very frequent normalizations (over 500 attestations) written in 2 to 19 distinct ways.
is_frequent = df_norm_info['norm_count'] > 500
has_some_variants = (df_norm_info['norm_uniq'] > 1) & (df_norm_info['norm_uniq'] < 20)
df_norm_info.loc[is_frequent & has_some_variants]

In [None]:
# All attestations of the preposition ina with normalization 'ina'.
is_ina_lemma = df_words['lemma'] == 'ina[in]PRP'
is_ina_norm = df_words['norm'] == 'ina'
df_words.loc[is_ina_lemma & is_ina_norm]

In [None]:
# Attestation count of every written form per (lemma, normalization), exported tab-separated.
df_norm_uniq = (
    df_words
    .groupby(['lemma','norm','form'])['cf']
    .count()
    .to_frame()
)
df_norm_uniq.to_csv('output/forms_unique.csv', encoding='utf-8', sep='\t')

Rare orthographies and paleographies

In [None]:
# Hand-picked rare sign readings.
rare_ortho = [
    'a₂', 'ana₃', 'da₃', 'gal₃', 'i₃', 'ka₂',
    'kam₂', 'ku₃', 'me₂', 'qi₂', 'ur₂',
]
# Hand-picked rare 'sign_form:mods_str' variants.
rare_paleo = [
    'A:.d.', 'AB₂:.d.', 'AK:.dt.', 'AL:.y.', 'AMAR:.p.', 'BA:.p.',
    'BAD:.m.', 'BI:.y.', 'DA:.y.', 'DI:.y.', 'DIN:.d.', 'DUN:.m.',
    'DUN₃:.m.', 'DUN₄:.m.', 'E₂:.ym.', 'GA:.d.', 'GA:.p.',
]

In [None]:
# Every attestation of a rare reading, ordered by text.
uses_rare_reading = df['b'].isin(rare_ortho)
df.loc[uses_rare_reading].sort_values(by='text_id')

Look also at words with plene writings vs. simple writings