In [1]:
import json
import codecs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
#pd.options.display.max_rows = 50
#pd.reset_option('display')

In [2]:
file_ogsl = codecs.open('ogsl-sl.json','r','utf-8')
ogsl = json.load(file_ogsl)
sign_index = ogsl['index']

In [3]:
def process_signs(sign_data):
    sign_info = {}
    if 'v' in sign_data: #This is the label for a standard syllable
        sign_info['b'] = sign_data['v']
    if 's' in sign_data: #This is the label for elements of a logogram
        sign_info['b'] = sign_data['s']
    if 'n' in sign_data:
        sign_info['b'] = sign_data.get('sexified',sign_data.get('form','noform?'))
    if 'mods' in sign_data:
        for m in sign_data['mods']:
            for d in m:
                sign_info[d] = m[d]
    if 'break' in sign_data:
        sign_info['break'] = sign_data['break']
    sign_info['sign_loc_id'] = sign_data.get('id','no-id')
    return sign_info    

In [4]:
types = set()
all_signs = []
all_words = []
for fname in os.listdir('sargonletters/corpusjson'):
    f = codecs.open('sargonletters/corpusjson/'+fname,'r','utf-8')
    try:
        j = json.load(f)
    except ValueError:
        print('Could not load: ' + fname)
        continue
    text_id = j['textid']
    for a in j['cdl'][0]['cdl']:
        if a.get('type','') == 'discourse':
            for b in a['cdl']:
                if b.get('type','') == 'sentence':
                    line_label = ''                    
                    for c in b['cdl']:
                        if c.get('node','') == 'd': #This is the label for the line e.g. "o ii 3"
                            line_label = c.get('label','nolabel')
                        if c.get('node','') == 'l': #This is the label for a regular word in a line
                            if c.get('tail-sig','') != '': #An extra word??
                                continue
                            form = c['f']['form']
                            frag = c['frag']
                            ref  = c['ref']
                            cf   = c['f'].get('cf','no-cf')
                            gw   = c['f'].get('gw','no-gw')
                            pos  = c['f']['pos']
                            sense = c['f'].get('sense','no-sense')
                            norm = c['f'].get('norm','no-norm')
                            epos = c['f'].get('epos','no-epos')
                            word_sign_tot = len(c['f']['gdl'])
                            word_info = {'file':fname,'line_label':line_label,'form': form,'frag': frag, 'text_id': text_id, 'ref': ref,'cf': cf,'gw': gw,'pos': pos,'epos':epos,'sense':sense,'word_sign_tot':word_sign_tot,'norm':norm}
                            all_words.append(word_info)
                            for sign_data in c['f']['gdl']:
                                if sign_data.get('det','') == 'semantic':
                                    for sd in sign_data['seq']:
                                        if sd.get('gg','') == 'logo':
                                            for g in sd['group']:
                                                sign_info = process_signs(g)
                                                sign_info.update(word_info)
                                                all_signs.append(sign_info)
                                        else:
                                            sign_info = process_signs(sd)
                                            sign_info.update(word_info)
                                            all_signs.append(sign_info)
                                elif sign_data.get('gg','') == 'logo':
                                    for g in sign_data['group']:
                                        if g.get('det','') == 'semantic':
                                            for sd in g['seq']:
                                                if sd.get('gg','') == 'logo':
                                                    for gg in sd['group']:
                                                        sign_info = process_signs(gg)
                                                        sign_info.update(word_info)
                                                        all_signs.append(sign_info)                                       
                                                else:
                                                    sign_info = process_signs(sd)
                                                    sign_info.update(word_info)
                                                    all_signs.append(sign_info)                                       
                                        else:
                                            sign_info = process_signs(g)
                                            sign_info.update(word_info)
                                            all_signs.append(sign_info)                                        
                                else:
                                    sign_info = process_signs(sign_data)
                                    sign_info.update(word_info)
                                    all_signs.append(sign_info)
                        '''
                        if c.get('node','') == 'c': #This is the label for a phrase. This seems to no longer be used 
                            for d in c['cdl']:
                                if d.get('node','') == 'l':
                                    form = d['f']['form']
                                    for sign_data in d['f']['gdl']:
                                        if sign_data.get('det','') == 'semantic':
                                            for sd in sign_data['seq']:
                                                sign_info = process_signs(sd)
                                                sign_info.update({'file':fname,'line_label':line_label,'form': form})
                                                all_signs.append(sign_info)
                                        else:
                                            sign_info = process_signs(sign_data)
                                            sign_info.update({'file':fname,'line_label':line_label,'form': form})
                                            all_signs.append(sign_info)
                        '''
                        #types.add(c.get('type','no type'))
                        
print('done')

Could not load: P314095.json
done


In [5]:
df = pd.DataFrame(all_signs)
df = df.fillna('')
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,line_label,m,norm,pos,ref,sense,sign_loc_id,text_id,word_sign_tot
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.0,P224485,2
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,o 1,,abat,N,P224485.2.1,word,P224485.2.1.1,P224485,2
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,o 1,,šarri,N,P224485.2.2,king,P224485.2.2.0,P224485,1
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.0,P224485,2
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,o 1,,ana,PRP,P224485.2.3,to,P224485.2.3.1,P224485,2
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.0,P224485,4
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.1,P224485,4
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.2,P224485,4
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.3,P224485,4
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,o 1,,Aššur-šarru-uṣur,PN,P224485.2.4,1,P224485.2.4.4,P224485,4


In [6]:
file_names = df['file'].unique()
df['sign_form'] = df['b'].apply(lambda x: sign_index.get(x.lower(),'?'))
df['mods_str'] = df['a'] + '.' + df['f']  + '.' + df['m']

import re
def get_num_part(s):
    try:
        n = re.findall(r'[₀₁₂₃₄₅₆₇₈₉]+',s)[0]
        n = n.replace('₀','0').replace('₁','1').replace('₂','2').replace('₃','3').replace('₄','4')
        n = n.replace('₅','5').replace('₆','6').replace('₇','7').replace('₈','8').replace('₉','9')
    except:
        n = 1
    return n
def get_str_part(s):
    try:
        n = re.findall(r'[a-zA-ZšŠṣṢṭṬʾ \(\)0-9]+',s)[0]
    except:
        n = s
    return n
        
df['str_part'] = df['b'].apply(lambda x: get_str_part(x))
df['num_part'] = df['b'].apply(lambda x: get_num_part(x))
df['combined'] = df['sign_form'] + ':' + df['mods_str']
df

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.0,P224485,2,A,..,a,1,A:..
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.1,P224485,2,BAD,..,bat,1,BAD:..
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,...,P224485.2.2,king,P224485.2.2.0,P224485,1,LUGAL,..,LUGAL,1,LUGAL:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.1,P224485,2,,.t.,na,1,NA:.t.
5,,1(diš),,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.0,P224485,4,DIŠ,..,1(diš),1,DIŠ:..
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.1,P224485,4,AŠ,..,aš,1,AŠ:..
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.2,P224485,4,SUR,..,šur,1,SUR:..
8,,MAN,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.3,P224485,4,|U.U|,..,MAN,1,|U.U|:..
9,,PAB,damaged,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.4,P224485,4,PAP,..,PAB,1,PAP:..


We form three count matrices to count appropriate sign forms, syllables, and words

1. Paleography - Sign Form Variants

In [7]:
df2 = df[~(df['mods_str'] == '..')]
list_mod_signs = sorted(list(df2['sign_form'].unique()))

df_modsigns = df[df['sign_form'].isin(list_mod_signs)]
#remove damaged signs too
df_modsigns = df_modsigns[df_modsigns['break'] != 'damaged']
df_modsigns

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
1,,bat,,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.1,P224485,2,BAD,..,bat,1,BAD:..
2,,LUGAL,,šarru,N,,P224485.json,LUGAL,LUGAL,king,...,P224485.2.2,king,P224485.2.2.0,P224485,1,LUGAL,..,LUGAL,1,LUGAL:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
4,,na,,ana,PRP,t,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.1,P224485,2,,.t.,na,1,NA:.t.
7,,šur,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.2,P224485,4,SUR,..,šur,1,SUR:..
11,,mu,,šulmu,N,,P224485.json,šul-mu,šul⸣-mu,completeness,...,P224485.2.5,health,P224485.2.5.1,P224485,2,MU,..,mu,1,MU:..
12,,ia,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.0,P224485,3,|I.A|,..,ia,1,|I.A|:..
14,,ši,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.2,P224485,3,IGI,..,ši,1,IGI:..
16,,mu,,šulmu,N,,P224485.json,šul-mu,⸢šul⸣-mu,completeness,...,P224485.3.1,health,P224485.3.1.1,P224485,2,MU,..,mu,1,MU:..
17,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.3.2,to,P224485.3.2.0,P224485,2,A,..,a,1,A:..


In [8]:
df_paleo_str = pd.DataFrame(df_modsigns.groupby(['text_id']).apply(lambda x: ' '.join(x['combined'])))
df_paleo_str.columns = ['paleo_str']
df_paleo_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_paleo_str['paleo_str']))
tm_paleo = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_paleo_str.index)
tm_paleo

Unnamed: 0_level_0,A:..,A:.d.,AB@g:..,AB@g:.m.,AB@g:.p.,AB₂:..,AB₂:.d.,AK:..,AK:.d.,AK:.dt.,...,ŠIM:..,ŠIM:.d.,ŠIM:.p.,ŠIM:.t.,ŠU:..,ŠU:.d.,ŠU:.m.,ŠU₂:..,ŠU₂:.d.,ŠU₂:.t.
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,104,0,1,0,0,1,0,1,0,0,...,3,0,0,2,4,0,0,28,0,0
P237089,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
P238649,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P313416,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
P313417,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8,0,0,0,0,0
P313419,16,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,0,0
P313420,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
P313421,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
P313422,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
P313425,47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,1,0,0,0


2. Orthography - Sign Value Variants

In [9]:
df2 = pd.DataFrame(df.groupby(['str_part'])['num_part'].agg('nunique'))
list_ortho_syls = list(df2[df2[('num_part')] > 1].index)

list_ortho_syls = [h for h in list_ortho_syls if len(re.findall(r'[A-Z]',h)) == 0]
list_ortho_syls

df_ortho_signs = df[df['str_part'].isin(list_ortho_syls)]
df_ortho_signs

Unnamed: 0,a,b,break,cf,epos,f,file,form,frag,gw,...,ref,sense,sign_loc_id,text_id,word_sign_tot,sign_form,mods_str,str_part,num_part,combined
0,,a,damaged,awātu,N,,P224485.json,a-bat,⸢a⸣-bat,word,...,P224485.2.1,word,P224485.2.1.0,P224485,2,A,..,a,1,A:..
3,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.2.3,to,P224485.2.3.0,P224485,2,A,..,a,1,A:..
6,,aš,,Aššur-šarru-uṣur,PN,,P224485.json,{1}aš-šur-MAN-PAB,{1}aš-šur-MAN-⸢PAB,1,...,P224485.2.4,1,P224485.2.4.1,P224485,4,AŠ,..,aš,1,AŠ:..
12,,ia,,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.0,P224485,3,|I.A|,..,ia,1,|I.A|:..
13,,a,damaged,yâšim,IP,,P224485.json,ia-a-ši,ia-⸢a⸣-ši,to me,...,P224485.2.6,me,P224485.2.6.1,P224485,3,A,..,a,1,A:..
17,,a,,ana,PRP,,P224485.json,a-na,a-na\t,to,...,P224485.3.2,to,P224485.3.2.0,P224485,2,A,..,a,1,A:..
20,,aš,,Mat-Aššur,GN,,P224485.json,KUR-aš-šur{KI},KUR-aš-šur{ki},Assyria,...,P224485.3.3,Assyria,P224485.3.3.1,P224485,4,AŠ,..,aš,1,AŠ:..
24,,ka,missing,libbu,N,,P224485.json,ŠA₃-ka,⸢ŠA₃⸣-[ka],interior,...,P224485.3.4,mood,P224485.3.4.1,P224485,2,KA,..,ka,1,KA:..
28,,ka,,ṭābu,AJ,,P224485.json,DUG₃.GA-ka,DUG₃.GA-ka,good,...,P224485.3.6,good,P224485.3.6.2,P224485,2,KA,..,ka,1,KA:..
29,,ša,damaged,ša,REL,,P224485.json,ša,⸢ša⸣,that,...,P224485.4.1,what,P224485.4.1.0,P224485,1,ŠA,..,ša,1,ŠA:..


In [10]:
df_ortho_str = pd.DataFrame(df_ortho_signs.groupby(['text_id']).apply(lambda x: ' '.join(x['b'])))
df_ortho_str.columns = ['ortho_str']
df_ortho_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_ortho_str['ortho_str']))
tm_ortho_sign = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_ortho_str.index)
tm_ortho_sign

Unnamed: 0_level_0,a,ana,ana₃,ar,ar₂,aš,aš₂,a₂,be,be₂,...,ša₂,šu,šum,šum₂,šu₂,ṭe,ṭe₂,ṭe₃,ṭi,ṭi₂
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,103,2,0,6,0,5,8,0,7,2,...,5,4,1,0,33,0,0,1,0,0
P237089,5,0,0,0,0,0,0,0,2,0,...,0,1,0,0,0,0,0,0,0,0
P238649,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,0,0
P313416,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
P313417,36,0,0,0,0,1,2,0,0,0,...,0,8,0,0,0,1,0,1,1,0
P313419,16,0,0,0,0,0,0,0,0,0,...,1,0,0,0,2,0,0,1,0,1
P313420,14,1,0,1,0,1,0,0,6,0,...,0,0,0,0,3,0,0,1,0,0
P313421,7,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
P313422,11,1,0,2,0,1,1,0,1,1,...,0,0,0,0,4,0,0,0,0,0
P313425,51,0,0,5,0,0,1,0,5,0,...,3,3,1,1,0,0,0,0,0,0


3. Orthography - Word Transliteration Variants

In [11]:
df_words = pd.DataFrame(all_words)
df_words = df_words[(df_words['cf'] != 'no-cf')]

df_words['lemma'] = df_words['cf'] + '[' + df_words['gw'] + ']' + df_words['pos']
df_words['lemma_norm'] = df_words['lemma'] + ':' + df_words['norm']
df_words['lemma_norm_form'] = df_words['lemma_norm'] + ':' + df_words['form']

df_norm_uniq = pd.DataFrame(df_words.groupby('lemma_norm')['form'].nunique())
list_ortho_words = list(df_norm_uniq[df_norm_uniq[('form')] > 1].index)

df_form_50 = pd.DataFrame(df_words.groupby('lemma_norm_form')['form'].agg('count'))
list_form_50 = list(df_form_50[df_form_50[('form')] > 50].index)

df_ortho_words = df_words[(df_words['lemma_norm'].isin(list_ortho_words)) & df_words['lemma_norm_form'].isin(list_form_50)]
df_ortho_words

Unnamed: 0,cf,epos,file,form,frag,gw,line_label,norm,pos,ref,sense,text_id,word_sign_tot,lemma,lemma_norm,lemma_norm_form
1,šarru,N,P224485.json,LUGAL,LUGAL,king,o 1,šarri,N,P224485.2.2,king,P224485,1,šarru[king]N,šarru[king]N:šarri,šarru[king]N:šarri:LUGAL
2,ana,PRP,P224485.json,a-na,a-na\t,to,o 1,ana,PRP,P224485.2.3,to,P224485,2,ana[to]PRP,ana[to]PRP:ana,ana[to]PRP:ana:a-na
4,šulmu,N,P224485.json,šul-mu,šul⸣-mu,completeness,o 1,šulmu,N,P224485.2.5,health,P224485,2,šulmu[completeness]N,šulmu[completeness]N:šulmu,šulmu[completeness]N:šulmu:šul-mu
6,šulmu,N,P224485.json,šul-mu,⸢šul⸣-mu,completeness,o 2,šulmu,N,P224485.3.1,health,P224485,2,šulmu[completeness]N,šulmu[completeness]N:šulmu,šulmu[completeness]N:šulmu:šul-mu
7,ana,PRP,P224485.json,a-na,a-na\t,to,o 2,ana,PRP,P224485.3.2,to,P224485,2,ana[to]PRP,ana[to]PRP:ana,ana[to]PRP:ana:a-na
10,lū,MOD,P224485.json,lu,⸢lu⸣,may,o 2,lū,MOD,P224485.3.5,may,P224485,1,lū[may]MOD,lū[may]MOD:lū,lū[may]MOD:lū:lu
12,ša,REL,P224485.json,ša,⸢ša⸣,that,o 3,ša,REL,P224485.4.1,what,P224485,1,ša[that]REL,ša[that]REL:ša,ša[that]REL:ša:ša
14,mā,PRP,P224485.json,ma-a,ma-a,saying,o 3,mā,PRP,P224485.4.3,saying,P224485,2,mā[saying]PRP,mā[saying]PRP:mā,mā[saying]PRP:mā:ma-a
16,ša,DET,P224485.json,ša,[ša],of,o 3,ša,DET,P224485.4.5,of,P224485,1,ša[of]DET,ša[of]DET:ša,ša[of]DET:ša:ša
19,ina,PRP,P224485.json,ina,ina,in,o 4,ina,PRP,P224485.5.2,in,P224485,1,ina[in]PRP,ina[in]PRP:ina,ina[in]PRP:ina:ina


In [12]:
df_ortho_wordstr = pd.DataFrame(df_ortho_words.groupby(['text_id']).apply(lambda x: ' '.join(x['lemma_norm_form'])))
df_ortho_wordstr.columns = ['ortho_wordstr']
df_ortho_str

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_ortho_wordstr['ortho_wordstr']))
tm_ortho_word = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_ortho_wordstr.index)
tm_ortho_word.to_csv('output/tm_ortho_word.csv',encoding='utf-8',sep='\t')
tm_ortho_word

Unnamed: 0_level_0,Urarṭaya[Urarṭian]EN:Urarṭaya:{KUR}URI-a.a,adanniš[very,adi[until]PRP:adi:a-di,akī[as]PRP:akī:a-ki,alāku[go]V:ittalka:it-tal-ka,ammar[as,ana[to]PRP:ana:a-na,annûri[now]AV:annurig:an-nu-rig,anāku[I]IP:anāku:a-na-ku,ardu[slave]N:urdaka:ARAD-ka,...,šumma[if]MOD:šumma:šum₂-ma,šumma[if]MOD:šummu:šum₂-mu,šunu[they]IP:šunu:šu-nu,šū[he]IP:šû:šu-u,šū[he]IP:šû:šu-u₂,ūma[today]AV:ūmâ:u₂-ma-a,ṣābu[people]N:ṣābāni:ERIM-MEŠ,ṣābu[people]N:ṣābāni:{LU₂}ERIM-MEŠ,ṭābu[good]AJ:ṭāb:DUG₃.GA,ṭēmu[(fore)thought]N:ṭēmu:ṭe₃-e-mu
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P224485,0,2,0,0,2,1,11,4,0,0,...,0,0,0,0,1,3,3,0,0,0
P237089,0,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,1,0
P238649,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
P313416,0,0,0,0,0,0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313417,0,0,0,0,0,1,7,0,0,1,...,0,0,0,0,0,2,1,0,0,1
P313419,0,0,0,0,0,0,5,1,0,1,...,0,0,0,0,0,0,0,0,0,1
P313420,0,0,1,0,0,0,3,0,0,1,...,0,0,0,0,0,0,1,0,0,1
P313421,0,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313422,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P313425,0,1,0,0,0,0,10,0,3,1,...,1,0,1,0,1,2,0,0,0,0


Now I can limit each of the matrices for the feature distributions I want to look at

In [13]:
#Paleography
map_paleo = {'BU':['BU:..','BU:.p.'], 'DI':['DI:..','DI:.d.'], 'LI':['LI:..','LI:.d.'], 'NA':['NA:..','NA:.t.'], 'NI':['NI:..','NI:.d.'], 'RU':['RU:..','RU:.d.'], '|ME.U.U.U|':['|ME.U.U.U|:..','|ME.U.U.U|:.m.'], 'ŠA': ['ŠA:..','ŠA:.dm.']}
list_paleo = []
list_paleo_pairs = []
for v in map_paleo.values():
    list_paleo = list_paleo + v
    list_paleo_pairs.append(v)
list_paleo

#Orthography - Sign
map_ortho_sign = {'ia':['ia','ia₂'], 'li':['li','li₂'], 'ša':['ša','ša₂'], 'šu':['šu','šu₂'], 'u':['u','u₂']}
list_ortho_sign = []
list_ortho_sign_pairs = []
for v in map_ortho_sign.values():
    list_ortho_sign = list_ortho_sign + v
    list_ortho_sign_pairs.append(v)
list_ortho_sign

#Orthography - Word
map_ortho_word = {'bēlu[lord]N:bēlī':['bēlu[lord]N:bēlī:be-li₂','bēlu[lord]N:bēlī:EN'],
                  'bēlu[lord]N:bēlīya':['bēlu[lord]N:bēlīya:EN-ia','bēlu[lord]N:bēlīya:be-li₂-ia'],
                  'lā[not]MOD:lā':['lā[not]MOD:lā:la','lā[not]MOD:lā:la-a'],
                  'lū[may]MOD:lū':['lū[may]MOD:lū:lu','lū[may]MOD:lū:lu-u'],
                  'šulmu[completeness]N:šulmu':['šulmu[completeness]N:šulmu:DI-mu','šulmu[completeness]N:šulmu:šul-mu'],
                  'mā[saying]PRP:mā':['mā[saying]PRP:mā:ma','mā[saying]PRP:mā:ma-a']}
list_ortho_word = []
list_ortho_word_pairs = []
for v in map_ortho_word.values():
    list_ortho_word = list_ortho_word + v
    list_ortho_word_pairs.append(v)
list_ortho_word_pairs
list_features_all = list_paleo + list_ortho_sign + list_ortho_word

Find the general distribution of the pairs

In [14]:
df_form_count = pd.DataFrame(df_words[df_words['lemma_norm_form'].isin(list_ortho_word)].groupby(['lemma_norm','lemma_norm_form'])['cf'].agg('count'))
df_form_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
lemma_norm,lemma_norm_form,Unnamed: 2_level_1
bēlu[lord]N:bēlī,bēlu[lord]N:bēlī:EN,182
bēlu[lord]N:bēlī,bēlu[lord]N:bēlī:be-li₂,642
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:EN-ia,770
bēlu[lord]N:bēlīya,bēlu[lord]N:bēlīya:be-li₂-ia,309
lā[not]MOD:lā,lā[not]MOD:lā:la,623
lā[not]MOD:lā,lā[not]MOD:lā:la-a,117
lū[may]MOD:lū,lū[may]MOD:lū:lu,516
lū[may]MOD:lū,lū[may]MOD:lū:lu-u,190
mā[saying]PRP:mā,mā[saying]PRP:mā:ma,61
mā[saying]PRP:mā,mā[saying]PRP:mā:ma-a,1442


In [15]:
dict_form_count = {}
for ln in map_ortho_word:
    count1 = float(df_form_count.loc[(ln,map_ortho_word[ln][0])]['cf'])
    count2 = float(df_form_count.loc[(ln,map_ortho_word[ln][1])]['cf'])
    tot12 = count1 + count2
    dict_form_count[map_ortho_word[ln][0]] = count1 / tot12
    dict_form_count[map_ortho_word[ln][1]] = count2 / tot12
dict_form_count

{'bēlu[lord]N:bēlī:EN': 0.220873786407767,
 'bēlu[lord]N:bēlī:be-li₂': 0.779126213592233,
 'bēlu[lord]N:bēlīya:EN-ia': 0.7136237256719185,
 'bēlu[lord]N:bēlīya:be-li₂-ia': 0.28637627432808155,
 'lā[not]MOD:lā:la': 0.8418918918918918,
 'lā[not]MOD:lā:la-a': 0.1581081081081081,
 'lū[may]MOD:lū:lu': 0.7308781869688386,
 'lū[may]MOD:lū:lu-u': 0.26912181303116145,
 'mā[saying]PRP:mā:ma': 0.04058549567531603,
 'mā[saying]PRP:mā:ma-a': 0.959414504324684,
 'šulmu[completeness]N:šulmu:DI-mu': 0.8677325581395349,
 'šulmu[completeness]N:šulmu:šul-mu': 0.13226744186046513}

In [16]:
df_paleo_count = pd.DataFrame(df[df['combined'].isin(list_paleo)].groupby(['sign_form','combined'])['cf'].agg('count'))
df_paleo_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
sign_form,combined,Unnamed: 2_level_1
BU,BU:..,1079
BU,BU:.p.,110
DI,DI:..,1145
DI,DI:.d.,160
LI,LI:..,574
LI,LI:.d.,164
,NA:..,2795
,NA:.t.,483
NI,NI:..,5087
NI,NI:.d.,289


In [17]:
dict_paleo_count = {}
for ln in map_paleo:
    count1 = float(df_paleo_count.loc[(ln,map_paleo[ln][0])]['cf'])
    count2 = float(df_paleo_count.loc[(ln,map_paleo[ln][1])]['cf'])
    tot12 = count1 + count2
    dict_paleo_count[map_paleo[ln][0]] = count1 / tot12
    dict_paleo_count[map_paleo[ln][1]] = count2 / tot12
dict_paleo_count

{'BU:..': 0.9074852817493693,
 'BU:.p.': 0.09251471825063078,
 'DI:..': 0.8773946360153256,
 'DI:.d.': 0.12260536398467432,
 'LI:..': 0.7777777777777778,
 'LI:.d.': 0.2222222222222222,
 'NA:..': 0.8526540573520439,
 'NA:.t.': 0.14734594264795606,
 'NI:..': 0.9462425595238095,
 'NI:.d.': 0.05375744047619048,
 'RU:..': 0.8204334365325078,
 'RU:.d.': 0.17956656346749225,
 '|ME.U.U.U|:..': 0.7247969421882465,
 '|ME.U.U.U|:.m.': 0.27520305781175347,
 'ŠA:..': 0.8141176470588235,
 'ŠA:.dm.': 0.18588235294117647}

In [18]:
df_sign_count = pd.DataFrame(df[df['b'].isin(list_ortho_sign)].groupby(['str_part','b'])['cf'].agg('count'))
df_sign_count

Unnamed: 0_level_0,Unnamed: 1_level_0,cf
str_part,b,Unnamed: 2_level_1
ia,ia,1724
ia,ia₂,424
li,li,701
li,li₂,1191
u,u,1503
u,u₂,1843
ša,ša,2827
ša,ša₂,439
šu,šu,747
šu,šu₂,1725


In [19]:
dict_sign_count = {}
for ln in map_ortho_sign:
    count1 = float(df_sign_count.loc[(ln,map_ortho_sign[ln][0])]['cf'])
    count2 = float(df_sign_count.loc[(ln,map_ortho_sign[ln][1])]['cf'])
    tot12 = count1 + count2
    dict_sign_count[map_ortho_sign[ln][0]] = count1 / tot12
    dict_sign_count[map_ortho_sign[ln][1]] = count2 / tot12
dict_sign_count

{'ia': 0.8026070763500931,
 'ia₂': 0.1973929236499069,
 'li': 0.370507399577167,
 'li₂': 0.629492600422833,
 'u': 0.4491930663478781,
 'u₂': 0.550806933652122,
 'ša': 0.8655848132271893,
 'ša₂': 0.13441518677281078,
 'šu': 0.30218446601941745,
 'šu₂': 0.6978155339805825}

Put it all together

In [20]:
tm_counts = pd.concat([tm_paleo,tm_ortho_sign,tm_ortho_word],axis=1)

#Eliminate NaN's
#tm_counts = tm_counts.fillna(0)

#Smooth the matrix (No longer doing this)
#tm_counts = tm_counts.apply(lambda x: x+1)
tm_counts[list_ortho_word]

Unnamed: 0,bēlu[lord]N:bēlī:be-li₂,bēlu[lord]N:bēlī:EN,bēlu[lord]N:bēlīya:EN-ia,bēlu[lord]N:bēlīya:be-li₂-ia,lā[not]MOD:lā:la,lā[not]MOD:lā:la-a,lū[may]MOD:lū:lu,lū[may]MOD:lū:lu-u,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,mā[saying]PRP:mā:ma,mā[saying]PRP:mā:ma-a
P224485,1.0,0.0,0.0,6.0,6.0,0.0,14.0,2.0,0.0,2.0,0.0,23.0
P237089,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
P238649,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
P313416,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
P313417,0.0,8.0,5.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,3.0
P313419,0.0,0.0,6.0,0.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,5.0
P313420,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,4.0
P313421,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
P313422,1.0,1.0,1.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0
P313425,5.0,1.0,4.0,0.0,4.0,1.0,3.0,0.0,3.0,0.0,1.0,5.0


In [21]:
map_all = map_paleo.copy()
map_all.update(map_ortho_sign)
map_all.update(map_ortho_word)
map_all

{'BU': ['BU:..', 'BU:.p.'],
 'DI': ['DI:..', 'DI:.d.'],
 'LI': ['LI:..', 'LI:.d.'],
 'NA': ['NA:..', 'NA:.t.'],
 'NI': ['NI:..', 'NI:.d.'],
 'RU': ['RU:..', 'RU:.d.'],
 'bēlu[lord]N:bēlī': ['bēlu[lord]N:bēlī:be-li₂', 'bēlu[lord]N:bēlī:EN'],
 'bēlu[lord]N:bēlīya': ['bēlu[lord]N:bēlīya:EN-ia',
  'bēlu[lord]N:bēlīya:be-li₂-ia'],
 'ia': ['ia', 'ia₂'],
 'li': ['li', 'li₂'],
 'lā[not]MOD:lā': ['lā[not]MOD:lā:la', 'lā[not]MOD:lā:la-a'],
 'lū[may]MOD:lū': ['lū[may]MOD:lū:lu', 'lū[may]MOD:lū:lu-u'],
 'mā[saying]PRP:mā': ['mā[saying]PRP:mā:ma', 'mā[saying]PRP:mā:ma-a'],
 'u': ['u', 'u₂'],
 '|ME.U.U.U|': ['|ME.U.U.U|:..', '|ME.U.U.U|:.m.'],
 'ŠA': ['ŠA:..', 'ŠA:.dm.'],
 'ša': ['ša', 'ša₂'],
 'šu': ['šu', 'šu₂'],
 'šulmu[completeness]N:šulmu': ['šulmu[completeness]N:šulmu:DI-mu',
  'šulmu[completeness]N:šulmu:šul-mu']}

In [22]:
def p(s):
    powerset = []
    for i in range(2**len(s)):
        subset = [x for j,x in enumerate(s) if (i >> j) & 1]
        powerset.append(subset)
    return powerset

In [23]:
import itertools
list_paleo_power = p(list_paleo_pairs)
list_paleo_power = [list(itertools.chain.from_iterable(x)) for x in list_paleo_power if x != []]

list_ortho_sign_power = p(list_ortho_sign_pairs)
list_ortho_sign_power = [list(itertools.chain.from_iterable(x)) for x in list_ortho_sign_power if x != []]

list_ortho_word_power = p(list_ortho_word_pairs)
list_ortho_word_power = [list(itertools.chain.from_iterable(x)) for x in list_ortho_word_power if x != []]
len(list_ortho_sign_power)

31

In [24]:
list_1pair = []
for m in map_all:
    list_1pair.append(map_all[m])
list_1pair

list_2pair = []
for i in range(len(list_1pair)):
    for j in range(len(list_1pair)):
        if j > i:
            list_2pair.append(list_1pair[i] + list_1pair[j])
list_2pair

list_3pair = []
for i in range(len(list_1pair)):
    for j in range(len(list_1pair)):
        for k in range(len(list_1pair)):
            if k > j and j > i:
                list_3pair.append(list_1pair[i] + list_1pair[j] + list_1pair[k])
                
#put them all together
list_allpairs = list_1pair + list_2pair + list_3pair + [list_paleo] + [list_ortho_sign] + [list_ortho_word] + [list_paleo+list_ortho_sign+list_ortho_word]

In [25]:
def form_dist_matrix(tm_count,mapping):
    d = {}
    vecs = {}
    
    listing = []
    for v in mapping.values():
        listing = listing + v
        
    tm_count = tm_count[listing]
    for i, row in tm_count.iterrows():
        d[i] = {}
        for key in mapping:
            key_sum = np.sum(tm_count.loc[i][mapping[key]])
            for v in mapping[key]:
                if key_sum > 0:
                    d[i][v] = tm_count.loc[i][v] / key_sum
                else:
                    d[i][v] = np.nan

    tm_dist = pd.DataFrame(d).transpose()
    return tm_dist

In [26]:
tm_dist_all = form_dist_matrix(tm_counts,map_all)
tm_dist_all

Unnamed: 0,BU:..,BU:.p.,DI:..,DI:.d.,LI:..,LI:.d.,NA:..,NA:.t.,NI:..,NI:.d.,...,|ME.U.U.U|:..,|ME.U.U.U|:.m.,ŠA:..,ŠA:.dm.,ša,ša₂,šu,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,šu₂
P224485,1.000000,0.000000,0.666667,0.333333,1.000000,0.000000,0.526316,0.473684,0.977273,0.022727,...,0.315789,0.684211,1.000000,0.000000,0.807692,0.192308,0.108108,0.0,1.0,0.891892
P237089,,,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,,,1.000000,0.000000,1.000000,0.000000,1.000000,1.0,0.0,0.000000
P238649,1.000000,0.000000,1.000000,0.000000,0.500000,0.500000,1.000000,0.000000,0.600000,0.400000,...,,,1.000000,0.000000,1.000000,0.000000,,,,
P313416,,,1.000000,0.000000,,,1.000000,0.000000,0.500000,0.500000,...,0.000000,1.000000,,,,,,0.0,1.0,
P313417,,,0.833333,0.166667,1.000000,0.000000,0.357143,0.642857,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.0,0.0,0.000000
P313419,1.000000,0.000000,1.000000,0.000000,,,0.833333,0.166667,0.800000,0.200000,...,,,1.000000,0.000000,0.750000,0.250000,0.000000,1.0,0.0,1.000000
P313420,,,1.000000,0.000000,,,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.0,0.0,1.000000
P313421,1.000000,0.000000,1.000000,0.000000,,,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,,,1.000000,0.000000,0.000000,1.0,0.0,1.000000
P313422,1.000000,0.000000,1.000000,0.000000,,,0.000000,1.000000,1.000000,0.000000,...,,,,,1.000000,0.000000,0.000000,,,1.000000
P313425,1.000000,0.000000,0.571429,0.428571,0.333333,0.666667,0.769231,0.230769,1.000000,0.000000,...,0.250000,0.750000,0.833333,0.166667,0.823529,0.176471,1.000000,1.0,0.0,0.000000


In [27]:
dict_count_all = dict_paleo_count.copy()
dict_count_all.update(dict_sign_count)
dict_count_all.update(dict_form_count)
for lnf in dict_count_all:
    #print(dict_form_count[lnf])
    tm_dist_all[lnf] = tm_dist_all[lnf].fillna(dict_count_all[lnf])
tm_dist_all

Unnamed: 0,BU:..,BU:.p.,DI:..,DI:.d.,LI:..,LI:.d.,NA:..,NA:.t.,NI:..,NI:.d.,...,|ME.U.U.U|:..,|ME.U.U.U|:.m.,ŠA:..,ŠA:.dm.,ša,ša₂,šu,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,šu₂
P224485,1.000000,0.000000,0.666667,0.333333,1.000000,0.000000,0.526316,0.473684,0.977273,0.022727,...,0.315789,0.684211,1.000000,0.000000,0.807692,0.192308,0.108108,0.000000,1.000000,0.891892
P237089,0.907485,0.092515,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,0.724797,0.275203,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,0.000000
P238649,1.000000,0.000000,1.000000,0.000000,0.500000,0.500000,1.000000,0.000000,0.600000,0.400000,...,0.724797,0.275203,1.000000,0.000000,1.000000,0.000000,0.302184,0.867733,0.132267,0.697816
P313416,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,0.500000,0.500000,...,0.000000,1.000000,0.814118,0.185882,0.865585,0.134415,0.302184,0.000000,1.000000,0.697816
P313417,0.907485,0.092515,0.833333,0.166667,1.000000,0.000000,0.357143,0.642857,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,0.000000
P313419,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,0.833333,0.166667,0.800000,0.200000,...,0.724797,0.275203,1.000000,0.000000,0.750000,0.250000,0.000000,1.000000,0.000000,1.000000
P313420,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000
P313421,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,0.814118,0.185882,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000
P313422,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,0.000000,1.000000,1.000000,0.000000,...,0.724797,0.275203,0.814118,0.185882,1.000000,0.000000,0.000000,0.867733,0.132267,1.000000
P313425,1.000000,0.000000,0.571429,0.428571,0.333333,0.666667,0.769231,0.230769,1.000000,0.000000,...,0.250000,0.750000,0.833333,0.166667,0.823529,0.176471,1.000000,1.000000,0.000000,0.000000


The Fun Begins

In [None]:
def calculate_scores(tm_dist,feature_list,flist_str):
    list_scores = []
    c = 0
    for g in feature_list:
        c += 1
        print(str(c) + '. Working on: ' + str(g))
        diff = 0
        decel = 0
        for k in range(1,30):
            d = {}
            km = KMeans(n_clusters=k,max_iter=1000).fit(tm_dist[g])

            sil_score = np.nan
            if k > 1:
                sil_score = silhouette_score(tm_dist[g],labels=km.labels_)

            if k > 2:
                decel = km.inertia_ - inertia  - diff
            if k > 1:
                diff = km.inertia_ - inertia

            inertia = km.inertia_

            d = {'group': str(g),'group_len': len(g), 'k': k,'sil_score': sil_score,'dist': inertia,'diff':diff,'decel':decel}
            list_scores.append(d)

    df_scores = pd.DataFrame(list_scores)
    df_scores.to_csv('output/scores_' + flist_str + '.csv',encoding='utf-8',sep='\t')
    return df_scores

In [None]:
def export_plots(df_scores,feature_list,flist_str):
    c = 0
    for g in feature_list:
        c += 1
        print(str(c) + '. Plotting: ' + str(g))
        df_temp = df_scores[df_scores['group'] == str(g)]
        K = list(df_temp['k'])

        plt.figure(figsize=(12, 12), dpi=120, facecolor='w', edgecolor='k')

        plt.plot(K,df_temp['dist'])
        plt.xticks(K)
        plt.xlabel('k')
        plt.ylabel('Distortion')
        plt.title('Elbow Method: ' + str(g))
        plt.savefig('output/plots/elbow_' + flist_str + '_reg_' + str(c) + '.png')

        plt.clf()

In [None]:
df_scores = calculate_scores(tm_dist_all,list_ortho_word_power,'ortho_word')

In [None]:
export_plots(df_scores,list_ortho_word_power,'ortho_word')

In [None]:
df_scores = calculate_scores(tm_dist_all,list_ortho_sign_power,'ortho_sign')

In [None]:
export_plots(df_scores,list_ortho_sign_power,'ortho_sign')

In [None]:
df_scores = calculate_scores(tm_dist_all,list_paleo_power,'paleo')

In [None]:
export_plots(df_scores,list_paleo_power,'paleo')

Maximum Silhouette Score by Group

In [None]:
df_scores[df_scores['sil_score'].isin(list(df_scores.groupby(['group']).agg({'sil_score':'max'})['sil_score']))].sort_values(by='sil_score',ascending=False)

In [None]:
df_scores[df_scores['decel'].isin(df_scores.groupby('group')['decel'].max())]

Let's apply the evaluations now

In [28]:
cat_file = codecs.open('sargonletters/catalogue.json','r','utf-8')
cat_json = json.load(cat_file)

class_l = []
class_index = []
for pnum in cat_json['members']:
    id_text        = cat_json['members'][pnum].get('id_text','')
    designation    = cat_json['members'][pnum].get('designation','')
    ancient_author = cat_json['members'][pnum].get('ancient_author','')
    dossier        = cat_json['members'][pnum].get('dossier','')
    dossier_nocertain = '.'.join(dossier.split('.')[0:3])
    saa_chap       = '.'.join(dossier.split('.')[0:2])
    senderloc     = cat_json['members'][pnum].get('senderloc','')
    class_d = {'designation': designation,'ancient_author':ancient_author,'dossier':dossier,'dossier_nocertain': dossier_nocertain, 'senderloc':senderloc,'saa_chap':saa_chap}
    class_index.append(id_text)
    class_l.append(class_d)
    
df_class = pd.DataFrame(class_l,index=class_index)
df_class

Unnamed: 0,ancient_author,designation,dossier,dossier_nocertain,saa_chap,senderloc
P224485,Sargon II,SAA 01 001,SAA01.01.01.a,SAA01.01.01,SAA01.01,Royal Court
P237089,Issar-duri,SAA 15 014,SAA15.01.01.a,SAA15.01.01,SAA15.01,Arrapha
P238649,(unknown),SAA 15 368,SAA15.09.07.c,SAA15.09.07,SAA15.09,uncertain
P313416,Sin-ašared,SAA 01 158,SAA01.07.15.c,SAA01.07.15,SAA01.07,Assyria
P313417,Mannu-ki-Aššur-le’i,SAA 01 233,SAA01.13.01.c,SAA01.13.01,SAA01.13,Guzana
P313419,[...]-ka’’in,SAA 05 040,SAA05.02.03.a,SAA05.02.03,SAA05.02,Tušhan
P313420,Il-yada’,SAA 15 164,SAA15.06.01.a,SAA15.06.01,SAA15.06,Dur-Kurigalzu
P313421,Šarru-emuranni,SAA 15 237,SAA15.07.01.a,SAA15.07.01,SAA15.07,Babylon
P313422,Gabbu-ana-Aššur,SAA 05 114,SAA05.07.01.c,SAA05.07.01,SAA05.07,Kurbail
P313425,Bel-liqbi,SAA 01 179,SAA01.08.03.c,SAA01.08.03,SAA01.08,Zobah


By Sender Location

In [29]:
df_senderloc_certain = df_class[(df_class['senderloc'] != 'uncertain')]

list_senderloc_certain = list(df_senderloc_certain.index)

senderloc_tot = len(df_senderloc_certain['senderloc'].unique())
len(list_senderloc_certain)

851

In [30]:
tm_senderloc_certain = tm_dist_all.loc[list(set(list_senderloc_certain) & set(list(tm_dist_all.index)))]
tm_senderloc_certain

Unnamed: 0,BU:..,BU:.p.,DI:..,DI:.d.,LI:..,LI:.d.,NA:..,NA:.t.,NI:..,NI:.d.,...,|ME.U.U.U|:..,|ME.U.U.U|:.m.,ŠA:..,ŠA:.dm.,ša,ša₂,šu,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,šu₂
P334443,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.302184,1.000000,0.000000,0.697816
P313940,1.000000,0.000000,0.877395,0.122605,1.000000,0.000000,1.000000,0.000000,0.500000,0.500000,...,0.724797,0.275203,0.814118,0.185882,1.000000,0.000000,0.000000,0.867733,0.132267,1.000000
P314039,0.907485,0.092515,0.877395,0.122605,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.724797,0.275203,0.814118,0.185882,0.865585,0.134415,0.000000,0.867733,0.132267,1.000000
P313462,1.000000,0.000000,0.000000,1.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.928571,0.071429,0.647059,0.352941,1.000000,0.000000,0.250000,0.000000,1.000000,0.750000
P313480,0.000000,1.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,0.750000,0.250000,...,0.600000,0.400000,0.500000,0.500000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000
P334077,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,0.842105,0.157895,0.384615,1.000000,0.000000,0.615385
P313871,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.814118,0.185882,0.500000,0.500000,0.000000,0.867733,0.132267,1.000000
P334071,1.000000,0.000000,0.877395,0.122605,0.777778,0.222222,0.666667,0.333333,1.000000,0.000000,...,0.724797,0.275203,0.814118,0.185882,1.000000,0.000000,0.666667,0.867733,0.132267,0.333333
P313788,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,0.000000
P313702,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.724797,0.275203,1.000000,0.000000,0.800000,0.200000,0.302184,0.867733,0.132267,0.697816


In [81]:
from random import shuffle

In [85]:
l = [1,2,2,2,3,3,1,2,3]
shuffle(l)
l

[3, 2, 3, 2, 1, 2, 1, 2, 3]

In [69]:
def calculate_purity_score(dict_class,n):
    purity_score = 0
    for s in dict_class:
        cnt = Counter(dict_class[s])
        purity_score += cnt.most_common()[0][1]

    purity_score = purity_score / n
    return purity_score    

In [97]:
def evaluate_clusters(tm_class,feature_list,flist_str,class_type,K):
    list_evals = []
    c = 0
    for g in feature_list:
        c += 1
        #print(str(c) + '. Evaluating: ' + str(g))
        km = KMeans(n_clusters=K,max_iter=1000).fit(tm_class[g])
        labels_random = km.labels_.copy()
        shuffle(labels_random)

        clustered = {}
        clustered_random = {}
        for i in range(len(km.labels_)):
            if km.labels_[i] in clustered:
                clustered[km.labels_[i]].append(df_class.loc[tm_class.index[i]][class_type])
            else:
                clustered[km.labels_[i]] = [df_class.loc[tm_class.index[i]][class_type]]
                
            if labels_random[i] in clustered_random:
                clustered_random[labels_random[i]].append(df_class.loc[tm_class.index[i]][class_type])
            else:
                clustered_random[labels_random[i]] = [df_class.loc[tm_class.index[i]][class_type]]
        
                
        #purity score
        purity_score_class = calculate_purity_score(clustered,len(tm_class.index))
        purity_score_random = calculate_purity_score(clustered_random,len(tm_class.index))
        
        d = {'group':g,
             'purity_class': purity_score_class,
             'purity_random': purity_score_random,
             'group_len': len(g),
             'class_type': class_type,
             'feature_type': flist_str,
             'num_clusters': K}
        list_evals.append(d)

    df_evals = pd.DataFrame(list_evals)
    #Error columns
    df_evals['error_sub'] = df_evals['purity_class'] - df_evals['purity_random']
    df_evals['error_div'] = df_evals['purity_class'] / df_evals['purity_random']
    df_evals.to_csv('output/evaluation_' + class_type + '_' + flist_str + '.csv',encoding='utf-8')
    return df_evals

In [71]:
df_evals = evaluate_clusters(tm_senderloc_certain,list_ortho_word_power,'ortho_word','senderloc',senderloc_tot)
df_evals

Unnamed: 0,group,group_len,purity_class,purity_random
0,"[bēlu[lord]N:bēlī:be-li₂, bēlu[lord]N:bēlī:EN]",2,0.129412,0.041176
1,"[bēlu[lord]N:bēlīya:EN-ia, bēlu[lord]N:bēlīya:...",2,0.128235,0.045882
2,"[bēlu[lord]N:bēlī:be-li₂, bēlu[lord]N:bēlī:EN,...",4,0.168235,0.076471
3,"[lā[not]MOD:lā:la, lā[not]MOD:lā:la-a]",2,0.130588,0.043529
4,"[bēlu[lord]N:bēlī:be-li₂, bēlu[lord]N:bēlī:EN,...",4,0.163529,0.077647
5,"[bēlu[lord]N:bēlīya:EN-ia, bēlu[lord]N:bēlīya:...",4,0.176471,0.090588
6,"[bēlu[lord]N:bēlī:be-li₂, bēlu[lord]N:bēlī:EN,...",6,0.215294,0.114118
7,"[lū[may]MOD:lū:lu, lū[may]MOD:lū:lu-u]",2,0.130588,0.043529
8,"[bēlu[lord]N:bēlī:be-li₂, bēlu[lord]N:bēlī:EN,...",4,0.163529,0.070588
9,"[bēlu[lord]N:bēlīya:EN-ia, bēlu[lord]N:bēlīya:...",4,0.177647,0.087059


By Dossier

In [73]:
df_dossier_certain_a = df_class[df_class['dossier'].str.contains('.a')]
df_dossier_certain_b = df_class[df_class['dossier'].str.contains('.b')]
df_dossier_certain_c = df_class[df_class['dossier'].str.contains('.c')]

list_dossier_certain_a = list(df_dossier_certain_a.index)
list_dossier_certain_b = list(df_dossier_certain_b.index)
list_dossier_certain_c = list(df_dossier_certain_c.index)

dossier_certain_a_tot = len(df_dossier_certain_a['dossier'].unique())
dossier_certain_b_tot = len(df_dossier_certain_b['dossier'].unique())
dossier_certain_c_tot = len(df_dossier_certain_c['dossier'].unique())

print(str(dossier_certain_a_tot) + ' unique dossiers in ' + str(len(list_dossier_certain_a)) + ' texts')
print(str(dossier_certain_b_tot) + ' unique dossiers in ' + str(len(list_dossier_certain_b)) + ' texts')
print(str(dossier_certain_c_tot) + ' unique dossiers in ' + str(len(list_dossier_certain_c)) + ' texts')
print(str(len(df_class['dossier_nocertain'].unique())))

106 unique dossiers in 276 texts
44 unique dossiers in 69 texts
93 unique dossiers in 609 texts
161


In [74]:
tm_dossier = tm_dist_all.loc[list(set(df_class.index) & set(list(tm_dist_all.index)))]
tm_dossier

Unnamed: 0,BU:..,BU:.p.,DI:..,DI:.d.,LI:..,LI:.d.,NA:..,NA:.t.,NI:..,NI:.d.,...,|ME.U.U.U|:..,|ME.U.U.U|:.m.,ŠA:..,ŠA:.dm.,ša,ša₂,šu,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,šu₂
P334443,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.302184,1.000000,0.000000,0.697816
P313940,1.000000,0.000000,0.877395,0.122605,1.000000,0.000000,1.000000,0.000000,0.500000,0.500000,...,0.724797,0.275203,0.814118,0.185882,1.000000,0.000000,0.000000,0.867733,0.132267,1.000000
P314039,0.907485,0.092515,0.877395,0.122605,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.724797,0.275203,0.814118,0.185882,0.865585,0.134415,0.000000,0.867733,0.132267,1.000000
P313462,1.000000,0.000000,0.000000,1.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.928571,0.071429,0.647059,0.352941,1.000000,0.000000,0.250000,0.000000,1.000000,0.750000
P336773,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,0.852654,0.147346,0.946243,0.053757,...,0.724797,0.275203,0.814118,0.185882,0.865585,0.134415,0.302184,0.867733,0.132267,0.697816
P313844,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.852654,0.147346,0.400000,0.600000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.302184,0.867733,0.132267,0.697816
P313480,0.000000,1.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,0.750000,0.250000,...,0.600000,0.400000,0.500000,0.500000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000
P313887,0.907485,0.092515,0.877395,0.122605,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.333333,0.666667,0.814118,0.185882,1.000000,0.000000,1.000000,0.867733,0.132267,0.000000
P334077,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,0.842105,0.157895,0.384615,1.000000,0.000000,0.615385
P313871,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.814118,0.185882,0.500000,0.500000,0.000000,0.867733,0.132267,1.000000


In [75]:
df_evals = evaluate_clusters(tm_dossier,list_ortho_word_power,'ortho_word','dossier',len(df_class['dossier_nocertain'].unique()))

KeyboardInterrupt: 

By Author

In [76]:
df_author_certain = df_class[(df_class['ancient_author'] != '(unknown)')]

list_author_certain = list(df_author_certain.index)

author_list = df_author_certain['ancient_author'].unique()
author_tot = len(author_list)
print(str(author_tot) + ' total authors in ' + str(len(list_author_certain)))

125 total authors in 656


By SAA Chapter

In [77]:
saachap_tot = len(df_class['saa_chap'].unique())
saachap_tot

tm_saachap = tm_dist_all.loc[list(set(df_class.index) & set(list(tm_dist_all.index)))]
tm_saachap

Unnamed: 0,BU:..,BU:.p.,DI:..,DI:.d.,LI:..,LI:.d.,NA:..,NA:.t.,NI:..,NI:.d.,...,|ME.U.U.U|:..,|ME.U.U.U|:.m.,ŠA:..,ŠA:.dm.,ša,ša₂,šu,šulmu[completeness]N:šulmu:DI-mu,šulmu[completeness]N:šulmu:šul-mu,šu₂
P334443,1.000000,0.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.302184,1.000000,0.000000,0.697816
P313940,1.000000,0.000000,0.877395,0.122605,1.000000,0.000000,1.000000,0.000000,0.500000,0.500000,...,0.724797,0.275203,0.814118,0.185882,1.000000,0.000000,0.000000,0.867733,0.132267,1.000000
P314039,0.907485,0.092515,0.877395,0.122605,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.724797,0.275203,0.814118,0.185882,0.865585,0.134415,0.000000,0.867733,0.132267,1.000000
P313462,1.000000,0.000000,0.000000,1.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.928571,0.071429,0.647059,0.352941,1.000000,0.000000,0.250000,0.000000,1.000000,0.750000
P336773,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,0.852654,0.147346,0.946243,0.053757,...,0.724797,0.275203,0.814118,0.185882,0.865585,0.134415,0.302184,0.867733,0.132267,0.697816
P313844,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.852654,0.147346,0.400000,0.600000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.302184,0.867733,0.132267,0.697816
P313480,0.000000,1.000000,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,0.750000,0.250000,...,0.600000,0.400000,0.500000,0.500000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000
P313887,0.907485,0.092515,0.877395,0.122605,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,0.333333,0.666667,0.814118,0.185882,1.000000,0.000000,1.000000,0.867733,0.132267,0.000000
P334077,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,0.842105,0.157895,0.384615,1.000000,0.000000,0.615385
P313871,0.907485,0.092515,1.000000,0.000000,0.777778,0.222222,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.814118,0.185882,0.500000,0.500000,0.000000,0.867733,0.132267,1.000000


In [None]:
df_evals = evaluate_clusters(tm_saachap,list_ortho_word_power,'ortho_word','saa_chap',saachap_tot)

In [98]:
#All Together
pd.options.display.max_colwidth = 200

df_evals_sender_ortho_word = evaluate_clusters(tm_senderloc_certain,list_ortho_word_power,'ortho_word','senderloc',senderloc_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_dossier_ortho_word = evaluate_clusters(tm_dossier,list_ortho_word_power,'ortho_word','dossier',len(df_class['dossier_nocertain'].unique()))
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_saachap_ortho_word = evaluate_clusters(tm_saachap,list_ortho_word_power,'ortho_word','saa_chap',saachap_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

In [99]:
df_evals_sender_ortho_sign = evaluate_clusters(tm_senderloc_certain,list_ortho_sign_power,'ortho_sign','senderloc',senderloc_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_dossier_ortho_sign = evaluate_clusters(tm_dossier,list_ortho_sign_power,'ortho_sign','dossier',len(df_class['dossier_nocertain'].unique()))
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_saachap_ortho_sign = evaluate_clusters(tm_saachap,list_ortho_sign_power,'ortho_sign','saa_chap',saachap_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

In [100]:
df_evals_sender_paleo = evaluate_clusters(tm_senderloc_certain,list_paleo_power,'paleo','senderloc',senderloc_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_dossier_paleo = evaluate_clusters(tm_dossier,list_paleo_power,'paleo','dossier',len(df_class['dossier_nocertain'].unique()))
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

df_evals_saachap_paleo = evaluate_clusters(tm_saachap,list_paleo_power,'paleo','saa_chap',saachap_tot)
#df_eval_maxrow = df_evals[df_evals['purity'] == df_evals['purity'].max()]

In [101]:
df_evals_all = pd.concat([df_evals_sender_ortho_word,
                          df_evals_sender_ortho_sign,
                         df_evals_sender_paleo,
                          df_evals_dossier_ortho_word,
                          df_evals_dossier_ortho_sign,
                          df_evals_dossier_paleo,
                         df_evals_saachap_ortho_word,
                         df_evals_saachap_ortho_sign,
                         df_evals_saachap_paleo])
df_evals_all.to_csv('output/evaluation_all.csv',encoding='utf-8')

---Testing plene writings in general---

In [None]:
import re
df_plene = df_words[(df_words['form'].str.contains(r'([aeiu])[₁₂₃₄₅₆₇₈₉₀]?\-\1[₁₂₃₄₅₆₇₈₉₀]?($|\-)')) | (df_words['form'].str.contains(r'(^|\-)([aeiu])[₁₂₃₄₅₆₇₈₉₀]?\-\2[₁₂₃₄₅₆₇₈₉₀]?'))]
plene_list = list(df_plene['lemma_norm_form'].unique())
df_plene_count = pd.DataFrame(df_plene.groupby('lemma_norm_form')['form'].agg('count'))
df_plene_count = df_plene_count.sort_values(by=('form'),ascending=False)

list_plene_50 = list(df_plene_count[df_plene_count['form'] > 50].index)
list_plene_50
df_plene_50 = df_words[df_words['lemma_norm_form'].isin(list_plene_50)]

list_nonplene_50 = ['mā[saying]PRP:mā:ma',
 'kī[like]PRP:kî:ki',
 'ūma[today]AV:ūmâ:u₂-ma',
 'lū[may]MOD:lū:lu',
 'lā[not]MOD:lā:la',
 'atā[why?]QP:atâ:a-ta',
 'ṭēmu[(fore)thought]N:ṭēmu:ṭe₃-mu',
 #'šū[he]IP:šû:šu',
 'mīnu[what?]QP:mīnu:mi-nu']
df_nonplene_50 = df_words[df_words['lemma_norm_form'].isin(list_nonplene_50)]

list_plene_extra = ['mīnu[what?]QP:mīnu:mi₃-i-nu',
 'mīnu[what?]QP:mīnu:mi₃-nu',
 'mīnu[what?]QP:mīni:mi-i-ni',
 'mīnu[what?]QP:mīni:mi-ni',
 'šū[he]IP:šû:šu₂-u',
 'šū[he]IP:šû:šu₂-u₂']

list_plene_all = list_plene_50 + list_nonplene_50 + list_plene_extra
list_plene_all

df_plene_all = df_words[df_words['lemma_norm_form'].isin(list_plene_all)]
df_plene_all

In [None]:
df_plene_all['ancient_author'] = df_plene_all['text_id'].apply(lambda x: df_class.loc[x]['ancient_author'])
df_plene_all
df_plene_wordstr = pd.DataFrame(df_plene_all.groupby(['ancient_author']).apply(lambda x: ' '.join(x['lemma_norm_form'])))
df_plene_wordstr.columns = ['plene_wordstr']
df_plene_wordstr

cv = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
ft = cv.fit_transform(list(df_plene_wordstr['plene_wordstr']))
tm_plene_author = pd.DataFrame(ft.toarray(),columns=cv.get_feature_names(),index=df_plene_wordstr.index)
tm_plene_author.to_csv('output/tm_plene_author.csv',encoding='utf-8')
tm_plene_author

---Test Other---

In [None]:
tm_counts_class = pd.concat([tm_counts[list_features_all],df_class],axis=1)
tm_counts_class.to_csv('output/tm_counts.csv',encoding='utf-8')
tm_counts_class

In [None]:
tm_counts_class_senderloc = pd.DataFrame(tm_counts_class.groupby('senderloc').agg('sum'))[list_features_all]
tm_counts_class_senderloc.to_csv('output/tm_counts_senderloc.csv',encoding='utf-8')
tm_counts_class_senderloc

In [None]:
tm_counts_class_dossier = pd.DataFrame(tm_counts_class.groupby('dossier_nocertain').agg('sum'))[list_features_all]
tm_counts_class_dossier.to_csv('output/tm_counts_dossier.csv',encoding='utf-8')
tm_counts_class_dossier

In [None]:
tm_counts_class_saachap = pd.DataFrame(tm_counts_class.groupby('saa_chap').agg('sum'))[list_features_all]
tm_counts_class_saachap.to_csv('output/tm_counts_saachap.csv',encoding='utf-8')
tm_counts_class_saachap

In [None]:
pd.concat([tm_ortho_word[list_ortho_word],df_class],axis=1).to_csv('test.csv',encoding='utf-8')

In [None]:
p_maa0 = tm_ortho_word[(tm_ortho_word['mā[saying]PRP:mā:ma-a'] > 0) & (tm_ortho_word['mā[saying]PRP:mā:ma'] == 0)].index
p_la0 = tm_ortho_word[(tm_ortho_word['lā[not]MOD:lā:la'] > 0) & (tm_ortho_word['lā[not]MOD:lā:la-a'] == 0)].index
p_lu0 = tm_ortho_word[(tm_ortho_word['lū[may]MOD:lū:lu'] > 0) & (tm_ortho_word['lū[may]MOD:lū:lu-u'] == 0)].index

p_maa1 = tm_ortho_word[(tm_ortho_word['mā[saying]PRP:mā:ma-a'] > 0) & (tm_ortho_word['mā[saying]PRP:mā:ma'] <= 1)].index
p_la1 = tm_ortho_word[(tm_ortho_word['lā[not]MOD:lā:la'] > 0) & (tm_ortho_word['lā[not]MOD:lā:la-a'] <= 1)].index
p_lu1 = tm_ortho_word[(tm_ortho_word['lū[may]MOD:lū:lu'] > 0) & (tm_ortho_word['lū[may]MOD:lū:lu'] <= 1)].index

p_ma = tm_ortho_word[(tm_ortho_word['mā[saying]PRP:mā:ma'] > 0)].index
p_laa = tm_ortho_word[(tm_ortho_word['lā[not]MOD:lā:la-a'] > 0)].index
p_luu = tm_ortho_word[(tm_ortho_word['lū[may]MOD:lū:lu-u'] > 0)].index

p_maa0_la0 = list(set(p_maa0) & set(p_la0))
p_maa0_lu0 = list(set(p_maa0) & set(p_lu0))
p_la0_lu0 = list(set(p_la0) & set(p_lu0))

p_maa1_la1 = list(set(p_maa1) & set(p_la1))
p_maa1_lu1 = list(set(p_maa1) & set(p_lu1))
p_la1_lu1 = list(set(p_la1) & set(p_lu1))

p_ma_laa = list(set(p_ma) & set(p_laa))
p_ma_luu = list(set(p_ma) & set(p_luu))
p_laa_luu = list(set(p_laa) & set(p_luu))

print(str(len(p_maa0_la0) / len(p_la0)))
print(str(len(p_maa0_la0) / len(p_maa0)))
print(str(len(p_maa0_lu0) / len(p_lu0)))
print(str(len(p_maa0_lu0) / len(p_maa0)))
print(str(len(p_la0_lu0) / len(p_lu0)))
print(str(len(p_la0_lu0) / len(p_la0)))

print(str(len(p_maa1_la1) / len(p_la1)))
print(str(len(p_maa1_la1) / len(p_maa1)))
print(str(len(p_maa1_lu1) / len(p_lu1)))
print(str(len(p_maa1_lu1) / len(p_maa1)))
print(str(len(p_la1_lu1) / len(p_lu1)))
print(str(len(p_la1_lu1) / len(p_la1)))

print(str(len(p_ma_laa) / len(p_laa)))
print(str(len(p_ma_laa) / len(p_ma)))
print(str(len(p_ma_luu) / len(p_luu)))
print(str(len(p_ma_luu) / len(p_ma)))
print(str(len(p_laa_luu) / len(p_luu)))
print(str(len(p_laa_luu) / len(p_laa)))