In [None]:
import json
import codecs
import os
import pandas as pd

<h2>0. Gathering the Data</h2>
<p>In this preliminary section, we will gather all of the cuneiform sign transliterations from the JSON files in our dataset. Then we will consolidate them into a data frame and match each sign value with its sign name</p>

(Do we need to discuss the basics of cuneiform transliteration or is it assumed that our audience is familiar with it?)

<h3>0.1: OGSL</h3>
<p>Now, we will load a map from sign value to sign name to use on the signs in our texts. The OGSL is... (website...)</p>

In [None]:
# Load the OGSL sign list and keep its 'index' entry, which is used further down
# to look up the sign name of a (lower-cased) reading.
# The context manager closes the file handle as soon as the JSON has been parsed.
with codecs.open('ogsl-sl.json','r','utf-8') as file_ogsl:
    ogsl = json.load(file_ogsl)
sign_index = ogsl['index']
sign_index

<h3>0.2: Collect the Text Signs</h3>
<p>The following code parses the JSON files of the ORACC texts and collects each sign transliteration. Since different signs have different types of reading, they are rendered differently in the JSON file and we must take care to recognize each sign reading type in the JSON file</p>
The types of signs and their representation in the JSON Files:
<ol>
    <li>Syllable - The reading of a sign as a syllable is rendered with a 'v' key</li>
    <li>Logogram - The reading of a sign as a logogram, i.e. one that represents a word in itself or as part of a complex of signs that represents a single word, is written in capital letters and with an 's' key</li>
    <li>Numerical - A sign representing a number (or personal name determinative) has an extra key called 'sexified'. This gives information on the number sign's wedge structure.</li>
</ol>

In addition, a modified sign can be any of the three types above, but written with a nonstandard paleography (e.g. a diagonal wedge is incised in the clay instead of a horizontal). These are the signs we want to examine. They have extra data given under the 'mods' key.

In [None]:
def process_signs(sign_data):
    """Flatten one sign entry of a text's JSON into a plain dict.

    The reading is stored under 'b'. When several reading keys are present the
    priority is 'n' (number) over 's' over 'v'; a number uses its 'sexified'
    value, falling back to 'form' and finally to the literal 'noform?'.
    Every key/value pair of every entry in 'mods' is copied to the top level,
    'break' is copied when present, and the sign id is stored under
    'sign_loc_id' ('no-id' when the entry has none).
    """
    sign_info = {}

    # Choose the reading; the branches are ordered by priority.
    if 'n' in sign_data:
        fallback = sign_data.get('form', 'noform?')
        sign_info['b'] = sign_data.get('sexified', fallback)
    elif 's' in sign_data:
        sign_info['b'] = sign_data['s']
    elif 'v' in sign_data:
        sign_info['b'] = sign_data['v']

    # Each modification is a small mapping; merge them all into the record.
    if 'mods' in sign_data:
        for modification in sign_data['mods']:
            sign_info.update({key: modification[key] for key in modification})

    if 'break' in sign_data:
        sign_info['break'] = sign_data['break']

    sign_info['sign_loc_id'] = sign_data.get('id', 'no-id')
    return sign_info

In [None]:
types = set()
all_signs = []


def _collect_word_signs(word_node, fname, line_label):
    """Append one record per sign of a word ('l') node to ``all_signs``.

    Every record is the output of process_signs() extended with the file name,
    the current line label and the word form the sign belongs to.
    """
    form = word_node['f']['form']
    for sign_data in word_node['f']['gdl']:
        # A semantic determinative keeps its own signs under 'seq';
        # every other entry is handled as a single sign.
        if sign_data.get('det','') == 'semantic':
            readings = sign_data['seq']
        else:
            readings = [sign_data]
        for reading in readings:
            sign_info = process_signs(reading)
            sign_info.update({'file':fname,'line_label':line_label,'form': form})
            all_signs.append(sign_info)


# sorted() makes the row order of the resulting data independent of the
# (arbitrary) order in which the operating system lists the directory.
for fname in sorted(os.listdir('corpusjson')):
    # Skip anything that is not a corpus JSON file (e.g. hidden system files).
    if not fname.endswith('.json'):
        continue
    # The context manager closes each file as soon as it has been parsed.
    with codecs.open(os.path.join('corpusjson', fname),'r','utf-8') as f:
        j = json.load(f)
    for a in j['cdl'][0]['cdl']:
        if a.get('type','') != 'discourse':
            continue
        for b in a['cdl']:
            if b.get('type','') != 'sentence':
                continue
            line_label = ''
            for c in b['cdl']:
                node = c.get('node','')
                if node == 'd': #This is the label for the line e.g. "o ii 3"
                    line_label = c.get('label','nolabel')
                elif node == 'l':
                    _collect_word_signs(c, fname, line_label)
                elif node == 'c':
                    # 'c' nodes are searched one level down for word nodes.
                    for d in c['cdl']:
                        if d.get('node','') == 'l':
                            _collect_word_signs(d, fname, line_label)

# Show only a sample: the full list holds one dict per sign in the corpus.
all_signs[:5]

Now, we form our Data Frame where each row contains information on every sign in the corpus. Further limitations on which signs are significant to our purposes will be made later, but for now we will eliminate all of the signs which are labelled as "missing," (i.e. reconstructed) because any information based on their paleography or orthography cannot be ascertained.

In [None]:
# One row per sign; keys that a sign does not have become empty strings.
df = pd.DataFrame(all_signs).fillna('')
# Drop reconstructed ("missing") signs. .copy() detaches the result from the
# unfiltered frame so that the columns added to df in later cells are written
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df['break'] != 'missing'].copy()
df

<h3>1. Setting Up the Data for Clustering</h3>
<p>The general goal is to assign a vector to each text that reflects the usage of variant orthography and paleography.</p>
<ol>
    <li>Paleography - Any one set of wedges that we classify as a sign can be impressed on the clay in different ways. For example, a wedge can be missing or one can be added. Also, the tilt of a wedge can vary. These are the features we want to examine in order to see if one text prefers one sign writing or another.</li>
    <li>Orthography - Due to the homophony of the cuneiform writing system, one syllable can be written with many signs. For example, 'li' can be written with the LI-sign but also with the NI-sign, in which case it would be transliterated as li<sub>2</sub></li>
</ol>
<p>Other variables can be applied to a text as attributes in its vector. (What are these? We talked about things like provenance, city information, scribe information. Also, if we apply different types of variables, how can we use a clustering algorithm to treat these vector components as a different entity?).</p>
<p>This section therefore contains two subsections. One groups the diagnostic signs, with or without modifications, per text. The other discovers the homophonous signs used throughout the corpus and groups their different usages per text.</p>

First of all, let's create more columns in the data frame to aid us
<ol>
<li>mods_str - Since the data currently contains three columns with information on variable paleography, it would help us to consolidate them into one column</li>
<li>str_part and num_part - In order to determine which signs share a syllabic value, it will be useful to separate the transliterated readings into their string components and numerical components. Once we do this, we can group rows with the same str_part and count up the different usages of homophonous signs</li>
</ol>

In [None]:
# Sign name of each reading, looked up case-insensitively in the OGSL index;
# readings that are not in the index get '?'.
df['sign_form'] = df['b'].map(lambda reading: sign_index.get(reading.lower(),'?'))
# The three modification columns joined into one dot-separated string.
df['mods_str'] = ['.'.join(mods) for mods in zip(df['a'], df['f'], df['m'])]

import re
def get_num_part(s):
    """Return the index number of a transliterated reading.

    The first run of Unicode subscript digits in ``s`` is returned as a string
    of ASCII digits (e.g. 'li₂' -> '2'). When ``s`` has no subscript digits,
    or is not a string at all, the integer 1 is returned.

    Note the mixed return type (str for an explicit index, int 1 for the
    default); it is kept as is because callers only compare the values.
    """
    try:
        subscript_runs = re.findall(r'[₀₁₂₃₄₅₆₇₈₉]+',s)
    except TypeError:
        # s is not a string (e.g. None / NaN): treat it as having no index.
        return 1
    if not subscript_runs:
        return 1
    return subscript_runs[0].translate(str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789'))
def get_str_part(s):
    """Return the reading without its subscript index.

    The result is the first run of characters in ``s`` made up of letters used
    in the transliteration (ASCII letters plus š, ṣ, ṭ, ḫ, ʾ), ASCII digits,
    spaces and parentheses, e.g. 'li₂' -> 'li'. When no such run exists, or
    ``s`` is not a string, ``s`` is returned unchanged.
    """
    try:
        # ḫ/Ḫ are part of the character class: without them a reading such as
        # 'ḫa' would be cut down to 'a' and be counted as a variant of 'a'.
        # NOTE(review): assumes the corpus spells this consonant with the
        # precomposed character ḫ - confirm against the JSON data.
        runs = re.findall(r'[a-zA-ZšŠṣṢṭṬḫḪʾ \(\)0-9]+',s)
    except TypeError:
        # s is not a string (e.g. None / NaN): hand it back untouched.
        return s
    return runs[0] if runs else s
        
# Split every reading into its base (str_part) and its index number (num_part).
df['str_part'] = df['b'].map(get_str_part)
df['num_part'] = df['b'].map(get_num_part)
df

<h3>1.1: Collection of Modified Signs</h3>
<p>The Data Frame we have contains the entire collection of signs in the corpus. However, not every sign has variants in paleography (at least according to Parpola's data input). We only want to look at the signs which have these variants, which we will term diagnostic. In the data, they are the signs that include any type of modification</p>

In [None]:
# A row is "modified" when at least one of the three modification columns is filled in.
has_modification = (df[['f', 'a', 'm']] != '').any(axis=1)
df2 = df[has_modification]
# Alphabetical list of the sign names that occur with a modification at least once.
mod_signs = sorted(df2['sign_form'].unique())
mod_signs

We now limit our Data Frame to include ONLY these diagnostic signs.

In [None]:
# Keep the rows of diagnostic signs only, and among those drop the damaged ones.
is_diagnostic = df['sign_form'].isin(mod_signs)
is_undamaged = df['break'] != 'damaged'
df_modsigns = df[is_diagnostic & is_undamaged]
df_modsigns

<h3>1.2: Collection of Homophonous Signs</h3>
<p>We now limit the original data frame in a different way, based on orthography. First we need to figure out which syllabic readings have multiple signs that can render them.</p>

In [None]:
# Number of distinct index numbers seen for every base reading.
df2 = df.groupby('str_part')['num_part'].nunique().to_frame()
# A base reading written with more than one index has homophonous signs.
ortho_list = df2.index[df2['num_part'] > 1].tolist()
ortho_list

We need to eliminate capital letter entries because indices on logograms indicate different words and are not relevant here.

In [None]:
# Discard every entry that contains an ASCII capital letter.
ortho_list = [syllable for syllable in ortho_list if re.search(r'[A-Z]', syllable) is None]
ortho_list

Limit the dataframe to only these signs

In [None]:
# Rows whose base reading has homophonous variants. .copy() makes this an
# independent frame: a column is added to it further down, and writing to a
# plain slice of df would trigger pandas' SettingWithCopyWarning.
df_ortho_signs = df[df['str_part'].isin(ortho_list)].copy()
df_ortho_signs

<h3>2. Mixed vs. Complementary Distribution</h3>
<p>One of the goals of this project is to determine a preference for sign usage in one subgroup of the corpus versus another. To that end there is one more factor that needs to be discussed, namely the usage of these paleographic or orthographic variants within context. If the usage of these variants is context-dependent, meaning that one form or syllable is used in one context and another form or syllable in another context, it does not tell us much about the preferential usage of the signs. This is known as a complementary distribution. For example, if a scribe uses <i>li<sub>2</sub></i> only in the form of the word be-li<sub>2</sub> and the <i>li</i> sign in all other contexts, the choice of sign usage is not determined by the scribe's preference but rather by scribal convention. This convention would thus be utilized by every scribe of this corpus and would not help us to detect subgroups among these texts where scribes differ.</p>
<p>On the other hand, if sign form or syllable variants appear within the same contexts, it gives us the information we want on scribal writing preference or tendencies. For example, <i>ia</i> and <i>ia<sub>2</sub></i> both appear in forms of the word bēliya, meaning that a scribe had an option of orthography and incised one or the other. (NTS: I'm avoiding the term "choose" here because it is a very loaded term with implications that may be misleading here). The question then becomes whether certain texts group together based on their tendencies to use one variant within a mixed distribution versus another variant.</p>
<p>(paragraph about this dichotomy on the paleographic side of things. Mention TA vs. TA@v)</p>
<p>(closing paragraph summarizing the issue)</p>

<b>2.1 Paleographic Variant Distribution</b>

In [None]:
# Number of occurrences of every (sign, word form, modification) combination.
df_mods_agg = df_modsigns.groupby(['sign_form','form','mods_str'])['a'].agg('count').reset_index()
df_mods_agg.columns = ['sign_form','form','mods_str','count']
#first let's remove where total instances are less than a certain arbitrary value, say 5
# (.copy() so that the column added below is not written into a slice)
df_mods_agg = df_mods_agg[df_mods_agg['count'] >= 5].copy()
#NOW find and only keep the rows where sign_form and form are duplicates,
# i.e. the same sign occurs in the same word form with more than one modification string
df_mods_agg['is_dup'] = df_mods_agg.duplicated(['sign_form','form'], keep=False)
df_mods_agg = df_mods_agg[df_mods_agg['is_dup']]
df_mods_agg

In [None]:
# Distinct (sign, modification) pairs that survived the filter, plus a
# 'SIGN:mods' key that identifies each pair with a single string.
df_select_signmods = (
    df_mods_agg.loc[:, ['sign_form','mods_str']]
    .drop_duplicates()
    .assign(combined=lambda pairs: pairs['sign_form'] + ':' + pairs['mods_str'])
)
df_select_signmods

In [None]:
# Names of the signs that are left after the filter, in order of first appearance.
select_signs = df_mods_agg['sign_form'].unique().tolist()
select_signs

In [None]:
# Per text: how often each selected sign occurs with each modification string.
is_selected = df_modsigns['sign_form'].isin(select_signs)
df_file_select_signs = (
    df_modsigns[is_selected]
    .groupby(['file','sign_form','mods_str'])['a']
    .count()
    .reset_index()
)
# Same 'SIGN:mods' key as in df_select_signmods.
df_file_select_signs['combined'] = df_file_select_signs['sign_form'] + ':' + df_file_select_signs['mods_str']
df_file_select_signs

In [None]:
# Build one vector per text: for every selected 'SIGN:mods' variant, the share
# of that variant among all variants of the same sign in the text. Counts are
# add-one smoothed, so a variant that is absent from a text counts as 1.

# (file, 'SIGN:mods') -> observed count. Looking counts up here replaces a scan
# of the whole frame for every file/variant pair and the int(Series) conversion
# that relied on a TypeError to detect "no occurrences".
observed_counts = {
    (text_file, combined): int(count)
    for text_file, combined, count in zip(
        df_file_select_signs['file'],
        df_file_select_signs['combined'],
        df_file_select_signs['a'],
    )
}
# (sign, 'SIGN:mods') pairs in column order of the final matrix.
variant_keys = list(zip(df_select_signmods['sign_form'], df_select_signmods['combined']))

d_file_select_signs = {}
file_names = df_modsigns['file'].unique()
for f in file_names:
    d = {}  # 'SIGN:mods' -> smoothed count in this text
    e = {}  # sign -> sum of the smoothed counts of all its variants
    for sign_form, combined in variant_keys:
        n = observed_counts.get((f, combined), 0) + 1
        d[combined] = n
        e[sign_form] = e.get(sign_form, 0) + n

    d_file_select_signs[f] = [d[combined] / e[sign_form] for sign_form, combined in variant_keys]
df_file_select_signs_c = pd.DataFrame(d_file_select_signs).transpose()
df_file_select_signs_c.columns = list(df_select_signmods['combined'])
df_file_select_signs_c

<b>2.1.2. Clustering on Paleography Alone</b>

In [None]:
from sklearn.cluster import KMeans

# random_state makes the cluster assignment reproducible from run to run;
# n_init is spelled out because its default differs between scikit-learn releases.
km1 = KMeans(n_clusters=14, max_iter=1000, n_init=10, random_state=0).fit(df_file_select_signs_c)

# cluster label -> list of the text files in that cluster. File names are taken
# from the index of the matrix that was clustered, so every label is paired
# with its own row regardless of what `file_names` currently holds.
labels_paleo = {}
for text_file, cluster in zip(df_file_select_signs_c.index, km1.labels_):
    labels_paleo.setdefault(cluster, []).append(text_file)

# Test cases: pairs of letters whose cluster assignments are compared below.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
np = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return, as a string, the first key of ``labels`` whose list contains ``pnum``.

    ``labels`` maps a cluster label to the list of file names in that cluster.
    None is returned when ``pnum`` is in none of the lists.
    """
    hits = (str(cluster) for cluster, members in labels.items() if pnum in members)
    return next(hits, None)
        
# For each test pair, print the cluster of its first and of its second letter.
for heading, pair in (
    ('Sennacherib clusters are: ', sar),
    ('Nabu-pašir clusters are: ', np),
    ('Nabu-deʾiq clusters are: ', nd),
):
    print(heading, find_cluster(pair[0],labels_paleo), ' and ', find_cluster(pair[1],labels_paleo))

<b>2.2. Orthographic Variant Distribution</b>

In [None]:
# Word form with every subscript digit removed, so that spellings which differ
# only in a sign index end up with the same value.
subscript_digit = re.compile(r'[₁₂₃₄₅₆₇₈₉₀]')
df_ortho_signs['form_str_part'] = df_ortho_signs['form'].map(lambda form: subscript_digit.sub('', form))
df_ortho_signs

In [None]:
# Number of occurrences of every (base reading, index-free word form, reading) combination.
df_syls_agg = df_ortho_signs.groupby(['str_part','form_str_part','b'])['a'].agg('count').reset_index()
df_syls_agg.columns = ['str_part','form_str_part','b','count']
#first let's remove where total instances are less than a certain arbitrary value, say 5
# (.copy() so that the column added below is not written into a slice)
df_syls_agg = df_syls_agg[df_syls_agg['count'] >= 5].copy()
#NOW find and only keep the rows where str_part and form_str_part are duplicates,
# i.e. the same word form is written with more than one variant of the same syllable
df_syls_agg['is_dup'] = df_syls_agg.duplicated(['str_part','form_str_part'], keep=False)
df_syls_agg = df_syls_agg[df_syls_agg['is_dup']]
df_syls_agg

In [None]:
# Distinct (base reading, reading) pairs that survived the filter. No combined
# key column is needed here: the reading 'b' already identifies the variant.
df_select_bs = df_syls_agg.loc[:, ['str_part','b']].drop_duplicates()
df_select_bs

In [None]:
# Base readings that are left after the filter, in order of first appearance.
select_syls = df_syls_agg['str_part'].unique().tolist()
select_syls

In [None]:
# Per text: how often each reading of the selected syllables occurs.
# The reading 'b' doubles as the variant key, so no combined column is built.
is_selected_syllable = df_ortho_signs['str_part'].isin(select_syls)
df_file_select_bs = (
    df_ortho_signs[is_selected_syllable]
    .groupby(['file','str_part','b'])['a']
    .count()
    .reset_index()
)
df_file_select_bs

In [None]:
# Build one vector per text: for every selected reading, its share among all
# selected readings of the same syllable in the text. Counts are add-one
# smoothed, so a reading that is absent from a text counts as 1.

# (file, reading) -> observed count. Looking counts up here replaces a scan of
# the whole frame for every file/reading pair and the int(Series) conversion
# that relied on a TypeError to detect "no occurrences".
observed_counts = {
    (text_file, reading): int(count)
    for text_file, reading, count in zip(
        df_file_select_bs['file'],
        df_file_select_bs['b'],
        df_file_select_bs['a'],
    )
}
# (syllable, reading) pairs in column order of the final matrix.
variant_keys = list(zip(df_select_bs['str_part'], df_select_bs['b']))

d_file_select_syls = {}
file_names = df_ortho_signs['file'].unique()
for f in file_names:
    d = {}  # reading -> smoothed count in this text
    e = {}  # syllable -> sum of the smoothed counts of all its readings
    for str_part, reading in variant_keys:
        n = observed_counts.get((f, reading), 0) + 1
        d[reading] = n
        e[str_part] = e.get(str_part, 0) + n

    d_file_select_syls[f] = [d[reading] / e[str_part] for str_part, reading in variant_keys]
df_file_select_syls_c = pd.DataFrame(d_file_select_syls).transpose()
df_file_select_syls_c.columns = list(df_select_bs['b'])
df_file_select_syls_c

<b>2.2.2 Cluster using K-Means</b>

In [None]:
from sklearn.cluster import KMeans

# random_state makes the cluster assignment reproducible from run to run;
# n_init is spelled out because its default differs between scikit-learn releases.
km1 = KMeans(n_clusters=7, max_iter=1000, n_init=10, random_state=0).fit(df_file_select_syls_c)

# cluster label -> list of the text files in that cluster. File names are taken
# from the index of the matrix that was clustered, so every label is paired
# with its own row regardless of what `file_names` currently holds.
labels_ortho = {}
for text_file, cluster in zip(df_file_select_syls_c.index, km1.labels_):
    labels_ortho.setdefault(cluster, []).append(text_file)

#Let's examine some test cases. We'll select three pairs of texts, which we would expect to cluster always in the same way.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
np = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return, as a string, the first key of ``labels`` whose list contains ``pnum``.

    ``labels`` maps a cluster label to the list of file names in that cluster.
    None is returned when ``pnum`` is in none of the lists.
    """
    hits = (str(cluster) for cluster, members in labels.items() if pnum in members)
    return next(hits, None)

# For each test pair, print the cluster of its first and of its second letter.
for heading, pair in (
    ('Sennacherib clusters are: ', sar),
    ('Nabu-pašir clusters are: ', np),
    ('Nabu-deʾiq clusters are: ', nd),
):
    print(heading, find_cluster(pair[0],labels_ortho), ' and ', find_cluster(pair[1],labels_ortho))

Combine Orthography and Paleography

In [None]:
# Put the orthographic and the paleographic vectors of each text side by side.
# join='inner' keeps only texts that have a row in both matrices; the default
# outer join would fill the missing half of a text's vector with NaN, which
# KMeans cannot be fitted on.
tm_all = pd.concat([df_file_select_syls_c,df_file_select_signs_c],axis=1,join='inner')
tm_all

In [None]:
from sklearn.cluster import KMeans

# random_state makes the cluster assignment reproducible from run to run;
# n_init is spelled out because its default differs between scikit-learn releases.
km1 = KMeans(n_clusters=14, max_iter=1000, n_init=10, random_state=0).fit(tm_all)

# cluster label -> list of the text files in that cluster. File names are taken
# from tm_all's own index: `file_names` was built from df_ortho_signs and is
# not guaranteed to list the same texts in the same order as the rows of tm_all.
labels_all = {}
for text_file, cluster in zip(tm_all.index, km1.labels_):
    labels_all.setdefault(cluster, []).append(text_file)

# Test cases: pairs of letters whose cluster assignments are compared below.
#Sennacherib the Prince
sar = ['P334141.json','P334390.json']
#Nabu-pašir, governor of Harran
np = ['P334807.json','P334080.json']
#Nabu-deʾiq
nd = ['P334568.json','P334792.json']

def find_cluster(pnum,labels):
    """Return, as a string, the first key of ``labels`` whose list contains ``pnum``.

    ``labels`` maps a cluster label to the list of file names in that cluster.
    None is returned when ``pnum`` is in none of the lists.
    """
    hits = (str(cluster) for cluster, members in labels.items() if pnum in members)
    return next(hits, None)

# For each test pair, print the cluster of its first and of its second letter.
for heading, pair in (
    ('Sennacherib clusters are: ', sar),
    ('Nabu-pašir clusters are: ', np),
    ('Nabu-deʾiq clusters are: ', nd),
):
    print(heading, find_cluster(pair[0],labels_all), ' and ', find_cluster(pair[1],labels_all))

<p>So it appears that letters from the same place do not group in the same clusters according to paleographic and orthographic preferences in the letters. Why should this be? Here are some options</p>
<ol>
    <li>Scribal usage of different paleographies and orthographies is not based on a certain preference, either consciously or unconsciously. In other words, for any given scribe, free variation reigns supreme (expand on this)</li>
    <li>Paleographic and Orthographic variation CAN indicate scribal tendencies, BUT computational methods are insufficient to determine this because machine learning algorithms require large amounts of data and the letters simply do not provide enough data</li>
    <li>There is a problem with my methodology. Maybe I set up the text vectors incorrectly. Maybe I should include more orthographies/paleographies or perhaps less. Maybe the number of clusters selected is wrong.</li>
</ol>

<p>Something else to keep in mind here is that while I limited the number of signs to be considered in the text vectors, I did not restrict any text from being in the corpus. Perhaps I should do that. Maybe certain texts are simply too short to make any determinations on their grouping among the other texts.</p>

Visualize with MDS

In [None]:
from sklearn.manifold import MDS

texts_2d_map = {}
# Row labels of the combined matrix; used to annotate the points of the plot.
texts = tm_all.index

# Project the text vectors onto two dimensions. MDS starts from a random
# configuration, so random_state is fixed to get the same layout on every run.
mds1 = MDS(n_components = 2, random_state=0)
texts_2d = mds1.fit_transform(tm_all)

In [None]:
# One colour per cluster label. km1 is fitted with 14 clusters in the combined
# clustering cell, so a 7-entry palette raises IndexError for labels 7-13; this
# palette has 16 entries. 'white' is replaced by 'gray' because white markers
# cannot be seen on the white figure background.
color_list = ['gray','yellow','green','red','blue','brown','black','orange',
              'purple','cyan','magenta','olive','pink','teal','navy','lime']
# The modulo only matters if the number of clusters is ever raised above the
# palette size: colours then repeat instead of the cell failing.
colors_all = [color_list[label % len(color_list)] for label in km1.labels_]
colors_all

In [None]:
import matplotlib.pyplot as plt
plt.figure(num=None, figsize=(16, 16), dpi=80, facecolor='w', edgecolor='k')

# Split the 2-D coordinates into one list of x values and one of y values.
x_values = []
y_values = []
for point in texts_2d:
    x_values.append(point[0])
    y_values.append(point[1])

# One marker per text, coloured by cluster and labelled with the text's name.
plt.scatter(x_values,y_values,c=colors_all)
for position, (x, y) in enumerate(zip(x_values, y_values)):
    plt.annotate(texts[position],(x, y))
plt.show()

-----------------------------

THE FOLLOWING CONTAINS OLDER CODE from different attempts at forming text vectors. But I'm keeping it for now in case I need it.

4 Syls Material

In [None]:
Now let's try to apply a quantitative method to figure out the level of mixed distribution which orthographic variants bear within word forms. The steps here are:
<ol>
<li>Create a dictionary that hashes each syllable to a list of its orthographic variants</li>
<li>Count up the number of instances of one orthographic variant being used in each word form</li>
<li>Match up the word forms with either variant and see the total numbers of that one word form with both variants together</li>
</ol>

# Number of occurrences of each reading, grouped under its base reading.
df_agg = df_ortho_signs.groupby(['str_part','b'])['a'].count().reset_index()
df_agg.columns = ['str_part','b','count']

# base reading -> list of the readings that occur for it
ortho_list_tuples = {}
for str_part, reading in zip(df_agg['str_part'], df_agg['b']):
    ortho_list_tuples.setdefault(str_part, []).append(reading)

#clean up the u and ṭe directly. It is easier
# (every pairing of their variants is added by hand under its own key)
ortho_list_tuples.update({
    'u1': ['u','u₂'],
    'u2': ['u','u₃'],
    'u3': ['u₂','u₃'],
    'ṭe1': ['ṭe','ṭe₂'],
    'ṭe2': ['ṭe','ṭe₃'],
    'ṭe3': ['ṭe₂','ṭe₃'],
})
ortho_list_tuples

# Occurrences of every reading per word form, listed reading by reading with
# the most frequent word forms first.
df_form_counts = (
    df_ortho_signs.groupby(['str_part','b','form'])['a']
    .count()
    .reset_index()
    .sort_values(by=['b','a'],ascending=[True,False])
)
df_form_counts

# For every pair of variants of one syllable, find the word forms that are
# attested with both variants: two forms match when they are equal after the
# sign index of that syllable has been removed.
l_mixed = []
for k, variants in ortho_list_tuples.items():
    if len(variants) != 2:
        continue
    # The base syllable is taken from the first variant, not from the key: the
    # hand-added keys ('u1', 'ṭe1', ...) are labels, not syllables, so a pattern
    # built from them can never match a word form.
    base = re.sub(r'[₁₂₃₄₅₆₇₈₉₀]+', '', variants[0])
    # re.escape: readings may contain regex metacharacters such as parentheses.
    indexed_base = re.compile(re.escape(base) + '[₁₂₃₄₅₆₇₈₉₀]+')
    df_ortho1 = df_form_counts[df_form_counts['b'] == variants[0]]
    df_ortho2 = df_form_counts[df_form_counts['b'] == variants[1]]
    for i, row1 in df_ortho1.iterrows():
        form1 = indexed_base.sub(base,row1['form'])
        for j, row2 in df_ortho2.iterrows():
            form2 = indexed_base.sub(base,row2['form'])
            if form1 == form2:
                data = {
                    'str_part': k,
                    'form1': row1['form'],
                    'form1_c': row1['a'],
                    'form2': row2['form'],
                    'form2_c': row2['a'],
                    'form_base': form1,
                }
                l_mixed.append(data)

# Naming the columns keeps the frame usable even when no mixed form was found.
df_mixed = pd.DataFrame(l_mixed, columns=['str_part','form1','form1_c','form2','form2_c','form_base'])
# Product of the two counts for each matched pair of word forms.
df_mixed['total_mixed'] = df_mixed['form1_c'] * df_mixed['form2_c']
df_mixed

This is the chart to look at to see which orthographic variants are being employed in a meaningful mixed distribution.
<p>(A note here about perhaps having a cutoff point for total instances for one syllable)</p>
<p>(A note here about the important orthographic variation included in this chart and how to employ it in our text vector. Do we want to look at the variants only within the word forms with sufficient number of instances or across the whole text? I would say across the whole text. Do we want to restrict the dimensions of the text vectors to only orthographic variants that we select from the chart or employ a kind of weighting system which places more importance on the syllables that are important here? For now, I'm just going to limit it to particular syllables)</p>
<p>(ALSO, do we want to match up orthographic variants in the context of word forms OR in the context of 2-grams, per David's suggestion. I did entire word forms here, because I found the programming to be easier.)</p>

# Per text: how often each reading of the four syllables šu, ša, ia and li occurs.
is_one_of_four = df_ortho_signs['str_part'].isin(['šu','ša','ia','li'])
df_4_syls = (
    df_ortho_signs[is_one_of_four]
    .groupby(['file','str_part','b'])['a']
    .count()
    .reset_index()
)
df_4_syls

# Build one 8-dimensional vector per text: the share of each of the two
# readings of ia, li, ša and šu among both readings of that syllable.
# Counts are add-one smoothed, so an absent reading counts as 1.

# (file, reading) -> observed count. Looking counts up here replaces the
# int(Series) conversion that relied on a TypeError to detect "no occurrences".
observed_counts = {
    (text_file, reading): int(count)
    for text_file, reading, count in zip(df_4_syls['file'], df_4_syls['b'], df_4_syls['a'])
}

d_file_4_syls = {}
file_names = df_ortho_signs['file'].unique()
for f in file_names:
    d = {}
    for s in ['ia','ia₂','šu','šu₂','ša','ša₂','li','li₂']:
        d[s] = observed_counts.get((f, s), 0) + 1

    ia_tot = d['ia'] + d['ia₂']
    su_tot = d['šu'] + d['šu₂']
    sa_tot = d['ša'] + d['ša₂']
    li_tot = d['li'] + d['li₂']

    d_file_4_syls[f] = [d['ia'] / ia_tot,d['ia₂'] / ia_tot,d['li'] / li_tot,d['li₂'] / li_tot,d['ša'] / sa_tot, d['ša₂'] / sa_tot,d['šu'] / su_tot,d['šu₂'] / su_tot]
df_file_4_syls = pd.DataFrame(d_file_4_syls).transpose()
df_file_4_syls.columns = ['ia','ia₂','li','li₂','ša','ša₂','šu','šu₂']
df_file_4_syls

<h2>2. Clustering</h2>

<h3>2.1 Tf-Idf</h3>
<p>We want to gather all of the signs with their variant orthographies into each text file and generate a vector which will contain a '1' if the text contains an orthography or sign value and a '0' if it does not.</p>
<p>(Here's where I am unsure of how to form the Tf-Idf matrix. The code currently counts ALL usages of the signs per file. But do we want to...</p>
<ol><li>Only give each sign usage a 1 or 0 value?</li><li>Keep the totals and normalize the vectors?</li></ol>
<p>Another thing to consider is if we want to combine Tf-Idf matrices, i.e. the sign orthography variants and the sign syllable variants. Would this be useful? Would we need to distinguish these qualities in the vectors somehow? The same issue occurs if we tack on other variables to the text)</p>

<b>2.1.1. Modified Signs Tf-Idf</b>

In [None]:
# One 'SIGN:mods' token per sign occurrence ...
mod_tokens = df_modsigns['sign_form'] + ':' + df_modsigns['mods_str']
# ... joined, per text, into a single space-separated string.
df_file_modsigns = mod_tokens.groupby(df_modsigns['file']).agg(' '.join).reset_index()
df_file_modsigns.columns = ['file','mod_signs_all']
df_file_modsigns

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

trans = TfidfTransformer(smooth_idf=False)
# Tokens are split on spaces only and their case is kept.
vect = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
vect_fit = vect.fit_transform(df_file_modsigns['mod_signs_all'])
tfidf_fit = trans.fit_transform(vect_fit)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
tfidf1 = pd.DataFrame(tfidf_fit.toarray(),columns=feature_names,index=df_file_modsigns.file)
tfidf1

<b>2.1.2. TF-IDF for homophonous signs</b>

In [None]:
# Per text: all readings of homophonous signs joined into one space-separated string.
df_file_orthosigns = df_ortho_signs['b'].groupby(df_ortho_signs['file']).agg(' '.join).reset_index()
df_file_orthosigns.columns = ['file','homo_signs_all']
df_file_orthosigns

In [None]:
# Tokens are split on spaces only and their case is kept.
vect2 = CountVectorizer(token_pattern='[^ ]+',lowercase=False)
vect_fit2 = vect2.fit_transform(df_file_orthosigns['homo_signs_all'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
if hasattr(vect2, 'get_feature_names_out'):
    feature_names2 = vect2.get_feature_names_out()
else:
    feature_names2 = vect2.get_feature_names()
# Note: this matrix holds the raw counts; no TF-IDF weighting is applied here.
tfidf2 = pd.DataFrame(vect_fit2.toarray(),columns=feature_names2,index=df_file_orthosigns.file)
tfidf2

<h3>2.2 K-Means Clustering</h3>
<p>Now we use K-means to cluster the texts. (K-means requires a determination of how many clusters to use. What is this number? How should we determine it?)</p>

<b>2.2.1. Clustering for Modified Signs</b>

In [None]:
from sklearn.cluster import KMeans

# random_state makes the cluster assignment reproducible from run to run;
# n_init is spelled out because its default differs between scikit-learn releases.
km1 = KMeans(n_clusters=7, max_iter=1000, n_init=10, random_state=0).fit(tfidf1)

# cluster label -> list of the text files in that cluster; file names come
# from the index of the matrix that was clustered.
labels_mods = {}
for text_file, cluster in zip(tfidf1.index, km1.labels_):
    labels_mods.setdefault(cluster, []).append(text_file)
labels_mods

Same for homophone signs

<b>2.2.2. Clustering for Homophonous Signs</b>

In [None]:
# random_state makes the cluster assignment reproducible from run to run;
# n_init is spelled out because its default differs between scikit-learn releases.
km2 = KMeans(n_clusters=7, max_iter=1000, n_init=10, random_state=0).fit(tfidf2)

# cluster label -> list of the text files in that cluster. (The membership
# test used to read the undefined name `labels_homos`, which raised NameError.)
labels_orthos = {}
for text_file, cluster in zip(tfidf2.index, km2.labels_):
    labels_orthos.setdefault(cluster, []).append(text_file)
labels_orthos

<h3>2. Visualization</h3>

<b>2.1. Visualize from Modified Sign Clusters</b>

In [None]:
from sklearn.manifold import MDS

texts_2d_map = {}
# Row labels of the matrix; used to annotate the points of the plot.
texts = tfidf1.index

# Number of features. tfidf1 has one column per vectorizer feature, so its
# width is used instead of get_feature_names(), which scikit-learn 1.2 removed.
dim_num = tfidf1.shape[1]
# MDS starts from a random configuration, so random_state is fixed to get the
# same layout on every run.
mds1 = MDS(n_components = 2, random_state=0)
texts_2d = mds1.fit_transform(tfidf1)

Set up colors for each cluster

In [None]:
# Palette indexed by cluster label.
color_list = ['white','yellow','green','red','blue','brown','black']
# Colour of every text, in the order of km1.labels_.
colors_all = [color_list[label] for label in km1.labels_]
colors_all

In [None]:
import matplotlib.pyplot as plt
plt.figure(num=None, figsize=(16, 16), dpi=80, facecolor='w', edgecolor='k')

# Split the 2-D coordinates into one list of x values and one of y values.
x_values = []
y_values = []
for point in texts_2d:
    x_values.append(point[0])
    y_values.append(point[1])

# One marker per text, coloured by cluster and labelled with the text's name.
plt.scatter(x_values,y_values,c=colors_all)
for position, (x, y) in enumerate(zip(x_values, y_values)):
    plt.annotate(texts[position],(x, y))
plt.show()

<b>2.2 Same for homophone signs</b>

In [None]:
# Number of features. tfidf2 has one column per vectorizer feature, so its
# width is used instead of get_feature_names(), which scikit-learn 1.2 removed.
dim_num = tfidf2.shape[1]
# MDS starts from a random configuration, so random_state is fixed to get the
# same layout on every run.
mds2 = MDS(n_components = 2, random_state=0)
texts_2d = mds2.fit_transform(tfidf2)

# Colours and labels must describe the matrix that is plotted here: they are
# taken from the homophone clustering (km2) and from tfidf2's index, not from
# the modified-sign clustering (colors_all / texts) of the previous section.
colors_homophones = [color_list[label] for label in km2.labels_]
texts_homophones = tfidf2.index

x_values = [xy[0] for xy in texts_2d]
y_values = [xy[1] for xy in texts_2d]
plt.scatter(x_values,y_values,c=colors_homophones)
for i in range(len(texts_2d)):
    plt.annotate(texts_homophones[i],(x_values[i],y_values[i]))
plt.show()