# Z01.2: Token Cohorts

This notebook sets up the various Token Cohorts for the first Scribal Intent Study (Z01).

## Setup and Utility Functions

In [1]:
# Imports and setup
import collections
import random

import numpy as np
import pandas as pd

from qlynx.file_utils import store_pkl, load_pkl
from voynichlib.Corpus import Corpus
from voynichlib.ProbMassFunction import ProbMassFunction

%reload_ext autoreload
%autoreload 2


## Load Study Corpus for Analysis of Positional Tokens

In [2]:
# Location of the pickled study corpus used throughout this notebook.
study_corpus_file = 'voynich_data/outputs/Study_Corpus_for_Positional_Tokens_Analysis.pkl'
# Load the study corpus; every cohort below is derived from this object.
corpus = Corpus.from_file(study_corpus_file)
# NOTE(review): `transliteration` is not referenced again in the visible cells — confirm it is still needed.
transliteration = corpus.transliteration

--------------------------------------------------------------------------------
Corpus: Study Corpus for Positional Tokens Analysis
	From Transliteration file 'voynich_data/standard_ivtff/ZL_N_ext_Eva_3a'
		Alphabet:'Eva-' (ZL),    Alphabet Version:2.0
		Transliteration Version: 2.0
Selection Criteria: {
    "fagin_davis_scribes": [
        "1"
    ],
    "illustrations": "H",
    "locus_generic_types": "P",
    "paragraph_end_token": false,
    "unambiguous_token": true
}
	Num folios : 		        95
	Num lines  : 		     1,223
	Num tokens : 		     7,660
	Num glyphs : 		    36,177
	Num Unique tokens: 	     2,355
	Num Unique glyphs: 	        67
--------------------------------------------------------------------------------


## Setting Up Cohorts

### Set up dictionaries

In [3]:
# Cohort selection criteria, keyed by cohort name.
# Insertion order of the OrderedDict is the order in which cohorts are built and reported.
#
# Every positional cohort shares the same five exclusion flags; each entry below
# spells out only what makes that cohort distinct (an extra leading key and/or
# one flag flipped to True).
_exclusion_flags = {
    'last_token': False,
    'pre_drawing_token': False,
    'post_drawing_token': False,
    'paragraph_end_token': False,
    'paragraph_start_line': False,
}

criteria_by_c = collections.OrderedDict([
    # No criteria at all: the whole study corpus.
    ('ALL',    {}),
    ('MIDDLE', {'token_positioning': 'middle', **_exclusion_flags}),
    # Same as MIDDLE, but restricted to paragraph-start lines.
    ('TOP',    {'token_positioning': 'middle', **_exclusion_flags, 'paragraph_start_line': True}),
    ('FIRST',  {'token_pos': 1, **_exclusion_flags}),
    ('LAST',   {**_exclusion_flags, 'last_token': True}),
    ('BEFORE', {**_exclusion_flags, 'pre_drawing_token': True}),
    ('AFTER',  {**_exclusion_flags, 'post_drawing_token': True}),
    ('SECOND', {'token_pos': 2, **_exclusion_flags}),
    ('FOURTH', {'token_pos': 4, **_exclusion_flags}),
])


In [4]:
# Compile dictionaries
# `all_cohorts` lists every cohort name; `cohorts` is the same list without 'ALL'.
all_cohorts = list(criteria_by_c)
cohorts = [name for name in all_cohorts if name != 'ALL']

# Per-cohort containers. Suffix `_c` = keyed by cohort; `_cw` = keyed by cohort,
# then by token width (token_length_min).
corpus_by_c = {}
pmfs_by_c = {}
pmfs_by_cw = {}
tokens_by_c = {}
tokens_by_cw = {}
token_ws_by_c = {}

glyph_pmfs_by_c = {}
glyphs_by_c = {}

for cohort, criteria in criteria_by_c.items():
    label = f"'{cohort}'"

    # Sub-corpus of the study corpus selected by this cohort's criteria.
    corpus_by_c[cohort] = Corpus.from_corpus(f'Scribe 1 - {cohort}', corpus, criteria=criteria, suppress_summary=True )

    # Token table is fetched once and reused for the PMF, the lengths and the width bins.
    df = corpus_by_c[cohort].tokens_df()
    pmfs_by_c[cohort] = ProbMassFunction(list(df['token']))
    tokens_by_c[cohort] = pmfs_by_c[cohort].values

    # One-line length summary per cohort: mean +/- std, [min, max], observation count.
    token_lengths = df['token_length_min']
    print(f"Scribe 1, {label:20}: {np.mean(token_lengths):.2f} +/- {np.std(token_lengths):.2f}  [{np.min(token_lengths)}, {np.max(token_lengths)}]\t\t{len(token_lengths): >8,} obs")
    token_ws_by_c[cohort] = token_lengths

    # Glyph-level PMF for the same cohort.
    glyph_df = corpus_by_c[cohort].glyphs_df()
    glyph_pmfs_by_c[cohort] = ProbMassFunction(list(glyph_df['glyph']))
    glyphs_by_c[cohort] = glyph_pmfs_by_c[cohort].values

    # Tokens and PMFs binned by width. Only widths 1..10 are binned; the summary
    # printed above reports longer tokens in some cohorts, and those fall outside the bins.
    pmfs_by_cw[cohort] = {}
    tokens_by_cw[cohort] = {}
    for w in range(1, 11):
        tokens_for_pn = list(df[df['token_length_min'] == w]['token'])
        tokens_by_cw[cohort][w] = tokens_for_pn
        pmfs_by_cw[cohort][w] = ProbMassFunction(tokens_for_pn)


Scribe 1, 'ALL'               : 4.72 +/- 1.79  [1, 13]		   7,660 obs
Scribe 1, 'MIDDLE'            : 4.58 +/- 1.69  [1, 12]		   3,807 obs
Scribe 1, 'TOP'               : 5.04 +/- 1.81  [1, 11]		     847 obs
Scribe 1, 'FIRST'             : 5.13 +/- 1.75  [1, 11]		     998 obs
Scribe 1, 'LAST'              : 4.47 +/- 1.91  [1, 13]		     777 obs
Scribe 1, 'BEFORE'            : 4.24 +/- 1.96  [1, 9]		     349 obs
Scribe 1, 'AFTER'             : 4.66 +/- 1.63  [1, 9]		     278 obs
Scribe 1, 'SECOND'            : 4.66 +/- 1.65  [1, 10]		     970 obs
Scribe 1, 'FOURTH'            : 4.68 +/- 1.72  [1, 12]		     691 obs


### Perform Some Checks

In [5]:
# Spot-check the token PMFs: total count, raw count and probability of one
# probe token under each smoothing option, for the MIDDLE and FIRST cohorts.
probe = 'daiin'
for i, cohort in enumerate(('MIDDLE', 'FIRST')):
    if i > 0:
        print()  # blank line between the two cohort reports
    pmf = pmfs_by_c[cohort]
    print(f"Total tokens count in {cohort} cohort = {pmf.total_count:,}")
    print(f"Number of occurrences of '{probe}' in {cohort} cohort = {pmf.count(probe):,}")
    print(f"Probability of '{probe}' in {cohort} cohort (no smoothing)= {pmf.prob(probe):.2%}")
    print(f"Probability of '{probe}' in {cohort} cohort (laplace smoothing)= {pmf.prob(probe, smooth='laplace'):.2%}")
    print(f"Probability of '{probe}' in {cohort} cohort (minimal smoothing)= {pmf.prob(probe, smooth='minimal'):.2%}")





Total tokens count in MIDDLE cohort = 3,807
Number of occurrences of 'daiin' in MIDDLE cohort = 178
Probability of 'daiin' in MIDDLE cohort (no smoothing)= 4.68%
Probability of 'daiin' in MIDDLE cohort (laplace smoothing)= 2.38%
Probability of 'daiin' in MIDDLE cohort (minimal smoothing)= 4.72%

Total tokens count in FIRST cohort = 998
Number of occurrences of 'daiin' in FIRST cohort = 42
Probability of 'daiin' in FIRST cohort (no smoothing)= 4.21%
Probability of 'daiin' in FIRST cohort (laplace smoothing)= 2.20%
Probability of 'daiin' in FIRST cohort (minimal smoothing)= 4.29%


## Set Up Random Cohorts Based on Middle Position Tokens

### Set Random Seed
Fixing the seed makes the random cohorts drawn below repeatable from run to run (for a given size of the MIDDLE cohort).

In [6]:
# Seed Python's `random` module before the `random.sample` draws that build the RAND cohorts.
random.seed(20240115)

In [7]:
# Six random comparison cohorts, each sampled without replacement from the MIDDLE
# cohort's tokens: the first three are sized like SECOND, the last three like BEFORE.
random_cohorts = ['RAND 1', 'RAND 2', 'RAND 3', 'RAND 4', 'RAND 5', 'RAND 6']
num_in_second = len(corpus_by_c['SECOND'].tokens_df())
num_in_pre = len(corpus_by_c['BEFORE'].tokens_df())
tokens_from_mid = corpus_by_c['MIDDLE'].tokens()
tokens_rand = {}
df = corpus_by_c['MIDDLE'].tokens_df()

# token -> token_length_min, taken from the first MIDDLE row carrying that token.
# Built once, instead of filtering the whole frame for every sampled token.
length_by_token = (
    df.drop_duplicates(subset='token')
      .set_index('token')['token_length_min']
      .to_dict()
)

for i, cohort in enumerate(random_cohorts):
    num_to_sample = num_in_second if i < 3 else num_in_pre
    tokens_rand[cohort] = random.sample(tokens_from_mid, num_to_sample)
    pmfs_by_c[cohort] = ProbMassFunction(tokens_rand[cohort])

    # Pair each sampled token with its token_length_min. A token missing from the
    # MIDDLE token table is reported and left out, rather than silently reusing
    # the previous token's length.
    sampled_with_length = []
    for token in tokens_rand[cohort]:
        if token in length_by_token:
            sampled_with_length.append((token, length_by_token[token]))
        else:
            print(f"{cohort}: token {token!r} not found in MIDDLE tokens_df; skipped")

    # Width bins use token_length_min (as for the non-random cohorts), not the
    # length of the transliteration string, which differs for tokens such as 'so{cthh}'.
    pmfs_by_cw[cohort] = {}
    tokens_by_cw[cohort] = {}
    for w in range(1, 11):
        tokens_by_cw[cohort][w] = [token for token, length in sampled_with_length if length == w]
        if not tokens_by_cw[cohort][w]:
            print(f"tokens_by_cw[{cohort}][{w}] is NONE")
        pmfs_by_cw[cohort][w] = ProbMassFunction(tokens_by_cw[cohort][w])

    token_ws_by_c[cohort] = [length for _, length in sampled_with_length]

cohorts_with_randoms = cohorts + random_cohorts



## Summary of Token Cohorts

In [8]:
def make_cohort_summary_table(cohorts, csv_path='voynich_data/outputs/cohort_summary_data.csv'):
    """Build a one-row-per-cohort summary table and optionally write it to CSV.

    Reads the notebook-level dictionaries `corpus_by_c` (for regular cohorts)
    and `tokens_rand` (for cohorts whose name starts with 'RAND').

    Parameters
    ----------
    cohorts : iterable of str
        Cohort names, in the order the rows should appear.
    csv_path : str or None, optional
        Where to write the table as CSV. Pass None to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns: Cohort, Folios, Lines, Tokens, Unique Tokens, Glyphs,
        Unique Glyphs. For RAND cohorts only the token counts are filled in;
        the other cells hold the placeholder '~'.
    """
    columns = ['Cohort', 'Folios', 'Lines', 'Tokens', 'Unique Tokens', 'Glyphs', 'Unique Glyphs']

    # Collect plain rows first and build the frame once at the end,
    # rather than growing a DataFrame one row at a time.
    rows = []
    for cohort in cohorts:
        if cohort.startswith('RAND'):
            # Random cohorts are bare token samples: no folio/line/glyph information.
            tokens = tokens_rand[cohort]
            rows.append([cohort, '~', '~', len(tokens), len(set(tokens)), '~', '~'])
        else:
            cohort_corpus = corpus_by_c[cohort]
            tokens_df = cohort_corpus.tokens_df()
            glyphs_df = cohort_corpus.glyphs_df()
            rows.append([cohort,
                         len(cohort_corpus.folios_df()),
                         len(cohort_corpus.lines_df()),
                         len(tokens_df),
                         len(set(tokens_df['token'])),
                         len(glyphs_df),
                         len(set(glyphs_df['glyph']))])

    df = pd.DataFrame(rows, columns=columns)
    if csv_path is not None:
        df.to_csv(csv_path)
    return df

# Summarise 'ALL' followed by every positional and random cohort; the table is displayed as the cell output.
make_cohort_summary_table(['ALL'] + cohorts_with_randoms)


Unnamed: 0,Cohort,Folios,Lines,Tokens,Unique Tokens,Glyphs,Unique Glyphs
0,ALL,95,1223,7660,2355,36177,67
1,MIDDLE,95,1002,3807,1115,17436,39
2,TOP,95,178,847,506,4267,29
3,FIRST,95,998,998,532,5118,25
4,LAST,95,777,777,427,3470,26
5,BEFORE,71,330,349,215,1481,20
6,AFTER,63,264,278,160,1295,19
7,SECOND,95,970,970,433,4517,22
8,FOURTH,95,691,691,339,3236,23
9,RAND 1,~,~,970,444,~,~


## Some Data Inspection

In [9]:
# Token table for the 'ALL' cohort (no additional criteria).
# Called directly on the corpus so that `df` is not rebound to a non-DataFrame object.
corpus_by_c['ALL'].tokens_df()

Unnamed: 0,folio,line_num,token_pos,token,unambiguous,token_pos_uncertainty,token_pos_from_end,last_token,token_length_min,token_length_max,fagin_davis_scribe,currier_hand,unidentified_glyph_count,uncertain_glyph_count,ligature_count,pre_drawing,post_drawing,paragraph_start,paragraph_end
0,f1v,1,1,kchsy,True,0,-9,False,5,5,1,1,0,0,0,False,False,True,False
1,f1v,1,2,chydaiin,True,0,-8,False,8,8,1,1,0,0,0,False,False,False,False
2,f1v,1,3,ol,True,0,-7,False,2,2,1,1,0,0,0,True,False,False,False
3,f1v,1,4,o,True,0,-6,False,1,1,1,1,0,0,0,False,True,False,False
4,f1v,1,5,l,True,1,-5,False,1,1,1,1,0,0,0,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7655,f96v,11,4,so{cthh},True,0,-1,True,6,6,1,4,0,0,1,False,False,False,False
7656,f96v,12,1,sosar,True,0,-3,False,5,5,1,4,0,0,0,False,False,False,False
7657,f96v,12,2,cheekeo,True,0,-2,False,7,7,1,4,0,0,0,False,False,False,False
7658,f96v,12,3,dain,True,0,-1,True,4,4,1,4,0,0,0,False,False,False,False


In [10]:
# Token table for the 'FIRST' cohort (selected with token_pos == 1).
# Called directly on the corpus so that `df` is not rebound to a non-DataFrame object.
corpus_by_c['FIRST'].tokens_df()

Unnamed: 0,folio,line_num,token_pos,token,unambiguous,token_pos_uncertainty,token_pos_from_end,last_token,token_length_min,token_length_max,fagin_davis_scribe,currier_hand,unidentified_glyph_count,uncertain_glyph_count,ligature_count,pre_drawing,post_drawing,paragraph_start,paragraph_end
0,f1v,2,1,yteey,True,0,-8,False,5,5,1,1,0,0,0,False,False,False,False
1,f1v,3,1,da,True,0,-8,False,2,2,1,1,0,0,0,False,False,False,False
2,f1v,4,1,dol,True,0,-6,False,3,3,1,1,0,0,0,False,False,False,False
3,f1v,6,1,choky,True,0,-11,False,5,5,1,1,0,0,0,False,False,False,False
4,f1v,7,1,qo,True,0,-12,False,2,2,1,1,0,0,0,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,f96v,9,1,oeol,True,0,-3,False,4,4,1,4,0,0,0,False,False,False,False
994,f96v,10,1,ctheor,True,0,-4,False,6,6,1,4,0,0,0,False,False,False,False
995,f96v,11,1,sar,True,0,-4,False,3,3,1,4,0,0,0,False,False,False,False
996,f96v,12,1,sosar,True,0,-3,False,5,5,1,4,0,0,0,False,False,False,False


In [11]:
# Token table for the 'BEFORE' cohort (selected with pre_drawing_token=True).
corpus_by_c['BEFORE'].tokens_df()

Unnamed: 0,folio,line_num,token_pos,token,unambiguous,token_pos_uncertainty,token_pos_from_end,last_token,token_length_min,token_length_max,fagin_davis_scribe,currier_hand,unidentified_glyph_count,uncertain_glyph_count,ligature_count,pre_drawing,post_drawing,paragraph_start,paragraph_end
0,f1v,2,4,ochy,True,0,-5,False,4,4,1,1,0,0,0,True,False,False,False
1,f1v,3,4,shy,True,1,-5,False,3,3,1,1,0,0,0,True,False,False,False
2,f1v,4,4,dam,True,0,-3,False,3,3,1,1,0,0,0,True,False,False,False
3,f1v,6,6,okal,True,2,-6,False,4,4,1,1,0,0,0,True,False,False,False
4,f1v,7,6,cthey,True,1,-7,False,5,5,1,1,0,0,0,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,f93v,9,7,oly,True,0,-4,False,3,3,1,4,0,0,0,True,False,False,False
345,f96r,11,4,qokchod,True,0,-4,False,7,7,1,4,0,0,0,True,False,False,False
346,f96r,12,4,ol,True,0,-5,False,2,2,1,4,0,0,0,True,False,False,False
347,f96v,2,4,dteodoiin,True,0,-5,False,9,9,1,4,0,0,0,True,False,False,False


In [12]:
# Token table for the 'AFTER' cohort (selected with post_drawing_token=True).
corpus_by_c['AFTER'].tokens_df()

Unnamed: 0,folio,line_num,token_pos,token,unambiguous,token_pos_uncertainty,token_pos_from_end,last_token,token_length_min,token_length_max,fagin_davis_scribe,currier_hand,unidentified_glyph_count,uncertain_glyph_count,ligature_count,pre_drawing,post_drawing,paragraph_start,paragraph_end
0,f1v,2,5,dcho,True,0,-4,False,4,4,1,1,0,0,0,False,True,False,False
1,f1v,3,5,dksheey,True,1,-4,False,7,7,1,1,0,0,0,False,True,False,False
2,f1v,4,5,sochey,True,0,-2,False,6,6,1,1,0,0,0,False,True,False,False
3,f1v,6,7,dolchey,True,2,-5,False,7,7,1,1,0,0,0,False,True,False,False
4,f1v,7,7,ykol,True,1,-6,False,4,4,1,1,0,0,0,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,f93v,7,7,shodain,True,0,-2,False,7,7,1,4,0,0,0,False,True,False,False
274,f93v,8,8,dsheog,True,0,-3,False,6,6,1,4,0,0,0,False,True,False,False
275,f96r,11,6,aiin,True,0,-2,False,4,4,1,4,0,0,0,False,True,False,False
276,f96v,2,6,qoches,True,0,-3,False,6,6,1,4,0,0,0,False,True,False,False


## Save All 

In [13]:
# Bundle the cohort artefacts into a single dict and pickle it for later notebooks.
# NOTE(review): `tokens_by_c` and `tokens_rand` are not part of the bundle — confirm that is intentional.
file_path = 'voynich_data/outputs/token_cohort_data.pkl'

token_cohort_data = {
    # Cohort name lists
    'all_cohorts': all_cohorts,
    'cohorts': cohorts,
    'cohorts_with_randoms': cohorts_with_randoms,

    # Token-level data, keyed by cohort (and by width for the *_cw entries)
    'corpus_by_c': corpus_by_c,
    'pmfs_by_c': pmfs_by_c,
    'pmfs_by_cw': pmfs_by_cw,
    'tokens_by_cw': tokens_by_cw,
    'token_ws_by_c': token_ws_by_c,

    # Glyph-level data, keyed by cohort
    'glyph_pmfs_by_c': glyph_pmfs_by_c,
    'glyphs_by_c': glyphs_by_c,
}

store_pkl(token_cohort_data, file_path, ensure_dir=True)


## Reload Check

In [14]:
# Clear the in-memory bundle before reading the pickle back.
token_cohort_data = None
# Reload under a different name and list the top-level keys as a sanity check.
x_token_cohort_data = load_pkl(file_path)
x_token_cohort_data.keys()

dict_keys(['all_cohorts', 'cohorts', 'cohorts_with_randoms', 'corpus_by_c', 'pmfs_by_c', 'pmfs_by_cw', 'tokens_by_cw', 'token_ws_by_c', 'glyph_pmfs_by_c', 'glyphs_by_c'])