# Z01.6: Extra Analysis -- Split Word Explanation


This notebook contains the analysis of Token Propensities. This is the second analysis of first Scribal Intent Study (Z01).

## Setup and Utility Functions

In [25]:
# Imports and setup
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML
import os

from qlynx.file_utils import load_pkl, store_pkl
from qlynx.stats_utils import *
from qlynx.display_utils import render_html_to_image
from voynichlib.utils import display_voynichese

%reload_ext autoreload
%autoreload 2
 

# Set Global Parameters

In [26]:
do_parametric_studies = True
MAX_BAYES = np.exp(10)
MAX_PROPENSITY = 999
# THRESHOLDS = {
#     'p_value': 1,
#     'ln_bayes_factor': .0001
# }
THRESHOLDS = {
    'p_value': 0.01,
    'ln_bayes_factor': 5
}
THRESHOLDS['bayes_factor'] = np.exp(THRESHOLDS['ln_bayes_factor'])
reference_cohort = 'MIDDLE'
# smooth = 'laplace'
# smooth = 'laplace'
smooth = None



## Load the Token Cohort Data

In [27]:
file_path = 'voynich_data/outputs/token_cohort_data.pkl'
token_cohort_data = load_pkl(file_path)

cohorts = token_cohort_data['cohorts']
cohorts_with_randoms = token_cohort_data['cohorts_with_randoms']

corpus_by_c = token_cohort_data['corpus_by_c']
pmfs_by_c = token_cohort_data['pmfs_by_c']
token_ws_by_c = token_cohort_data['token_ws_by_c']

glyph_pmfs_by_c = token_cohort_data['glyph_pmfs_by_c']
glyphs_by_c = token_cohort_data['glyphs_by_c']

## Get Token Lengths for the Most Frequent Tokens

In [28]:
def get_top_vocabulary_tokens_lengths_dict(cohort, N_v:int=None):
    tokens = pmfs_by_c[cohort].values
    if not N_v:
        tokens = tokens[:N_v]
    token_lengths_dict = {}
    df = corpus_by_c[cohort].tokens_df()
    for token in tokens:
        df_token = df[df['token'] == token]
        token_length = df_token['token_length_min'].iloc[0]
        token_lengths_dict[token] = token_length
        pass
    pass
    return token_lengths_dict

## Functions to Get Probabilities of Starting or Ending with Given Token

In [29]:
def prob_starts_with(pmf, prefix_token, smooth:str = None):
    prob = 0.
    count = 0
    for token in pmf.values:
        if token.startswith(prefix_token):
            prob += pmf.prob(token, smooth=smooth)
            count += pmf.count(token)
            pass
        pass
    pass
    return prob, count   

def prob_ends_with(pmf, prefix_token, smooth:str = None):
    prob = 0.
    count = 0
    for token in pmf.values:
        if token.endswith(prefix_token):
            prob += pmf.prob(token, smooth=smooth)
            count += pmf.count(token)
            pass
        pass
    pass
    return prob, count 
        

In [30]:
prob_starts_with(pmfs_by_c['MIDDLE'], 'dy')

(0.010769634883110057, 41)

## Function to Compile DataFrame for a Cohort

In [31]:
def compile_token_propensity_df(target_cohort, reference_cohort, p_value_threshold, bayes_threshold, consider_split_words:bool = False):
    top_token_length_dict = get_top_vocabulary_tokens_lengths_dict(reference_cohort)
    df = pd.DataFrame(columns = ['token', 'glyph_count', 'N_ref', 'n_ref', 'N_x', 'n_x', 'p_ref', 'p_x', 'p_value', 'sig_p_value', 'sig_BF', 'propensity', 'bayes', 'binom_stat_le', 'binom_stat_gt'])
    for token, w in top_token_length_dict.items():
        pmf_ref = pmfs_by_c[reference_cohort]
        N_ref = pmf_ref.total_count
        n_ref = pmf_ref.count(token) if N_ref > 0 else 0
        p_ref = pmf_ref.prob(token, smooth=smooth)
        # The following is special cases for considering potential splitting of words into tokens around drawings, 
        if consider_split_words:
            if target_cohort == 'BEFORE':
                p_ref_, _ = prob_starts_with(pmf_ref, token, smooth=smooth)
                p_ref = p_ref_
            if target_cohort == 'AFTER':
                p_ref_, _ = prob_ends_with(pmf_ref, token, smooth=smooth)
                p_ref = p_ref_
            

        pmf_x = pmfs_by_c[target_cohort]                        
        N_x = pmf_x.total_count
        n_x = pmf_x.count(token) if N_x > 0 else 0
        if consider_split_words:
            if target_cohort == 'BEFORE':
                _, n_x = prob_starts_with(pmf_x, token)
            if target_cohort == 'AFTER':
                _, n_x = prob_ends_with(pmf_x, token)
            
        p_x = pmf_x.prob(token, smooth=smooth)
            

        p_value = calculate_binomial_probability(n_x, N_x, p_ref)

        bayes_factor = bayes_factor_binomial(n_x, N_x, p_x, p_ref)
        bayes_factor = min(MAX_BAYES, bayes_factor)

        binom_stat_le =  binom.cdf(n_x, N_x, p_ref)        
        binom_stat_gt =  binom.cdf(n_x, N_x, 1. -p_ref)        

        if target_cohort.startswith('Rand'):
            propensity = 1.
        else: 
            propensity = p_x/p_ref  if p_ref > 0 else MAX_PROPENSITY
            pass
        verdict_p_value = p_value < p_value_threshold
        verdict_bayes_factor = bayes_factor > bayes_threshold
        df.loc[len(df)] = [token,
                           top_token_length_dict[token],
                           N_ref,
                           n_ref,
                           N_x,
                           n_x,
                           p_ref,
                           p_x,
                           p_value,
                           verdict_p_value,
                           verdict_bayes_factor,
                           np.round(propensity,1),
                           bayes_factor,
                          binom_stat_le,
                          binom_stat_gt]
        pass
    df.set_index('token', inplace=True)
    pass
    return df



## Function to Extract Significant Propensity DF Rows

In [32]:
def extract_df(cohort, stat_type, component_type, df_by_cohort_dict):
    # print(f"'{cohort}' Compared to 'MIDDLE'")
    if component_type == 'tokens':
        df = df_by_cohort_dict[cohort]

    if stat_type == 'p_value':
        df = df[df['sig_p_value']]
        df.sort_values(by='p_value', ascending=False)
    elif stat_type == 'bayes':
        df = df[df['sig_BF']]        
        df.sort_values(by='bayes', ascending=False)
    elif stat_type == 'both':
        df = df[(df['sig_p_value']) & (df['sig_BF'])]
        
    # print(len(df))
    return df

## Function to create HTML and png Tables

In [33]:
cohort_title_dict = {
'ALL':'All in Corpus',
'MIDDLE':'Middle Positions',
'TOP':'Top Lines of Paragraphs',
'FIRST': 'First Position on a Line',
'SECOND': 'Second Position on a Line',
'THIRD': 'Third Position on a Line',
'FOURTH': 'Fourth Position on a Line',
'BEFORE': 'Immediately Before a Drawing',
'AFTER':'Immediately After a Drawing',
'LAST': 'Last Position on a Line',
'RAND 1':'Random Tokens Cohort',
'RAND 2':'Random Tokens Cohort', 
'RAND 3':'Random Tokens Cohort',
'RAND 4':'Random Tokens Cohort',
'RAND 5':'Random Tokens Cohort',
'RAND 6':'Random Tokens Cohort',
}    

def filter_and_sort_dataframe(df, propensity_col, sig_p_value_col, sig_BF_col):
    # Filter rows where at least one of sig_p_value or sig_BF is True
    filtered_df = df[(df[sig_p_value_col]) & (df[sig_BF_col])]
    # filtered_df = df[(df[sig_p_value_col])]

    # Splitting the DataFrame based on propensity values
    df_greater_than_zero = filtered_df[filtered_df[propensity_col] > 1].sort_values(by=propensity_col, ascending=False)
    df_less_than_zero = filtered_df[filtered_df[propensity_col] < 1].sort_values(by=propensity_col, ascending=True)

    # Concatenating the two DataFrames
    result_df = pd.concat([df_greater_than_zero, df_less_than_zero])

    return result_df

def display_cohort_tendency_summary(cohort: str, 
                                    component:str, 
                                    file_name: str = None, 
                                    width:int=None, 
                                    height:int=None,
                                   single_token:str = None):
    if component == 'tokens':
        df = token_propensity_dfs[cohort].sort_values(by='propensity', ascending=False)
        table_title = f"Positional Tendency Tokens<br>{cohort_title_dict[cohort]}"
        num_tokens_in_target = pmfs_by_c[cohort].total_count
        num_tokens_in_ref = pmfs_by_c[reference_cohort].total_count
        component_text = 'Tokens'
        
    elif component == 'split_words':
        df = split_word_token_propensity_dfs[cohort].sort_values(by='propensity', ascending=False)
        table_title = f"Positional Tendency Tokens<br>{cohort_title_dict[cohort]}"
        num_tokens_in_target = pmfs_by_c[cohort].total_count
        num_tokens_in_ref = pmfs_by_c[reference_cohort].total_count
        component_text = 'Tokens'
    
    elif component == 'glyphs':
        df = glyph_propensity_dfs[cohort].sort_values(by='propensity', ascending=False)
        table_title = f"Positional Tendency Glyphs<br>{cohort_title_dict[cohort]}"
        num_tokens_in_target = len(corpus_by_c[cohort].glyphs_df())
        num_tokens_in_ref = len(corpus_by_c[reference_cohort].glyphs_df())
        component_text = 'Glyphs'
        pass
    pass

    df = filter_and_sort_dataframe(df,'propensity', 'sig_p_value', 'sig_BF') 
    if single_token is not None:
        df = df.loc[[single_token]]
    
    html_top = """
<html>
<head>
    <style>
        h3 {
            margin-left: auto;
            margin-right: auto;
        }
        table {
            border: 3px solid black;
            border-collapse: collapse;
            margin-left: auto;
            margin-right: auto;
        }

        th, td {
            border: 1px solid black;
            text-align: center;
        }

       .header-row {
            background-color: #7AA4F8;
        }     
        table td, table th {
            padding-left: 5px;
            padding-right: 5px;
        }
        
        tbody tr:nth-child(even) {
            background-color: #FEEFC2; /*#FFFFD9; light beige for odd rows */
        }

        tbody tr:nth-child(odd) {
            background-color: white; /* white for even rows */
        }
    </style>
</head>
<body>"""
    html_bottom = """
</body>
</html>"""
    html = ''
    # # Start the HTML table
    # html += f"<h2>{table_title}</h2>\n"
    # html += f"Reference Cohort: {reference_cohort}<br>\n"
    # html += f"Total Count in Reference Cohort: {num_tokens_in_ref}<br>\n"
    # html += f"Total Count in Target Cohort: {num_tokens_in_target}<br>\n"
    # html += f"Total Count Selected: {len(df)}\n"
    html_table_top = """
<table style='width:600px'>
    <tr>
        <th class='header-row' colspan=1 rowspan=2 style='text-align: center;'>Tilt</th>
        <th class='header-row' colspan=2 style='text-align: center;'>Token</th>
        <th class='header-row' colspan=2 style='text-align: center;'>Counts</th>
        <th class='header-row' colspan=3 style='text-align: center;'>Stats</th>
    </tr>
    <tr>
        <th class='header-row' >Voynichese</th>
        <th class='header-row' >Eva-</th>
        <th class='header-row' >expected</th>
        <th class='header-row' >observed</th>
        <th class='header-row' >Propensity</th>
        <th class='header-row' ><i>p</i>-value</th>
        <th class='header-row' ><i>log(B)</i></th>
    </tr>"""    
    html_1 = html_table_top
    # html_2 = html_table_top
    num_affinitive = len(df[df['propensity'] >= 1])
    num_aversive = len(df[df['propensity'] < 1])
    
    color = 'black' 
    # Fill the table rows
    i=-1
    for index, row in df.iterrows(): 
        i += 1
        propensity = row['propensity']
        # ln_propensity = np.log(propensity)
        # if propensity < 1:
        #     continue
        # voynichese_value = display_voynichese(text=index, render=False)

        prob_ref= row['p_ref']
        p_value = row['p_value']
        prob_x = row['p_x']
        N_x = row['N_x']
        
        observed_count = int(row['n_x'])
        expected_count =  int(np.round(prob_ref * num_tokens_in_target))
        # expected_count =  prob_ref * num_tokens_in_target
        # propensity = f"{propensity:.1f}" if observed_count > 0 else '&infin;'
        bayes = row['bayes']
        bayes = f"{np.log(bayes):.1f}" if bayes > 0 else '0'
        if bayes=='10.0':
            bayes = '>10'
            

        starp = starb = ''
        if not row['sig_p_value']:
            starp = '*'
        if i == 0:
            color = 'green'
            html_1 += f"""
<tr>
    <td rowspan={num_affinitive} style="background-color:white;color:{color};"><b>Affinitive</b></td>"""
            pass            
        elif i == num_affinitive:
            color = 'red'
            html_1 += f"""
<tr style="border-top: 3px solid black;">
    <td rowspan={num_aversive} style="background-color:white;color:{color};"><b>Aversive</b></td>"""
        else:
            html_1 += f"""
<tr>"""
            pass
        
        voynichese_value = display_voynichese(text=index, color=color, render=False)
    #     html_1 += f"""            
    # <td style='color:{color};'>{voynichese_value}</td>
        html_1 += f"""            
    <td>{voynichese_value}</td>
    <td>{index}</td>
    <td>{expected_count}</td>
    <td>{observed_count}</td>
    <td>{propensity:.1f}</td>
    <td>{p_value:.6f}{starp}</td>
    <td>{bayes}{starb}</td>
</tr>"""
            

    html_1 += """
</table>"""
    html += html_1


    # Display the HTML table
    display(HTML(html))
    if file_name:
        current_dir = os.getcwd()
        print(f"current_dir = {current_dir}")
        html_filename = file_name + '.html'
        absolute_html_file_path = os.path.join(current_dir, html_filename)
        png_filename = file_name + '.png'
        absolute_png_file_path = os.path.join(current_dir, png_filename)
        with open(html_filename, 'w') as file:
            file.write(html_top + html + html_bottom)
            print(f"Wrote {html_filename}")
        print(f"absolute_html_file_path = {absolute_html_file_path}")
        render_html_to_image(absolute_html_file_path, absolute_png_file_path, width=width, height=height+129, crop=True)



## Create Summary DataFrames for each Cohort

### Token Propensity DataFrame

In [34]:
token_propensity_dfs = {}
print(f"smoothing: {smooth}")
print(f"Summary Dataframes, p_value_threshold= {THRESHOLDS['p_value']}, bayes_threshold={THRESHOLDS['bayes_factor']:.1f}")
N_tokens_df = pd.DataFrame(columns = ['cohort',  'N_p', 'N_p_af', 'N_p_av', 'N_b', 'N_b_af', 'N_b_av',  'N_either', 'N_both'])

for cohort in ['BEFORE', 'AFTER']:
    if cohort == 'MIDDLE':
        continue
    token_propensity_dfs[cohort] = compile_token_propensity_df(cohort, 
                                                               reference_cohort, 
                                                               THRESHOLDS['p_value'], 
                                                               THRESHOLDS['bayes_factor'])
    df = token_propensity_dfs[cohort]
    # print(f"DEBUG {cohort}:  {len(df)}")
    N_p = len(df[ df['sig_p_value']])
    N_p_af =  len(df[ df['sig_p_value'] & (df['propensity']>0) ])
    N_p_av =  len(df[ df['sig_p_value'] & (df['propensity']<0) ])
    N_b = len(df[ df['sig_BF']])
    N_b_af =  len(df[ df['sig_BF'] & (df['propensity']>0) ])
    N_b_av =  len(df[ df['sig_BF'] & (df['propensity']<0) ])
    N_either =  len(df[ df['sig_BF'] | df['sig_p_value'] ])
    N_both =  len(df[ df['sig_BF'] & df['sig_p_value'] ])
    N_tokens_df.loc[len(N_tokens_df)] = [cohort,
                       N_p,
                       N_p_af,
                       N_p_av,
                       N_b,
                       N_b_af,
                       N_b_av,
                       N_either,
                       N_both]
                                             
    pass

pass
N_tokens_df

smoothing: None
Summary Dataframes, p_value_threshold= 0.01, bayes_threshold=148.4


Unnamed: 0,cohort,N_p,N_p_af,N_p_av,N_b,N_b_af,N_b_av,N_either,N_both
0,BEFORE,9,8,0,6,5,0,9,6
1,AFTER,9,9,0,3,3,0,9,3


### Split Words Token Propensity DataFrame

In [35]:
split_word_token_propensity_dfs = {}
N_split_words_tokens_df = pd.DataFrame(columns = ['cohort',  'N_p', 'N_p_af', 'N_p_av', 'N_b', 'N_b_af', 'N_b_av',  'N_either', 'N_both'])
for cohort in ['BEFORE', 'AFTER']:
    split_word_token_propensity_dfs[cohort] = compile_token_propensity_df(cohort, 
                                                                          reference_cohort, 
                                                                          THRESHOLDS['p_value'], 
                                                                          THRESHOLDS['bayes_factor'],
                                                                          consider_split_words=True)
    df = split_word_token_propensity_dfs[cohort]
    # print(f"DEBUG {cohort}:  {len(df)}")
    N_p = len(df[ df['sig_p_value']])
    N_p_af =  len(df[ df['sig_p_value'] & (df['propensity']>0) ])
    N_p_av =  len(df[ df['sig_p_value'] & (df['propensity']<0) ])
    N_b = len(df[ df['sig_BF']])
    N_b_af =  len(df[ df['sig_BF'] & (df['propensity']>0) ])
    N_b_av =  len(df[ df['sig_BF'] & (df['propensity']<0) ])
    N_either =  len(df[ df['sig_BF'] | df['sig_p_value'] ])
    N_both =  len(df[ df['sig_BF'] & df['sig_p_value'] ])
    N_split_words_tokens_df.loc[len(N_split_words_tokens_df)] = [cohort,
                       N_p,
                       N_p_af,
                       N_p_av,
                       N_b,
                       N_b_af,
                       N_b_av,
                       N_either,
                       N_both]
    pass

pass
N_split_words_tokens_df


Unnamed: 0,cohort,N_p,N_p_af,N_p_av,N_b,N_b_af,N_b_av,N_either,N_both
0,BEFORE,13,8,0,4,4,0,13,4
1,AFTER,9,8,0,4,4,0,9,4


## Comparison of BEFORE

In [36]:
before_reg_df = extract_df('BEFORE', 'both', 'tokens', token_propensity_dfs )
before_reg_df

Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
chol,4,3807,180,349,3,0.047281,0.008596,4.677319e-05,True,True,0.2,5751.381813,4.7e-05,0.0
chor,4,3807,107,349,0,0.028106,0.0,4.775047e-05,True,True,0.0,20942.20029,4.8e-05,0.0
s,1,3807,75,349,19,0.019701,0.054441,8.424169e-05,True,True,2.8,1645.754299,0.999973,0.0
dy,2,3807,28,349,23,0.007355,0.065903,4.773959e-15,True,True,9.0,22026.465795,1.0,0.0
dam,3,3807,7,349,5,0.001839,0.014327,0.0005220567,True,True,7.8,377.795542,0.999946,0.0
qotaiin,7,3807,2,349,4,0.000525,0.011461,4.004931e-05,True,True,21.8,5089.788084,0.999999,0.0


In [37]:
before_split_df = extract_df('BEFORE', 'both', 'tokens', split_word_token_propensity_dfs )
before_split_df

Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
chol,4,3807,180,349,5,0.051484,0.008596,0.0002566245,True,True,0.2,524.804901,0.000257,0.0
dy,2,3807,28,349,23,0.01077,0.065903,1.064648e-11,True,True,6.1,22026.465795,1.0,0.0
dam,3,3807,7,349,5,0.001839,0.014327,0.0005220567,True,True,7.8,377.795542,0.999946,0.0
qotaiin,7,3807,2,349,4,0.000525,0.011461,4.004931e-05,True,True,21.8,5089.788084,0.999999,0.0


In [38]:
display_cohort_tendency_summary('BEFORE', 'tokens',  width=630, height=500)
display_cohort_tendency_summary('BEFORE', 'split_words', 'voynich_data/outputs/T_SPLIT_WORDS_token_propensities_BEFORE', width=630, height=500)


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,qotaiin,qotaiin,0,4,21.8,4e-05,8.5
Affinitive,dy,dy,3,23,9.0,0.0,>10
Affinitive,dam,dam,1,5,7.8,0.000522,5.9
Affinitive,s,s,7,19,2.8,8.4e-05,7.4
Aversive,chor,chor,10,0,0.0,4.8e-05,9.9
Aversive,chol,chol,17,3,0.2,4.7e-05,8.7


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,qotaiin,qotaiin,0,4,21.8,4e-05,8.5
Affinitive,dam,dam,1,5,7.8,0.000522,5.9
Affinitive,dy,dy,4,23,6.1,0.0,>10
Aversive,chol,chol,18,5,0.2,0.000257,6.3


current_dir = /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24
Wrote voynich_data/outputs/T_SPLIT_WORDS_token_propensities_BEFORE.html
absolute_html_file_path = /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_BEFORE.html
Document Dimensions: 630 629
Screenshot saved to: /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_BEFORE.png
Cropped image saved as '/Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_BEFORE.png'


## Comparison of AFTER

In [39]:
after_reg_df = extract_df('AFTER', 'both', 'tokens', token_propensity_dfs )
after_reg_df

Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dchol,5,3807,3,278,6,0.000788,0.021583,1.210362e-07,True,True,27.4,22026.465795,1.0,0.0
oteey,5,3807,3,278,3,0.000788,0.010791,0.001474609,True,True,13.7,161.412632,0.999921,0.0
sol,3,3807,1,278,3,0.000263,0.010791,6.081986e-05,True,True,41.1,3771.598637,0.999999,0.0


In [40]:
after_split_df = extract_df('AFTER', 'both', 'tokens', split_word_token_propensity_dfs )
after_split_df

Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dain,4,3807,33,278,10,0.009719,0.028777,0.0004617016,True,True,3.0,283.449737,0.999892,0.0
saiin,5,3807,8,278,5,0.002364,0.014388,0.0005776829,True,True,6.1,304.840987,0.999939,0.0
dchol,5,3807,3,278,6,0.000788,0.021583,1.210362e-07,True,True,27.4,22026.465795,1.0,0.0
sol,3,3807,1,278,3,0.000263,0.010791,6.081986e-05,True,True,41.1,3771.598637,0.999999,0.0


In [41]:
display_cohort_tendency_summary('AFTER',  'tokens', width=630, height=500)
display_cohort_tendency_summary('AFTER',   'split_words',  'voynich_data/outputs/T_SPLIT_WORDS_token_propensities_AFTER', width=630, height=500)


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,sol,sol,0,3,41.1,6.1e-05,8.2
Affinitive,dchol,dchol,0,6,27.4,0.0,>10
Affinitive,oteey,oteey,0,3,13.7,0.001475,5.1


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,sol,sol,0,3,41.1,6.1e-05,8.2
Affinitive,dchol,dchol,0,6,27.4,0.0,>10
Affinitive,saiin,saiin,1,5,6.1,0.000578,5.7
Affinitive,dain,dain,3,10,3.0,0.000462,5.6


current_dir = /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24
Wrote voynich_data/outputs/T_SPLIT_WORDS_token_propensities_AFTER.html
absolute_html_file_path = /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_AFTER.html
Document Dimensions: 630 629
Screenshot saved to: /Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_AFTER.png
Cropped image saved as '/Users/andrew/GITHUB/Z01-SOM_Histocrypt_24/voynich_data/outputs/T_SPLIT_WORDS_token_propensities_AFTER.png'


## Analyzing particular tokens

In [44]:
def inspect_single_token(single_token):
    prob_reg = pmfs_by_c['MIDDLE'].prob(single_token)
    prob_split,_ = prob_starts_with(pmfs_by_c['MIDDLE'], single_token)
    print(f"Inspecting {single_token}")
    print(f"\t  Reg Prob={prob_reg:.2%}")
    print(f"\tSplit Prob={prob_split:.2%}")
    display_cohort_tendency_summary('BEFORE', 'tokens', single_token=single_token)
    display_cohort_tendency_summary('BEFORE', 'split_words', single_token=single_token)


### Inspecting dy

In [45]:
inspect_single_token('dy')

Inspecting dy
	  Reg Prob=0.74%
	Split Prob=1.08%


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,dy,dy,3,23,9.0,0.0,>10


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,dy,dy,4,23,6.1,0.0,>10


In [46]:
before_reg_df[before_reg_df.index == 'dy']


Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dy,2,3807,28,349,23,0.007355,0.065903,4.773959e-15,True,True,9.0,22026.465795,1.0,0.0


In [47]:
before_split_df[before_split_df.index == 'dy']


Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dy,2,3807,28,349,23,0.01077,0.065903,1.064648e-11,True,True,6.1,22026.465795,1.0,0.0


### Inspecting chor

In [48]:
inspect_single_token('chor')

Inspecting chor
	  Reg Prob=2.81%
	Split Prob=3.02%


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,chor,chor,10,0,0.0,4.8e-05,9.9


KeyError: "None of [Index(['chor'], dtype='object', name='token')] are in the [index]"

In [None]:
before_reg_df[before_reg_df.index == 'chor']


In [111]:
before_split_df[before_split_df.index == 'chor']


Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
chor,4,3807,107,349,0,0.030208,0.0,2.2e-05,True,True,0.0,22026.465795,2.2e-05,0.0


### Inspecting cheo

In [115]:
inspect_single_token('cheo')

Inspecting cheo
	  Reg Prob=0.26%
	Split Prob=2.29%


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,cheo,cheo,1,0,0.0,0.399341,0.9


Tilt,Token,Token,Counts,Counts,Stats,Stats,Stats
Tilt,Voynichese,Eva-,expected,observed,Propensity,p-value,log(B)
Affinitive,cheo,cheo,8,0,0.0,0.000313,8.1


In [120]:
before_reg_df[before_reg_df.index == 'cheo']


Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cheo,4,3807,10,349,0,0.002627,0.0,0.399341,True,True,0.0,2.504123,0.399341,0.0


In [114]:
before_split_df[before_split_df.index == 'cheo']


Unnamed: 0_level_0,glyph_count,N_ref,n_ref,N_x,n_x,p_ref,p_x,p_value,sig_p_value,sig_BF,propensity,bayes,binom_stat_le,binom_stat_gt
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cheo,4,3807,10,349,0,0.022853,0.0,0.000313,True,True,0.0,3191.082185,0.000313,0.0


In [49]:
inspect_single_token('dain')

Inspecting dain
	  Reg Prob=0.87%
	Split Prob=0.87%


KeyError: "None of [Index(['dain'], dtype='object', name='token')] are in the [index]"