# Description 
This notebook mainly provides the code for answering the following research questions:

        3) How can formulas be identified?
                a) What are the most frequent or significant formulas across charters
                b) How are formulas distributed across different sections of charters

# Imports and settings

In [1]:
import pandas as pd

In [2]:
from formutils import *

In [3]:
df_full_no_punct = pd.read_json("../data-push/0d-sampling/df-merged-nopunct.json", encoding="utf-8")

df_main = df_full_no_punct[df_full_no_punct["type"] == "main"]
df_main_sample = df_main.sample(n=700, random_state=42)

text = "".join(df_main["text"].to_list()).lower()
text_sample = "".join(df_main_sample["text"].to_list()).lower()

# What are the top n-grams?

In [4]:
n = 3
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,oder horent lesen,2696
1,darnach in dem,2673
2,ze sant gallen,2559
3,an disen brief,1926
4,nach in dem,1831
5,allen den die,1822
6,dar nach in,1803
7,man zalt von,1722
8,do man zalt,1651
9,an disem brief,1547


In [5]:
n = 4
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,dar nach in dem,1769
1,do man zalt von,1561
2,lesent oder horent lesen,1324
3,jar darnach in dem,1189
4,den die disen brief,1019
5,gotzhus ze sant gallen,924
6,sehent oder horent lesen,861
7,an disen brief der,850
8,der brief ist geben,821
9,allen den die disen,814


In [6]:
n = 5
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,jar dar nach in dem,801
1,an disen brief der geben,707
2,hundert jar darnach in dem,631
3,disen brief der geben ist,541
4,do man zalt von gottes,533
5,allen den die disen brief,495
6,man zalt von gottes geburt,493
7,jar vnd darnach in dem,479
8,iar dar nach in dem,461
9,vnd tun chunt allen den,444


In [7]:
n = 6
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,an disen brief der geben ist,532
1,do man zalt von gottes geburt,486
2,disen brief der geben ist ze,375
3,in dem jar do man zalt,372
4,hundert jar dar nach in dem,370
5,dem jar do man zalt von,370
6,man zalt von gottes geburt druzehenhundert,348
7,gehenkt an disen brief der geben,316
8,fur mich vnd fur alle min,312
9,fur vns vnd fur alle vnser,303


In [8]:
n = 7
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,an disen brief der geben ist ze,369
1,in dem jar do man zalt von,368
2,do man zalt von gottes geburt druzehenhundert,346
3,fur mich vnd fur alle min erben,299
4,man zalt von gottes geburt druzehenhundert jar,298
5,gehenkt an disen brief der geben ist,277
6,die disen brief ansehent lesent oder horent,249
7,disen brief ansehent lesent oder horent lesen,248
8,den die disen brief ansehent lesent oder,227
9,dem jar do man zalt von gottes,200


In [9]:
n = 8
k = 10

top_grams = get_top_ngrams(text, n, k, count=True)
top_grams_df = dict_to_dataframe(ngrams_to_dict(top_grams), columns=["Formula", "Count"])
# top_grams_df.to_latex(f"../data-push/3-formulas/top_{n}_grams.tex", 
#     index=False,
#     column_format="l S[table-format=4.2]"
# )
top_grams_df

Unnamed: 0,Formula,Count
0,do man zalt von gottes geburt druzehenhundert jar,297
1,die disen brief ansehent lesent oder horent lesen,248
2,den die disen brief ansehent lesent oder horent,202
3,in dem jar do man zalt von gottes,200
4,dem jar do man zalt von gottes geburt,194
5,gehenkt an disen brief der geben ist ze,178
6,jar do man zalt von gottes geburt druzehenhundert,158
7,offenlich gehenkt an disen brief der geben ist,148
8,den die disen brief lesent oder horent lesen,143
9,allen den die disen brief lesent oder horent,142


# What are the top n-grams by association score?

In [10]:
n = 3
min_freq = 100

scored_ngrams = get_ngram_by_score(text, n=n, min_freq=min_freq)
scored_ngrams_df = dict_to_dataframe(ngrams_to_dict(scored_ngrams), columns=["Formula", "Score"])
#scored_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_scored_grams.tex", index=False)
scored_ngrams_df

Unnamed: 0,Formula,Score
0,do man zalt,0.106242
1,oder horent lesen,0.103772
2,gottes geburt druzehenhundert,0.092829
3,wol getun mochten,0.058418
4,lesent oder horent,0.055447
...,...,...
1143,vnd redlich ze,0.000466
1144,vnd daz in,0.000461
1145,der rat vnd,0.000451
1146,vnd von allen,0.000429


In [11]:
n = 4
min_freq = 100

scored_ngrams = get_ngram_by_score(text, n=n, min_freq=min_freq)
scored_ngrams_df = dict_to_dataframe(ngrams_to_dict(scored_ngrams), columns=["Formula", "Score"])
#scored_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_scored_grams.tex", index=False)
scored_ngrams_df

Unnamed: 0,Formula,Score
0,lesent oder horent lesen,0.044824
1,sehent oder horent lesen,0.028632
2,gottes geburt druzehenhundert jar,0.024113
3,do man zalt von,0.021126
4,dar nach in dem,0.016146
...,...,...
481,ze steyr vnd ze,0.000391
482,ze veld vnd ze,0.000391
483,vnd gewonhaiten vnd mit,0.000384
484,ze haben vnd ze,0.000381


In [12]:
n = 5
min_freq = 100

scored_ngrams = get_ngram_by_score(text, n=n, min_freq=min_freq)
scored_ngrams_df = dict_to_dataframe(ngrams_to_dict(scored_ngrams), columns=["Formula", "Score"])
##scored_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_scored_grams.tex", index=False)
scored_ngrams_df

Unnamed: 0,Formula,Score
0,ansehent lesent oder horent lesen,0.012692
1,sehent lesent oder horent lesen,0.008588
2,do man zalt von gottes,0.006970
3,man zalt von gottes geburt,0.006936
4,prief lesent oder horent lesen,0.006921
...,...,...
243,mit meinem insigel vnd mit,0.000412
244,ze haben vnd allen irn,0.000405
245,mit allen rehten vnd mit,0.000398
246,hundert jar vnd in dem,0.000395


# Get skip-grams

In [13]:
n = 5
k = 2
lower = 1

result = get_skip_grams_by_score(text_sample, n, k, lower)
candidates = scored_ngrams_to_dict(result)
candidates_df = dict_to_dataframe(candidates, columns=["Formula", "Score"])
#candidates_df.to_latex(f"../data-push/3-formulas/top_{n}_skip_grams.tex", index=False)
candidates_df

Unnamed: 0,Formula,Score
0,den die in sehent (horent) oder,280.206613
1,brief sehent (lesent) oder horent lesen,278.343561
2,abt (herman) des gotzhus ze sant,263.208096
3,in sehent (lesent) oder horent lesen,255.964126
4,fur mich vnd (fur) (alle) min erben,249.710282
...,...,...
5613,daz (in) (daz) von vns vnd von,9.201039
5614,vnd daz (in) daz (also) stat vnd,9.102974
5615,vnd (richtern) (vnd) den die an den,9.086398
5616,vnd daz in daz (alles) (stat) von,8.759024


In [14]:
n = 6
k = 3
lower = 1

result = get_skip_grams_by_score(text_sample, n, k, lower)
candidates = scored_ngrams_to_dict(result)
candidates_df = dict_to_dataframe(candidates, columns=["Formula", "Score"])
#candidates_df.to_latex(f"../data-push/3-formulas/top_{n}_skip_grams.tex", index=False)
candidates_df

Unnamed: 0,Formula,Score
0,disen brief ansehent (lesent) oder horent lesen,234.022651
1,disen brief sehent (lesent) oder horent lesen,229.053391
2,dreuczehen hundert jar (vnd) darnach in dem,212.939827
3,fur mich vnd (fur) alle min erben,207.416796
4,die disen brief ansehent (lesent) oder horent,205.553721
...,...,...
9309,vnd (losen) von (allem) dem schaden in den,11.674458
9310,vnd (vnbesucht) (vnd) mit allen den rechten vnd,11.669552
9311,di (guter) (di) von in und von dem,11.642846
9312,vnd (vnsru) (kind) (vnd) der erben vnd alle ir,11.609376


# How are charters distributed across sections?

In [15]:
n = 5
percentiles = 10
k = 2

segmented_ngrams = get_segmented_ngrams(df_main, n=n, percentiles=percentiles, most_common=k)
segmented_ngrams_df = rename_columns(
    pd.DataFrame.from_dict(segmented_ngrams, orient="index")
    .applymap(modify_cell)
    )
segmented_ngrams_df.index = segmented_ngrams_df.index + 1
segmented_ngrams_df = segmented_ngrams_df.reset_index()
segmented_ngrams_df.columns = ["Segment", "1", "2"]
#segmented_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_segments_{k}_p_{str(percentiles)}.tex", index=False)
segmented_ngrams_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"segment_ngrams_{n}"] = df["text"].apply(segment_ngrams, n=n, percentiles=percentiles)


Unnamed: 0,Segment,1,2
0,1,"allen den die disen brief, 951","vnd tun chunt allen den, 390"
1,2,"zu der zeit do wir, 113","sehent oder horent lesen daz, 89"
2,3,"swie so daz genant ist, 48","des gotzhus ze sant gallen, 33"
3,4,"phunt wienner phenning der wir, 41","dem gotzhus ze sant gallen, 27"
4,5,"ze haben vnd allen irn, 44","geben swem si wellen an, 43"
5,6,"vnd scherm fur alle ansprach, 46","mit recht noch an recht, 43"
6,7,"vnd scherm fur alle ansprach, 62","scherm fur alle ansprach als, 57"
7,8,"haben in dem lande ze, 98","daz wir haben in dem, 94"
8,9,"an disen brief der geben, 111","insigel gehenkt an disen brief, 84"
9,10,"jar dar nach in dem, 633","hundert jar darnach in dem, 529"


In [17]:
n = 6
percentiles = 10
k = 3

segmented_ngrams = get_segmented_ngrams(df_main, n=n, percentiles=percentiles, most_common=k)
segmented_ngrams_df = rename_columns(
    pd.DataFrame.from_dict(segmented_ngrams, orient="index")
    .applymap(modify_cell)
    )
segmented_ngrams_df.index = segmented_ngrams_df.index + 1
segmented_ngrams_df = segmented_ngrams_df.reset_index()
#segmented_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_segments_{k}_p_{str(percentiles)}.tex", index=False)
segmented_ngrams_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"segment_ngrams_{n}"] = df["text"].apply(segment_ngrams, n=n, percentiles=percentiles)


Unnamed: 0,index,1,2,3
0,1,"allen den die disen brief ansehent, 355","die disen brief ansehent lesent oder, 263","brief ansehent lesent oder horent lesen, 246"
1,2,"der zeit do wir iz wol, 78","der zeit do wir ez wol, 63","hant zu der zeit do wir, 52"
2,3,"von dem gotzhus ze sant gallen, 25",gestift oder vngestift versucht oder vnuersuch...,"vnuersucht swie so daz genant ist, 19"
3,4,"weder mit recht noch an recht, 20","mit allen rehten vnd mit aller, 19","vnd mit gut vnd mit allen, 18"
4,5,"geben swem si wellen an allen, 41","vnd geben swem si wellen an, 36","ze schaffen verchauffen versetzen vnd geben, 33"
5,6,"geben swem si wellen an allen, 33","gewern vnd scherm fur alle ansprach, 33","vnd scherm fur alle ansprach als, 33"
6,7,"recht ist vnd des landes recht, 41","vnd scherm fur alle ansprach als, 41","gewern vnd scherm fur alle ansprach, 39"
7,8,"wir haben in dem lande ze, 88","haben in dem lande ze osterreich, 70","daz wir haben in dem lande, 67"
8,9,"an disen brief der geben ist, 71","disen brief der geben ist ze, 50","vnd ze ainem waren vrkunde vnd, 45"
9,10,"do man zalt von gottes geburt, 442","an disen brief der geben ist, 401","in dem jar do man zalt, 352"


In [18]:
n = 7
percentiles = 10
k = 3

segmented_ngrams = get_segmented_ngrams(df_main, n=n, percentiles=percentiles, most_common=k)
segmented_ngrams_df = rename_columns(
    pd.DataFrame.from_dict(segmented_ngrams, orient="index")
    .applymap(modify_cell)
    )
segmented_ngrams_df.index = segmented_ngrams_df.index + 1
segmented_ngrams_df = segmented_ngrams_df.reset_index()
#segmented_ngrams_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_segments_{k}_p_{str(percentiles)}.tex", index=False)
segmented_ngrams_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"segment_ngrams_{n}"] = df["text"].apply(segment_ngrams, n=n, percentiles=percentiles)


Unnamed: 0,index,1,2,3
0,1,"die disen brief ansehent lesent oder horent, 241","disen brief ansehent lesent oder horent lesen,...","allen den die disen brief ansehent lesent, 228"
1,2,"der zeit do wir ez wol getun, 43","zv der zeit do wir iz wol, 42","mit gesampter hant zu der zeit do, 42"
2,3,gestift oder vngestift versucht oder vnuersuch...,oder vngestift versucht oder vnuersucht swie s...,vngestift versucht oder vnuersucht swie so daz...
3,4,"mit lib vnd mit gut vnd mit, 17","fur mich vnd fur alle min erben, 17","recht vnd redleich verchauft vnd geben mit, 15"
4,5,"vnd geben swem si wellen an allen, 35","geben swem si wellen an allen irresal, 29","swem si wellen an allen irresal vnd, 26"
5,6,"gewern vnd scherm fur alle ansprach als, 25","recht gewern vnd scherm fur alle ansprach, 24","vnd geben swem si wellen an allen, 23"
6,7,"recht ist vnd des landes recht ze, 31","recht gewern vnd scherm fur alle ansprach, 29","gewern vnd scherm fur alle ansprach als, 28"
7,8,"wir haben in dem lande ze osterreich, 67","daz wir haben in dem lande ze, 64","allem vnserm gut daz wir haben in, 49"
8,9,"an disen brief der geben ist ze, 46","fur mich vnd fur alle min erben, 36","gehenkt an disen brief der geben ist, 32"
9,10,"in dem jar do man zalt von, 347","do man zalt von gottes geburt druzehenhundert,...","an disen brief der geben ist ze, 272"


# How are formulas distributed across time?

In [19]:
n = 3
k = 3

ngrams_decade = get_top_ngrams_by_decade(df_main, n=n, most_common=k)
ngrams_decade_df = pd.DataFrame.from_dict(ngrams_decade, orient="index").applymap(modify_cell).reset_index()
ngrams_decade_df["index"] = ngrams_decade_df["index"].astype(int)
#ngrams_decade_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_decades.tex", index=False)
ngrams_decade_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decade"] = df["year"].apply(lambda x: (x // 10) * 10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ngrams"] = df["text"].apply(lambda x: get_top_ngrams(x.lower(), n, most_common))


Unnamed: 0,index,0,1,2
0,1130,"sol der selbig, 1","apt von sant, 1","von sant gallenn, 1"
1,1220,"heinrich der elter, 1","an unser mautt, 1","wir heinrich der, 1"
2,1230,"ain latinsch alte, 1","item ain latinsch, 1","latinsch alte collation, 1"
3,1250,"in deme namen, 2","von sante gallin, 1","vnser herre der, 1"
4,1260,"die leut von, 1","von dem heiligen, 1","ich alber der, 1"
5,1270,"uon sante gallen, 2","ze sante gallen, 2","in dem namen, 2"
6,1280,"dem gotshause von, 3","der probst von, 2","gotshause von frisingen, 2"
7,1290,"ich otte von, 5","ich chunrat von, 4","vnd ist daz, 4"
8,1300,"ich ott von, 6","von walsse vnd, 5","vnd allen irn, 5"
9,1310,"ze sant gallen, 19","allen die disen, 6","von sant gallen, 6"


In [20]:
n = 4
k = 3

ngrams_decade = get_top_ngrams_by_decade(df_main, n=n, most_common=k)
ngrams_decade_df = pd.DataFrame.from_dict(ngrams_decade, orient="index").applymap(modify_cell).reset_index()
ngrams_decade_df["index"] = ngrams_decade_df["index"].astype(int)
#ngrams_decade_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_decades.tex", index=False)
ngrams_decade_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decade"] = df["year"].apply(lambda x: (x // 10) * 10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ngrams"] = df["text"].apply(lambda x: get_top_ngrams(x.lower(), n, most_common))


Unnamed: 0,index,0,1,2
0,1130,"es ist ain instrumennt, 1","apt von sant gallenn, 1","item es ist ain, 1"
1,1220,"der elter undt heinrich, 1","heinrich der elter undt, 1","wir heinrich der elter, 1"
2,1230,"item ain latinsch alte, 1","ain latinsch alte collation, 1","latinsch alte collation wie, 1"
3,1250,"vnsir herre der bishof, 1","die svne die vnser, 1","vnser herre der abbit, 1"
4,1260,"von dem heiligen chreutz, 1","die leut von den, 1","vber die leut von, 1"
5,1270,"auf daz guet daz, 1","mines herren des bishoffes, 1","minem herren dem bishof, 1"
6,1280,"allen den die disen, 3","dem gotshause von frisingen, 2","von sant andre herre, 2"
7,1290,"wir albrecht von gotes, 4","ich jans von ror, 3","den die disen brief, 3"
8,1300,"seinem chloster ze wilhering, 5","vnd seinem chloster ze, 5","ich vnd mein hovsvrowe, 4"
9,1310,"allen den die disen, 7","allen dien die disen, 7","allen die disen brief, 5"


In [21]:
n = 5
k = 3

ngrams_decade = get_top_ngrams_by_decade(df_main, n=n, most_common=k)
ngrams_decade_df = pd.DataFrame.from_dict(ngrams_decade, orient="index").applymap(modify_cell).reset_index()
ngrams_decade_df["index"] = ngrams_decade_df["index"].astype(int)
#ngrams_decade_df.to_latex(f"../data-push/3-formulas/top_{n}_gram_decades.tex", index=False)
ngrams_decade_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decade"] = df["year"].apply(lambda x: (x // 10) * 10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ngrams"] = df["text"].apply(lambda x: get_top_ngrams(x.lower(), n, most_common))


Unnamed: 0,index,0,1,2
0,1130,"item es ist ain instrumennt, 1","es ist ain instrumennt gemacht, 1","ist ain instrumennt gemacht vnd, 1"
1,1220,"der elter undt heinrich der, 1","wir heinrich der elter undt, 1","heinrich der elter undt heinrich, 1"
2,1230,"latinsch alte collation wie herr, 1","ain latinsch alte collation wie, 1","item ain latinsch alte collation, 1"
3,1250,"die svne die vnser herre, 1","herre der abbit berchtolt von, 1","vnser herre der abbit berchtolt, 1"
4,1260,"ich alber der sthuchs von, 1","auf der herren guet daz, 1","alber der sthuchs von trautmansdorf, 1"
5,1270,"in dem namen vnsers herren, 2","alheit von reinsperch witeb hern, 1","ich alheit von reinsperch witeb, 1"
6,1280,"gottes namen amen alle die, 3","in gottes namen amen alle, 3","allen den die disen brief, 2"
7,1290,"allen den die disen brief, 4","von volkenstorf vergihe an disem, 2","ich chvnrat von volkenstorf vergihe, 2"
8,1300,"vnd seinem chloster ze wilhering, 6","walsse vnd allen sinen erben, 3","von walsse vnd allen sinen, 3"
9,1310,"dien die disen gegenwrtigen brief, 6","allen dien die disen gegenwrtigen, 6","wir von gottes genaden abt, 4"
