In [1]:
from google.colab import drive
drive.mount('/content/drive')
folder = "/content/drive/MyDrive/DH"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sentence-transformers matplotlib scipy scikit-learn



In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

exclude = ["Sermons, English.",
           "Sermons,English",
           "Sermons.",
           "Early works to 1800.",
           "16th century.",
           "17th century.",
           "-- Sermons -- Early works to 1800."]
exclude = re.compile("|".join(exclude))
def read_data(file):
  sermon_metadata = pd.read_csv(file)
  pubyears = sermon_metadata['date']
  years = []
  for i, year in enumerate(pubyears):
      if "-" in year:
          year = year.split("-")[0] # earliest possible pub year
          years.append(year)
  subject_headings = sermon_metadata['subject_headings']
  titles = list(sermon_metadata['title'])
  tcpIDs = sermon_metadata['id']
  info_dict = {}
  for idx, t in enumerate(titles): # place subject headings at the end of titles
      if isinstance(subject_headings[idx],str):
          subjects = []
          for s, subject in enumerate(subject_headings[idx].split("; ")):
            subject = re.sub(r"\s+"," ",subject)
            subject = re.sub(exclude,"",subject).strip()
            if subject == 'English': continue
            if not re.search(r"[A-Za-z]",subject): continue
            subjects.append(subject)
          title_subjects = f"{t} {'; '.join(subjects)}"
          # title_subjects = re.sub("--"," ",title_subjects)
          # title_subjects = re.sub("\s+"," ",title_subjects)
          t = title_subjects
      t = [word for word in t.split() if word.lower() not in stop_words and not re.search("sermon|preach",word.lower())]
      info_dict[tcpIDs[idx]] = " ".join(t)
  return info_dict

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
docs = read_data(f'{folder}/Early-Modern-Sermons/assets/sermons.csv')
missing_docs = read_data(f'{folder}/Early-Modern-Sermons/assets/sermons_missing.csv')
docs.update(missing_docs)
len(docs)

5729

In [None]:
corpus = list(docs.values())
ids = list(docs.keys())
len(corpus)

5729

In [3]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util, models
from sentence_transformers.util import semantic_search, pytorch_cos_sim

In [4]:
bi_encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
!pip install unidecode
from unidecode import unidecode
import unicodedata
import re, json
import html, tqdm

def normalize_text(text):
    text = html.unescape(text)
    text = unicodedata.normalize('NFKD', text)
    text = unidecode(text)
    text = re.sub(r"\s+", " ", text)
    return text
def clean_text(s):
  s = normalize_text(s)
  s = re.sub(r"</i>|<NOTE>|NONLATINALPHABET|<i>"," ",s) # \d+\^PAGE[S]*\^MISSING"
  s = re.sub(r"\s+"," ",s)
  return s.strip(" ")

outfname,inputfname = "M_QP", f"{folder}/segments/all_unique_marginalia.json"
with open(inputfname,"r") as file:
  texts = json.load(file)
temp = {}
for t, meta in tqdm.tqdm(texts.items()):
  t = clean_text(t)
  if t not in temp: temp[t] = []
  temp[t].extend(meta)
texts = temp
len(texts)



100%|██████████| 435906/435906 [00:18<00:00, 23296.70it/s]


415627

In [6]:
list(texts.items())[0]

('i. cor. xv',
 [['A73502', '7', 'Note 0'],
  ['A73502', '22', 'Note 0'],
  ['A73502', '39', 'Note 0'],
  ['A09915', '122', 'Note 0'],
  ['A16087', '7', 'Note 0'],
  ['A16087', '21', 'Note 0'],
  ['A16087', '39', 'Note 0'],
  ['A16945', '73', 'Note 0'],
  ['A16945', '591', 'Note 0']])

In [9]:
import torch
vectors = bi_encoder.encode(list(texts.keys()),batch_size=256, convert_to_tensor=True, show_progress_bar=True)
torch.save(vectors, f'{folder}/EEPS/all_margins_all-mpnet-base-v2.pt')

Batches:   0%|          | 0/1624 [00:00<?, ?it/s]

In [10]:
from google.colab import runtime
runtime.unassign()

In [None]:
model_checkpoint = 'emanjavacas/MacBERTh'
word_embedding_model = models.Transformer(model_checkpoint, max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "mean")
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
title_vec = bi_encoder.encode(corpus,convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/180 [00:00<?, ?it/s]

In [None]:
import torch
# torch.save(title_vec, f'{folder}/EEPS/titles_all-mpnet-base-v2.pt')
title_vec = torch.load(f'{folder}/EEPS/titles_all-mpnet-base-v2.pt')

In [None]:
query = ['funeral','marriage','civil war 1649']
q_embedding = bi_encoder.encode(query,convert_to_tensor=True)
hits = semantic_search(q_embedding, title_vec)
for idx, hitlist in enumerate(hits):
  print(query[idx])
  for hit in hitlist:
    print(hit['score'], corpus[hit['corpus_id']])
  print()

funeral
0.6675542593002319 mourners companion, or, Funeral discourses several texts John Shower. -- Funeral -- English
0.6427714824676514 mourning-ring, memory departed friend ... Mourning customs.; Laments.; Funeral
0.629880964756012 love-token mourners teaching spiritual dumbness submission Gods smarting rod : two funeral / Samuel Fisher M.A., late Brides London, Thornton Cheshire ; unto added, antidote fear death, meditations author time place great mortality. Funeral
0.6243175268173218 mans funeral. Lately delivered Chelsey, several persons honour worship. Thomas Fuller.
0.6104832887649536 Tvvo funeral much one subiect; wit, benefit death. former Philip. 1. 23. latter Eccles. 7. 1. Thomas Gataker B. D. pastor Rotherhith. Crisp, Rebekka.; Funeral
0.6086437702178955 Best choyce funerall / published desire friends dead. Bible.; N.T.; Philippians I, 23; Funeral
0.5924355983734131 funeral incomparable lady Honourable Lady Mary Armyne J.D., M.A. epistle elegy two grave divines. -- Lady, 

In [None]:
query = ['funeral','marriage','civil war 1649']
q_embedding = bi_encoder.encode(query,convert_to_tensor=True)
hits = semantic_search(q_embedding, title_vec)
for idx, hitlist in enumerate(hits):
  print(query[idx])
  for hit in hitlist:
    print(hit['score'], corpus[hit['corpus_id']])
  print()

funeral
0.8961145877838135 bridegroome Samuel Hieron
0.863551139831543 Gods omnipotencie prouidence
0.8108749389648438 lately Corinth. 3.15. reverend divine Church England
0.808027982711792 Three concerning sacred Trinity John Wallis. Trinity
0.8057557940483093 [The examination vsury two
0.8038545846939087 Select Dr. Whichcot [sic] two parts. Church England
0.8026604056358337 several occasions John Conant.
0.7992631793022156 [The dreadfull day dolorous wicked. two William Leigh.] Judgment Day;
0.7984390258789062 XXIX severall texts Scripture William Fenner. Church England
0.7976254820823669 die Innocencium sermo pro episcopo puerorum Boy bishops; 15th century.

marriage
0.8943576812744141 bridegroome Samuel Hieron
0.8649606108665466 Gods omnipotencie prouidence
0.8089074492454529 Three concerning sacred Trinity John Wallis. Trinity
0.8053290843963623 lately Corinth. 3.15. reverend divine Church England
0.8009976744651794 Select Dr. Whichcot [sic] two parts. Church England
0.80098330974

In [None]:
representation_model = KeyBERTInspired()
model = BERTopic(n_gram_range=(1,4),representation_model=representation_model,embedding_model='emanjavacas/MacBERTh')

In [None]:
corpus[:12]

['extempore upon malt, way caution good fellows; request two schollars, / lover ale, hallow [sic] tree. Temperance; Great Britain; Alcoholism; Great Britain; Broadsides; England; Broadsides; Scotland',
 'Englands Iliads nut-shell. Or, briefe chronologie battails, sieges, conflicts, remarkable passages beginning rebellion, 25. March, 1645. Great Britain; History; Civil War, 1642-1649.',
 "Truth prevailing fiercest opposition vindication Dr. Russel's True narrative Portsmouth disputation ... Also, upon Mat. 28. 19. Mr. John Williams ... also answer Presbyterian dialogue, another hand / published Mr. John Sharp ... moderator disputation Portsmouth. Russel, William, d. 1702.; True narrative Portsmouth disputation.; Infant baptism.",
 'true relation conversion baptism Isuf Turkish chaous, named Richard Christophilus presence full congregation, Jan. 30. 1658. Covent-Garden, Mr. Manton minister. Imprimatur, Edm. Calamy. Christophilus, Richard; Manton, Thomas, 1620-1677; Converts; Christian co

In [None]:
topics, probs = model.fit_transform(corpus)
# Save topic group for each tcpID
topics_df = pd.DataFrame({"tcpID": ids, "topic": topics, "probability":probs})




In [None]:
# Save topic words for each group
topic_words = model.get_topic_info().to_dict(orient='records')
idx_to_topic = []
for entry in topic_words:
  idx_to_topic.append({"topic_idx":entry['Topic'], "num_docs": entry['Count'], "topics": entry['Representation']})
topic_words_df = pd.DataFrame(idx_to_topic)
topic_words_df

Unnamed: 0,topic_idx,num_docs,topics
0,-1,2529,"[bible psalms, church england, lord bishop, ri..."
1,0,407,"[funeral mrs, funeral death, death funeral, re..."
2,1,313,"[christian life, jesus christ, minister gospel..."
3,2,162,"[one assembly divines bible, one assembly divi..."
4,3,159,"[ii king england 1630, king england 1650 1702,..."
5,4,157,"[catholic church controversial literature, chu..."
6,5,108,"[history civil war 1642, civil war 1642 1649, ..."
7,6,97,"[christian life bible, criticism interpretatio..."
8,7,95,"[christian life, devotional literature, substa..."
9,8,92,"[lord mayor city london, lord mayor court alde..."


In [None]:
topics_df.to_csv(f'{folder}/topics.csv', index=False,header=False)
topic_words_df.to_csv(f"{folder}/topic_words.csv",header=False,index=False)

# Get topic clusters for each era

In [None]:
import json
with open(f'{folder}/Early-Modern-Sermons/assets/corpora.json','r') as file:
        corpora = json.load(file)

era_tcpIDs = {era:{} for era in corpora}
for era in corpora:
    for prefix,ids in corpora[era].items():
        for tcpID in ids:
            era_tcpIDs[era][tcpID] = docs[tcpID]

In [None]:
def get_topics():
  representation_model = KeyBERTInspired()
  model = BERTopic(n_gram_range=(1,4),representation_model=representation_model,embedding_model='emanjavacas/MacBERTh') # all-mpnet-base-v2
  corpus = list(era_tcpIDs[era].values())
  ids = list(era_tcpIDs[era].keys())
  topics, probs = model.fit_transform(corpus)
  # Save topic group for each tcpID
  topics_df = pd.DataFrame({"tcpID": ids, "topic": topics, "probability":probs})
  topics_df.to_csv(f'{folder}/{era}_topics.csv', index=False,header=False)
  # Save topic words for each group
  topic_words = model.get_topics()
  idx_to_topic = []
  for idx, topiclist in topic_words.items():
    idx_to_topic.append({"topic_idx":idx, "topics": "; ".join([t[0] for t in topiclist])})
  topic_words_df = pd.DataFrame(idx_to_topic)
  topic_words_df.to_csv(f"{folder}/{era}_topic_words.csv",header=False,index=False)
  return model

In [None]:
preE = get_topics("pre-Elizabethan")
preE.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,61,-1_sermons of_sacrament_sermon of_sermons,"[sermons of, sacrament, sermon of, sermons, re...",[A sermon had at Paulis by the co[m]mandment o...
1,0,12,0_sermon annexed_sermon annexed vnto it_sermon...,"[sermon annexed, sermon annexed vnto it, sermo...","[The copie of a letter, sent to the ladye Mary..."
2,1,10,1_sermon of_preached before the_he preached be...,"[sermon of, preached before the, he preached b...",[A notable sermon concerninge the ryght vse of...


In [None]:
E = get_topics("Elizabethan")
E.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,16,-1_true martyrs of christ_true martyrs of_mart...,"[true martyrs of christ, true martyrs of, mart...",[The effect of certaine sermons touching the f...
1,0,268,0_sermons of_sermon preached at paules_sermon ...,"[sermons of, sermon preached at paules, sermon...",[A sermon preached in S. Peters Church in Exce...
2,1,18,1_controversial literature catholic church_cat...,"[controversial literature catholic church, cat...","[A copie of a challenge, taken owt [sic] of th..."
3,2,18,2_of the lords supper_the sacrament of the_of ...,"[of the lords supper, the sacrament of the, of...",[A setting open of the subtyle sophistrie of T...


In [None]:
J = get_topics("Jacobean")
J.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,333,-1_in sermon_preached at pauls crosse_in sermo...,"[in sermon, preached at pauls crosse, in sermo...",[The delights of the saints A most comfortable...
1,0,56,0_sermon preached at_sermons preached_fruitful...,"[sermon preached at, sermons preached, fruitfu...","[The way to glory, or, The preaching of the Go..."
2,1,43,1_church controversial literature_the church o...,"[church controversial literature, the church o...","[The height of Israels heathenish idolatrie, i..."
3,2,39,2_sermon preached before the_preached before t...,"[sermon preached before the, preached before t...",[A sermon preached in the cathedrall church of...
4,3,30,3_the epistle of saint_the epistle of_epistle ...,"[the epistle of saint, the epistle of, epistle...",[The triumph of a true Christian described: or...
5,4,30,4_1614 funeral sermons_funeral sermons the_fun...,"[1614 funeral sermons, funeral sermons the, fu...",[Sinnelesse sorrow for the dead a comfortable ...
6,5,26,5_literature sermon against_two sermons_sermon...,"[literature sermon against, two sermons, sermo...",[The yong mans gleanings Gathered out of diuer...
7,6,25,6_in sermon_of sicknesse_two sermons_in sermon...,"[in sermon, of sicknesse, two sermons, in serm...",[A posie of spirituall flowers taken out of th...
8,7,21,7_funerall sermon preached_the funerall of the...,"[funerall sermon preached, the funerall of the...",[A verie godlie and learned sermon treating of...
9,8,18,8_by the bishop of_the bishop of elie_preached...,"[by the bishop of, the bishop of elie, preache...",[A sermon preached before his Maiestie at Whit...


In [None]:
C = get_topics("Carolinian")
C.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,276,-1_in sermon_sermon preached at_sermons_sermon...,"[in sermon, sermon preached at, sermons, sermo...",[The saints qualification: or A treatise I. Of...
1,0,72,0_charles 1625_sermon preached at pauls_1625 1...,"[charles 1625, sermon preached at pauls, 1625 ...",[An exposition of the third chapter of the Epi...
2,1,40,1_sermons on_sermons preached upon_sermons pre...,"[sermons on, sermons preached upon, sermons pr...",[Choice sermons preached upon selected occasio...
3,2,38,2_sermons upon_sermons_severall sermons_sermon,"[sermons upon, sermons, severall sermons, serm...",[Evangelicall sacrifices In xix. sermons. I. T...
4,3,35,3_the funerall of the_funeral sermons the_fune...,"[the funerall of the, funeral sermons the, fun...","[A lasting ievvell, for religious woemen In th..."
5,4,28,4_church of england_of the church_reverend fat...,"[church of england, of the church, reverend fa...","[The Christian divinitie, contained in the div..."
6,5,16,5_of his maiesties chaplaines_maiesties chapla...,"[of his maiesties chaplaines, maiesties chapla...",[A royall edict for military exercises publish...
7,6,15,6_of the lords supper_the lords supper_sacrame...,"[of the lords supper, the lords supper, sacram...",[The saints daily exercise A treatise concerni...
8,7,14,7_fast day sermons_one of the sermons_of the s...,"[fast day sermons, one of the sermons, of the ...",[A sermon preached before the Honourable House...
9,8,13,8_by plague england london_of the plague_plagu...,"[by plague england london, of the plague, plag...","[A sermon intended for Paul's Crosse, but prea..."


In [None]:
CW = get_topics("CivilWar")
CW.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,206,-1_delivered in sermon_sermon preached before ...,"[delivered in sermon, sermon preached before t...",[The Christians hope triumphing in these glori...
1,0,172,0_fast day sermons the_fast day sermons_solemn...,"[fast day sermons the, fast day sermons, solem...",[A firebrand pluckt out of the burning. A serm...
2,1,89,1_war 1642_civil war 1642_war 1642 1649_histor...,"[war 1642, civil war 1642, war 1642 1649, hist...",[Ioseph paralled [sic] by the present Parliame...
3,2,38,2_funeral sermons sermon_funeral sermons the_f...,"[funeral sermons sermon, funeral sermons the, ...","[The life of faith in death, in expectation of..."
4,3,36,3_of the gospel_grace theology_the gospel_theo...,"[of the gospel, grace theology, the gospel, th...",[Practicall divinity: or a helpe through the b...
5,4,30,4_sermon preached at the_sermon preached in_se...,"[sermon preached at the, sermon preached in, s...","[A touch-stone, or, Triall and examination of ..."
6,5,24,5_britain church history_great britain church ...,"[britain church history, great britain church ...","[The sacred and soveraigne church-remedie: or,..."
7,6,21,6_the covenant of grace_the covenant of_covena...,"[the covenant of grace, the covenant of, coven...","[Religious covenanting directed, and covenant-..."
8,7,11,7_delivered in sermon by_delivered in sermon_o...,"[delivered in sermon by, delivered in sermon, ...",[A seasonable sermon for these vnseasonable ti...
9,8,10,8_house of commons bible_of that house bible_i...,"[house of commons bible, of that house bible, ...","[The noble order, or The honour which God conf..."


In [None]:
IR = get_topics("Interregnum")
IR.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,203,-1_of the gospel_in sermon_in sermon preached_...,"[of the gospel, in sermon, in sermon preached,...",[Saul smitten for not smiting Amalek according...
1,0,105,0_funeral sermons sermon_funeral sermons the_i...,"[funeral sermons sermon, funeral sermons the, ...","[The epitaph of a godly man, especially a man ..."
2,1,60,1_sermons preached upon_sermons on_ordination ...,"[sermons preached upon, sermons on, ordination...","[Five sermons, in five several styles; or Waie..."
3,2,47,2_of the gospel_the gospel_sermons preached by...,"[of the gospel, the gospel, sermons preached b...",[A briefe discourse touching a broken heart In...
4,3,46,3_bible corinthians_in sermon_sermon preached ...,"[bible corinthians, in sermon, sermon preached...","[The vanity and mischief of making earthly, to..."
5,4,45,4_1656_1649_1651_sermon preached before the,"[1656, 1649, 1651, sermon preached before the,...",[The true speech delivered on the scaffold by ...
6,5,22,5_the magistrates_preached before the judges_c...,"[the magistrates, preached before the judges, ...",[The judges charge; delivered in a sermon befo...
7,6,17,6_doctrine of justification_justification by f...,"[doctrine of justification, justification by f...",[Confidence dismounted. Or a letter to Mr Rich...
8,7,15,7_great britain church history_of the church_c...,"[great britain church history, of the church, ...","[Provocator provocatus. Or, An answer made to ..."
9,8,14,8_of the lords supper_the lords supper_lords s...,"[of the lords supper, the lords supper, lords ...",[Two treatises: I. The saints communion with J...


In [None]:
CII = get_topics("CharlesII")
CII.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,547,-1_in sermon preached_sermon preached at_in se...,"[in sermon preached, sermon preached at, in se...",[Ebdomas embolimaios a supplement to the eniau...
1,0,210,0_funeral sermons sermon_funeral sermons sermo...,"[funeral sermons sermon, funeral sermons sermo...",[The life and death of the godly man exemplifi...
2,1,144,1_church of england bible_of england bible_pre...,"[church of england bible, of england bible, pr...",[A sermon preached before the Right Honourable...
3,2,72,2_dissenters religious england_church controve...,"[dissenters religious england, church controve...",[The tryal of spirits both in teachers & heare...
4,3,55,3_some sermons_of the gospel_of the gospel at_...,"[some sermons, of the gospel, of the gospel at...","[Great and precious promises; or, Some sermons..."
5,4,52,4_1660 1688_sermon preached before the_preache...,"[1660 1688, sermon preached before the, preach...",[Mercy in the midst of judgment by a gracious ...
6,5,51,5_fast day sermons sermon_fast day sermons_the...,"[fast day sermons sermon, fast day sermons, th...","[A sermon preached on the fast-day, November 1..."
7,6,46,6_september 1683 being_september 1683_gunpowde...,"[september 1683 being, september 1683, gunpowd...",[A sermon preach'd before the King in the Cath...
8,7,42,7_of england sermons_church of england sermons...,"[of england sermons, church of england sermons...",[Fifty sermons preached at the parish-church o...
9,8,41,8_history charles ii 1660_charles ii 1660 1685...,"[history charles ii 1660, charles ii 1660 1685...","[God save the King: or, A sermon preach'd at L..."


In [None]:
JII = get_topics("JamesII")
JII.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8,-1_fast day sermons_the year 1666_the year 166...,"[fast day sermons, the year 1666, the year 166...","[Londons remembrancer, or, A sermon preached a..."
1,0,235,0_sermon preached before the_sermon preached a...,"[sermon preached before the, sermon preached a...",[A vindication of a passage in Dr. Sherlock's ...
2,1,21,1_rebellion 1685_of july 1685_26th of july 168...,"[rebellion 1685, of july 1685, 26th of july 16...",[The character of a rebel a sermon preached at...


In [None]:
WM = get_topics("WilliamAndMary")
WM.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,422,-1_sermon preached before the_preached before ...,"[sermon preached before the, preached before t...",[A sermon preached before the Right Honourable...
1,0,155,0_1691 funeral sermons_funeral sermons sermon_...,"[1691 funeral sermons, funeral sermons sermon,...",[A funeral-sermon upon occasion of the death o...
2,1,117,1_of england 1650 1702_of england 1650_england...,"[of england 1650 1702, of england 1650, englan...",[A sermon preach'd before the Honourable House...
3,2,53,2_of england sermons_church of england sermons...,"[of england sermons, church of england sermons...",[Twelve sermons preached on several occasions....
4,3,42,3_fast day sermons sermon_fast day sermons_the...,"[fast day sermons sermon, fast day sermons, th...",[A sermon preach'd before the honourable House...
5,4,41,4_the reformation of manners_reformation of ma...,"[the reformation of manners, reformation of ma...",[A sermon preach'd to the societies for reform...
6,5,32,5_1662 1694 funeral sermons_1694 funeral sermo...,"[1662 1694 funeral sermons, 1694 funeral sermo...",[A sermon preach'd at the chappel royal in the...
7,6,28,6_of england bible_church of england bible_bis...,"[of england bible, church of england bible, bi...",[A sermon preached before the King & Queen at ...
8,7,26,7_charity sermon_charity given to_of charity_c...,"[charity sermon, charity given to, of charity,...",[Publick charity a sermon preached before the ...
9,8,24,8_scripture justification_justification christ...,"[scripture justification, justification christ...",[Christ's righteousness a believer's surest pl...


# Stratified Sampling

In [None]:
import pandas as pd
import json
with open(f'{folder}/corpora.json','r') as file:
        corpora = json.load(file)

In [None]:
era_counts = {era:0 for era in corpora}
for era in corpora:
    for prefix,ids in corpora[era].items():
          era_counts[era] += len(ids)
total = sum(era_counts.values())
for era, count in era_counts.items():
    era_counts[era] = round((count / total) * 300)
era_counts

{'pre-Elizabethan': 4,
 'Elizabethan': 16,
 'Jacobean': 35,
 'Carolinian': 28,
 'CivilWar': 33,
 'Interregnum': 29,
 'CharlesII': 78,
 'JamesII': 14,
 'WilliamAndMary': 63}

In [None]:
samples = {}
for era in era_counts:
  data = pd.read_csv(f"{folder}/{era}_topics.csv",header=None,names=['tcpID','group'],usecols=[0,1])
  num_groups = len(data['group'].unique())
  total = 22 # era_counts[era]
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  samples[era] = list(sorted(sample['tcpID']))
  print(era)
  print(sample)

pre-Elizabethan
     tcpID  group
2   B00958     -1
35  A06329     -1
79  A68325     -1
61  A19734     -1
26  A02882     -1
14  A07584     -1
56  A17636     -1
8   A07230      0
42  A04512      0
5   A06508      0
47  A04511      0
6   A00609      0
46  A08050      0
19  A02883      0
16  A09915      1
27  A05394      1
10  A07260      1
28  A05142      1
18  A05141      1
59  A19143      1
22  A05158      1
Elizabethan
      tcpID  group
254  A12386     -1
313  A72347     -1
248  A16144     -1
171  A14350     -1
310  A73748     -1
1    B15274     -1
228  A17318      0
169  A11247      0
288  A69056      0
136  A17722      0
45   A05404      0
103  A04416      0
12   B07515      1
121  A09101      1
133  A07105      1
233  A19272      1
304  A68078      1
55   A08197      1
145  A10233      2
294  A68376      2
204  A12345      2
305  A68172      2
148  A17190      2
79   A03909      2
Jacobean
      tcpID  group
297  A01700     -1
57   A03419      0
169  A03335      1
638  A22051     

  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))
  sample = data.groupby('group', group_keys=False).apply(lambda x: x.sample(round(total/num_groups)))


In [None]:
with open(f"{folder}/samples.json","w+") as f:
    json.dump(samples,f)