In [1]:
import csv
import sys

sys.path.append('../')
from lib.people import *
from lib.standardization import *
from lib.citation_info import *

In [1]:
# Scan every marginal note for a known name and record the short run of
# tokens that follows it as a candidate citation; the hits are written to
# ../outputs/tertullian.csv.

# A citation keeps at most this many tokens, counting the matched name itself.
MAX_CITATION_TOKENS = 10

# Map every known spelling (each key of `people` and each of its variants)
# back to the key it belongs to, so one dict lookup recognises any form.
v_to_p = {}
for canonical, variants in people.items():
    v_to_p[canonical] = canonical
    for variant in variants:
        v_to_p[variant] = canonical

outputs = []
with open('../assets/sermons_marginalia.csv', 'r') as file:
    notes = csv.reader(file, delimiter=',')
    for idx, entry in enumerate(notes):
        # The first column is stored as the tcpID and the last column is the note text.
        original = entry[-1]
        # NOTE(review): clean_text / clean_word are project helpers; presumably
        # they normalise spelling and punctuation — confirm in lib.standardization.
        tokens = clean_text(original).split(" ")

        matches = []   # finished citations for this note
        match = []     # citation currently being built
        for token in tokens:
            token = clean_word(token)
            if token in v_to_p:
                # A new name closes the citation in progress and starts another.
                if match:
                    matches.append(" ".join(match).strip())
                match = [v_to_p[token]]
            elif 0 < len(match) < MAX_CITATION_TOKENS:
                # Only collect trailing tokens once a name has been seen, and
                # stop extending the citation when it reaches the cap.
                match.append(token)
        if match:
            matches.append(" ".join(match).strip())

        citations = "; ".join(matches)
        # Notes without any recognised name are not written out.
        if citations:
            outputs.append({'idx': idx, 'tcpID': entry[0],
                            'citations': citations, 'original': original})
        if (idx+1) % 100000 == 0:
            print(f"Processed {idx+1} entries")

# newline="" is what the csv module expects for files it writes to (otherwise
# rows are separated by blank lines on Windows); the with-block guarantees the
# file is flushed and closed even if a row fails to serialise.
with open("../outputs/tertullian.csv", "w", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["idx","tcpID","citations","original"])
    writer.writeheader()
    writer.writerows(outputs)

Processed 100000 entries
Processed 200000 entries
Processed 300000 entries
Processed 400000 entries
Processed 500000 entries
Processed 600000 entries


In [3]:
# Tally how many individual citations each text contains and remember the
# position value attached to every one of them.
# NOTE(review): get_citations is a project helper; from its use here it
# returns {tcpID: [record, ...]} where record[0] is a "; "-joined citation
# string and record[1] is a position value — confirm against lib.citation_info.
citations = get_citations('../outputs/tertullian.csv')

found = {}
positions = {}
for tcpID, c_list in citations.items():
    hit_positions = []
    for record in c_list:
        # One position entry per citation packed into the "; "-joined string.
        n_hits = len(record[0].split("; "))
        hit_positions.extend([record[1]] * n_hits)
    positions[tcpID] = hit_positions
    found[tcpID] = len(hit_positions)

# Texts ordered from most to fewest hits.
sorted_counts = sorted(found.items(), key=lambda pair: pair[1], reverse=True)
print(len(found))
for tcpID, count in sorted_counts[:11]:
    print(count, "hits in the notes of", tcpID, info[tcpID])

824
101 hits in the notes of A40891 {'title': 'XXX sermons lately preached at the parish church of Saint Mary Magdalen Milkstreet, London to which is annexed, A sermon preached at the funerall of George Whitmore, Knight, sometime Lord Mayor of the City / by Anthony Farindon.', 'author': 'Farindon, Anthony, 1598-1658.', 'pubplace': 'London', 'subject_headings': 'Whitmore, George,; Sir, d. 1654.; Sermons, English; 17th century.; Funeral sermons.', 'date': '1647'}
72 hits in the notes of B27417 {'title': 'Ekthesis pisteōs, or, An exposition of the Apostles Creed delivered in several sermons by William Nicholson ...', 'author': 'Nicholson, William, 1591-1672.', 'pubplace': 'London', 'subject_headings': "Apostles' Creed; Sermons.; Sermons, English; 17th century.", 'date': '1661'}
55 hits in the notes of A44334 {'title': 'The works of Mr. Richard Hooker (that learned and judicious divine), in eight books of ecclesiastical polity compleated out of his own manuscripts, never before published 

In [3]:
# Collect the context found for every recorded hit of every text, flatten the
# nested result into one row per (text, source line, source position), and
# write the rows to ../outputs/tertullian_contexts.csv.
# NOTE(review): context() is a project helper; from its use here it returns
# {sourceline: {sourcepos: sentence}} for one text — confirm in lib.
outputs = []
for idx, tcpID in enumerate(found):
    notes_contexts = context(tcpID, positions)
    for sourceline, c_dict in notes_contexts.items():
        outputs.extend(
            {"tcpID": tcpID,
             "sourceline": sourceline,
             "sourcepos": sourcepos,
             "context": sentence}
            for sourcepos, sentence in c_dict.items()
        )
    if (idx+1) % 100 == 0:
        print(f"Processed {idx+1} texts")

# newline="" is what the csv module expects for files it writes to (otherwise
# rows are separated by blank lines on Windows); the with-block guarantees the
# file is flushed and closed even if a row fails to serialise.
with open("../outputs/tertullian_contexts.csv", "w", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["tcpID","sourceline","sourcepos","context"])
    writer.writeheader()
    writer.writerows(outputs)



Processed 100 texts
Processed 200 texts
Processed 300 texts
Processed 400 texts
Processed 500 texts
Processed 600 texts
Processed 700 texts
Processed 800 texts


In [5]:
# Write the collected context rows in `outputs` to disk.
# NOTE(review): this repeats the write already performed at the end of the
# cell that builds `outputs`; it is only needed when re-saving without
# recomputing the contexts.
# newline="" is what the csv module expects for files it writes to (otherwise
# rows are separated by blank lines on Windows); the with-block guarantees the
# file is flushed and closed even if a row fails to serialise.
with open("../outputs/tertullian_contexts.csv", "w", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["tcpID","sourceline","sourcepos","context"])
    writer.writeheader()
    writer.writerows(outputs)

Find spelling variants of target names

In [18]:
import json
from Levenshtein import distance 
from difflib import SequenceMatcher

# Look through the lemma dictionary for entries that closely resemble one of
# the target spellings and gather the forms listed under them as candidate
# variants.
target = ['tertullian','ter','tert','tertul']

# Minimum difflib similarity ratio for a lemma to count as a likely variant.
MIN_SIMILARITY = 0.8

with open("../assets/lemmas.json", "r") as file:
    lemma_dict = json.load(file)

possible_matches = {k: [] for k in target}

for lemma in sorted(lemma_dict):
    # Lemmas that are themselves targets are skipped.
    if lemma in possible_matches:
        continue
    for word in target:
        similarity = SequenceMatcher(None, word, lemma).ratio()
        # The edit distance is only computed once the similarity test passes;
        # it must be smaller than the target's own length.
        if similarity >= MIN_SIMILARITY and distance(word, lemma) < len(word):
            possible_matches[word].extend(lemma_dict[lemma].keys())
            # Each lemma is credited to the first target it resembles.
            break
possible_matches

In [1]:
import json
from Levenshtein import distance 
from difflib import SequenceMatcher
import sys
sys.path.append('../')

# Look through the noun dictionary for entries that closely resemble one of
# the target spellings and gather the forms listed under them as candidate
# variants.
target = ['azariah']

# Minimum difflib similarity ratio for a lemma to count as a likely variant.
MIN_SIMILARITY = 0.8

with open("../assets/nouns_ab.json", "r") as file:
    lemma_dict = json.load(file)

possible_matches = {k: [] for k in target}

for lemma in sorted(lemma_dict):
    # Lemmas that are themselves targets are skipped.
    if lemma in possible_matches:
        continue
    for word in target:
        similarity = SequenceMatcher(None, word, lemma).ratio()
        # The edit distance is only computed once the similarity test passes;
        # it must be smaller than the target's own length.
        if similarity >= MIN_SIMILARITY and distance(word, lemma) < len(word):
            possible_matches[word].extend(lemma_dict[lemma].keys())
            # Each lemma is credited to the first target it resembles.
            break
possible_matches

{'azariah': ['amariah',
  'amariath',
  'azaria',
  'azariab',
  'azarias',
  'hazaria',
  'jzariah',
  'nazarath',
  'samariah',
  'z*cariah',
  'zarah',
  'zaruiah',
  'zecariah']}