Find possible instances of scriptural and apocryphal citations 

In [51]:
import sys 
sys.path.append('../')
from lib.standardization import * 

In [55]:
# Takes 50 seconds 
outputs = []
with open('../assets/sermons_marginalia.csv', 'r') as file:          
    notes = csv.reader(file, delimiter=',')

    expected_outliers = []
    citations, outliers = [],[]
    for idx, entry in enumerate(notes):
        # output dictionary 
        info_dict = {'idx':idx, 'tcpID':entry[0],'citations':None, 'outliers':None,'original':entry[-1]}
        # get note text 
        n = entry[-1]
        
        # replace all periods with spaces and convert to lower case 
        # Some citations are originally inconsistently formatted as "<book> <chapter>.<verse>" at times 
        # and "<book> <chapter>. <verse>." at other times, so replacing periods with spaces is a must 
        n = re.sub(r'(\.)',r' ',n).lower()
        # normalize ampersands and conjunctions 
        n = re.sub(r"\band\b|\b&ampc\b|\b&amp\b|\bet\b",'&', n)
        # remove everything that is not an alphabetical character, integer, comma, ampersand, hyphen illegible char, or a single space
        n = re.sub(r'[^a-z0-9\,\&\-\—\*\•\▪ ]','',n)
        # replace 
        n = re.sub(r'[\•\▪]','*',n)
        # replace all instances of multiple white spaces with a single space. 
        n = re.sub(r'\s+',' ',n)
        
        # dealing with c., v., ver., chap., cap.  
        # A02456,sermon,70,168,Deut. 12. 5.6. &amp; 7. v 16. c. 14 v. 23. c. 16. v. 2.11. c. 31. v. 11.
        # first store elsewhere and deal with it later 
        if re.search(r'\bc\b|\bv\b|\bver\b|\bchap\b|\bcap\b',n) and re.search(r'[^a-z\&\,]+',n): 
            expected_outliers.append((idx,n))
            outputs.append(info_dict)
            continue 

        # find possible citations 
        n = replaceBook(n)
        match = re.findall(rf'([a-z*]+ [^a-z\&]+)', n)
        c,o = [],[]
        if len(match) > 0: 
            for item in match:
                item = item.strip(" ")
                book = item.split(" ")[0]
                if book not in abbrev and book not in numBook.values():
                    o.append(item) 
                else: 
                    decomposed = findCitations(item)
                    if not decomposed: continue
                    c.extend(decomposed[0])
                    o.extend(decomposed[1])
                    # print(idx,decomposed)
            info_dict['citations'] = "; ".join(c) 
            info_dict['outliers'] = "; ".join(o)
            outputs.append(info_dict)
            citations.extend(c)
            outliers.extend(o)
        if (idx+1) % 100000 == 0: 
            print(f"Processed {idx+1} entries")

outfile = open("../outputs/citations.csv","w+")
writer = csv.DictWriter(outfile, fieldnames=["idx","tcpID","citations","outliers","original"])
writer.writeheader()
for dict in outputs: 
    writer.writerow(dict)
outfile.close()

Processed 100000 entries
Processed 200000 entries
Processed 300000 entries
Processed 400000 entries
Processed 500000 entries
Processed 600000 entries


In [57]:
from collections import Counter
from difflib import SequenceMatcher
possible_abbrev = []
for o in outliers: 
    possible_abbrev.append(o.split(" ")[0]) 

possible_abbrev = Counter(possible_abbrev).items()
possible_matches = {k:[] for k in abbrev}
for word,freq in possible_abbrev:
    # print(word)
    # finding possible matches to any of the known abbreviations for each book
    if word not in abbrev_to_book: 
        for ab, book in abbrev_to_book.items():
            similarity = SequenceMatcher(None, ab, word).ratio()
            if similarity >= 0.8:
                possible_matches[book].append((word,freq))
                break

In [58]:
clean_format = []
for book, variants in possible_matches.items(): 
    for v,freq in variants:
        clean_format.append(f"{book}: {v} {freq}\n") 
    clean_format.append("\n")
with open(f"../assets/possible_variants_2.txt","w+") as file: 
    file.writelines(clean_format)

Problems already observed 
- Non-Biblical texts 
    - Aug epist/ep (Augustine's Epistles)
    - 169468,A30450,Epistle 49,,Jul. Ep. 49.
- problems with ampersands
    - 1006,A16985,Deuteronomy 1*; Ezekiel 17.13; Ezekiel 14.15,"amp 21, 23","Deut. 1•. Ezek. 17. 13. 14. 15. &amp; 21, 23."

In [56]:
Counter(citations).most_common(n=100)

[('Romans 8', 534),
 ('Matthew 5', 387),
 ('1 Corinthians 15', 377),
 ('Matthew 25', 370),
 ('Genesis 3', 359),
 ('Timothy 1', 351),
 ('Timothy 2', 337),
 ('Matthew 26', 323),
 ('Matthew 6', 297),
 ('Psalms 119', 295),
 ('John 6', 295),
 ('John 3.16', 293),
 ('Acts 2', 293),
 ('Romans 1', 292),
 ('Romans 13.1', 289),
 ('Romans 13', 282),
 ('John 3', 282),
 ('James 1.17', 282),
 ('Hebrews 11', 280),
 ('Luke 16', 276),
 ('1 Corinthians 11', 270),
 ('Philippians 3.8', 267),
 ('Matthew 10', 254),
 ('Genesis 3.15', 252),
 ('Isaiah 9.6', 251),
 ('Galatians 2.20', 247),
 ('John 1', 245),
 ('Romans 13.4', 242),
 ('Epistle 1', 242),
 ('Ephesians 4', 242),
 ('Matthew 13', 238),
 ('Romans 8.28', 235),
 ('2 Corinthians 5.20', 232),
 ('1 Peter 5.8', 232),
 ('1 Peter 2.13', 229),
 ('Matthew 11.28', 227),
 ('Psalms 51', 223),
 ('2 Peter 1.4', 223),
 ('Romans 6', 221),
 ('Matthew 24', 221),
 ('Romans 7.24', 219),
 ('Ephesians 2', 219),
 ('Romans 6.23', 218),
 ('Thessalonians 2', 218),
 ('Acts 20.28', 