# routines to go through text and correct misspellings / OCR errors

Updating text throughout the 'trimmed' corpus tree.

A global change will require scrubbing through the following folders (all under text-data/):

- CC_ML_FR_trimmed_morphad_lem
- CC_ML_FR_trimmed_morphad_lem_AMER
- CC_ML_FR_trimmed_MAlem_AMER_cat_mod
- CC_ML_FR_trimmed_MAlem_AMER_cat_mod_enc_full_part

An American change will require scrubbing through the American portion of the first two and then through all the rest.

Depending on testing, the unlemmatized **Cambridge_MacLehose_FineReader_OCR_trimmed** may be treated either as part of a global change or separately.

To identify words, priorities & uses, I will need textcollections and freqdists for raw, total and AMER slices

If I decide on a programmatic scrub (i.e. for the top X tokens, collapse everything within close fuzzy-match range into the main token), I'll probably want to apply it without filtering stop words.

In [3]:
import os
import re
import nltk
from nltk.corpus import PlaintextCorpusReader
from fuzzywuzzy import fuzz
from fuzzywuzzy import process 

# every corpus folder lives under this directory (the trailing slash is part of the value)
root = 'text-data/'

# American slice of the lemmatized corpus, plus its two derived folders
am_root = f'{root}CC_ML_FR_trimmed_morphad_lem_AMER/'
am_cat_root = f'{root}CC_ML_FR_trimmed_MAlem_AMER_cat_mod'
am_enc_root = f'{root}CC_ML_FR_trimmed_MAlem_AMER_cat_mod_enc_full_part'
# unlemmatized OCR output and the complete lemmatized corpus
raw_root = f'{root}Cambridge_MacLehose_FineReader_OCR_trimmed'
tot_root = f'{root}CC_ML_FR_trimmed_morphad_lem'

In [4]:
from nltk.corpus import stopwords
from nltk import ngrams

# Word lists that are combined below into stop_words / stop_words_max.
# modern English function words followed by archaic forms (art, doth, thee, thou, 'tis, ...)
eliz_stopwords = ["i",  "me",  "my",  "myself",  "we",  "our",  "ours",  "ourselves",  "you",  "your",  "yours",  "yourself",  "yourselves",  "he",  "him",  "his",  "himself",  "she",  "her",  "hers",  "herself",  "it",  "its",  "itself",  "they",  "them",  "their",  "theirs",  "themselves",  "what",  "which",  "who",  "whom",  "this",  "that",  "these",  "those",  "am",  "is",  "are",  "was",  "were",  "be",  "been",  "being",  "have",  "has",  "had",  "having",  "do",  "does",  "did",  "doing",  "a",  "an",  "the",  "and",  "but",  "if",  "or",  "because",  "as",  "until",  "while",  "of",  "at",  "by",  "for",  "with",  "about",  "against",  "between",  "into",  "through",  "during",  "before",  "after",  "above",  "below",  "to",  "from",  "up",  "down",  "in",  "out",  "o",  "on",  "off",  "over",  "under",  "again",  "further",  "then",  "once",  "here",  "there",  "when",  "where",  "why",  "how",  "all",  "any",  "both",  "each",  "few",  "more",  "most",  "other",  "some",  "such",  "no",  "nor",  "not",  "only",  "own",  "same",  "so",  "than",  "too",  "very",  "can",  "will",  "just",  "should",  "now",  "art", "doth", "dost", "'ere", "hast", "hath", "hence", "hither", "nigh", "oft", "should'st", "thither", "thee", "thou", "thine", "thy", "'tis", "'twas", "wast", "whence", "wherefore", "whereto", "withal", "would'st", "ye", "yon", "yonder"]
# extra stop words: period spellings (wee, hee, onely, ...), number words and Latin/Romance particles
# NOTE(review): presumably collected from this corpus' own high-frequency tokens -- confirm; 'no' appears twice (harmless once turned into a set)
hk_stopwords = ['unto','u','one', 'five','upon','de','also','wee','two','may','many','would','shall','hee','like','three','doe','could','much','every','againe','bee','might','without','well','within','yet','bene','ad','foure','another','whereof','thereof','onely','next','himselfe','thus','hundred','untill','therefore','halfe','cum','selfe','non','ut', 'whole','little','sixe','full','neither','among','last','c','never','la','qui','ii','according','eight','whose','either','per','along','item','al','likewise','mee','whereupon','none','till','able','thousand','self','el','second','que','mine','quae','sunt','et','seven','iii','although','litle','si','notwithstanding','besides','etiam','lesse','e','even','vel','alwayes', 'third','ever','rather','whether','still','otherwise','large','amongst', 'greater','somewhat','ex','least','aforesaid','though','whatsoever','quam', 'ten','whereby','foorth','no', 'n','los','almost','twelve','howbeit','j', 'greatly','ac','yce', 'pro','en','ab','greatest','whereas','hoc','w','beene','doeth','eorum','con','withall','hereafter','moreover','nec','nine','noone','omnes','del','enim','often']
# Latin function words
latin_stopwords = ['ab', 'ac', 'ad', 'adhuc', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque', 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo', 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui', 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero']
# the next three classes are subtracted from stop_words (so they stay countable) but added to stop_words_max
pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'you', 'they', 'me', 'you', 'him', 'her', 'it', 'you', 'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'your', 'their', 'mine', 'yours', 'his', 'hers', 'its', 'ours', 'yours', 'theirs', 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves']
modals = ['shall','shal','shalt','should', 'can', "can't", 'cannot', 'could', 'will','wil', 'would', 'may', 'must', 'might', 'ought', 'need', 'have', 'has']
directives = ['without', 'within', 'there', 'thence','away','est','towards','toward','farre','betweene','wherein','therein']
# tokens filtered in every analysis: NLTK's English list, the three custom lists and the empty token
_always_filtered = {*stopwords.words('english'), *latin_stopwords, *eliz_stopwords, *hk_stopwords, ''}
# grammatical classes the default filter deliberately lets through
_kept_classes = {*pronouns, *modals, *directives}

# default filter: everything above except pronouns, modals and directives
stop_words = _always_filtered - _kept_classes
# aggressive filter: additionally drops those three classes and a few very frequent content lemmas
stop_words_max = _always_filtered | _kept_classes | {'great', 'make', 'good', 'part', 'certain', 'thing', 'wherewith', 'afterward', 'day'}



In [5]:
# scratch check of the token-replacement regex used by replace(): sample text, token to find, substitute
s = 'whose care hath bene more generali 5people. And how may it be thought ;5people, generali? generaliii'
old_test = '5people'
new_test = 'NEW'
# group 1 = preceding non-letter (or start of string), group 2 = the token, group 3 = following non-letter
demo_pattern = re.compile(r'([^a-zA-Z]|^)('+ old_test + r')([^a-zA-Z])', flags=re.I)
demo_result = demo_pattern.sub(r'\1' + new_test +r'\3', s)
print(demo_result)

whose care hath bene more generali NEW. And how may it be thought ;NEW, generali? generaliii


In [6]:
def _substitute_token(text, old, new):
    '''
    returns 'text' with every stand-alone occurrence of 'old' swapped for 'new'
    arguments:
        text (str) : contents of one corpus file
        old (str) : literal token to be replaced (matched ignoring case)
        new (str) : literal substitute token
    a match counts as stand-alone when it is not directly preceded or followed by an ASCII letter
    '''
    # look-arounds test the neighbouring characters without consuming them, so a token at the very
    # end of the text and the second of two adjacent tokens ("old old") are both caught
    # re.escape keeps characters such as '.' in 'old' literal
    token = re.compile(r'(?<![a-zA-Z])' + re.escape(old) + r'(?![a-zA-Z])', flags=re.I)
    # a callable replacement inserts 'new' verbatim: a leading digit can no longer be read as part
    # of a group reference, and backslashes in 'new' are not interpreted
    return token.sub(lambda _match: new, text)


def replace(old, new, scope):
    '''
    replaces all instances of 'old' token with 'new' token across corpuses defined through 'scope'
    files are rewritten in place (only when their text actually changes) and the replacement is
    appended to replacement_record.txt under 'root'
    arguments:
        old (str) : token to be replaced (literal text, matched ignoring case; must not be empty)
        new (str) : substitute token (inserted literally, may start with a digit)
        scope (str) : 
            'am' : American materials
            'tot' : all lemmatized materials
            'raw' : unlemmatized (complete collection)
            'totnraw' : both lemmatized and unlemmatized folders
            'test' : only the 'test/' folder under root
    raises:
        ValueError : if 'scope' is not one of the values above or 'old' is empty
    '''
    if not old:
        # an empty token matches between any two non-letters and would corrupt every file
        raise ValueError('old token must not be empty')
    #since all but 'raw' scope includes the regrettably fragmented American materials, pre-load them 
    am_folders = [am_root, am_cat_root, am_enc_root]
    if scope == 'raw':
        folders = [raw_root]
    elif scope == 'am':
        folders = am_folders
    elif scope == 'tot':
        folders = am_folders + [tot_root]
    elif scope == 'totnraw':
        folders = am_folders + [tot_root, raw_root]
    elif scope == 'test':
        folders = [root + 'test/']
    else:
        raise ValueError('unrecognized scope argument')
    for folder in folders:
        # list the folder first and close the scandir handle before any file is rewritten;
        # sub-directories are skipped because they cannot be opened as text
        with os.scandir(folder) as entries:
            paths = [entry.path for entry in entries if entry.is_file()]
        for path in paths:
            with open(path, 'r', encoding="utf8") as fr:
                text = fr.read()
            updated = _substitute_token(text, old, new)
            if updated != text:
                with open(path, 'w', encoding="utf8") as fw:
                    fw.write(updated)
    # keep a running log of every replacement that was applied
    with open(root + 'replacement_record.txt', 'a', encoding="utf8") as f:
        f.write(old + ' -> ' + new + ' in ' + scope + '\n')

In [7]:
#corpuses
def _content_freqdist(reader):
    '''frequency table of a reader's purely alphabetic tokens that are not in stop_words'''
    kept_tokens = filter(lambda tok: tok.isalpha() and tok not in stop_words, reader.words())
    return nltk.FreqDist(kept_tokens)


# American slice of the lemmatized corpus
am = PlaintextCorpusReader(am_root, '.*')
am_fd = _content_freqdist(am)
am_col = nltk.TextCollection(am)

# unlemmatized OCR text
raw = PlaintextCorpusReader(raw_root, '.*')
raw_fd = _content_freqdist(raw)
raw_col = nltk.TextCollection(raw)

# complete lemmatized corpus
tot = PlaintextCorpusReader(tot_root, '.*')
tot_fd = _content_freqdist(tot)
tot_col = nltk.TextCollection(tot)

In [8]:
# violence flags for pandas conditional formatting
# lemmas grouped one theme per row: harm, blood, fighting, killing, warfare, captivity, fire, cruelty, weapons
# 'wound' and 'slaughter' are listed twice, which only matters if the list is counted rather than searched
# NOTE(review): 'assasin' may be a typo for 'assassin' -- left as written in case it matches the corpus spelling
viol_flags = ['violence', 'harm', 'injury', 'injure', 'hurt', 'damage', 'scathe', 'wound', 'maim', 'cripple', 'mutilate', 'cut', 'mangle', 'torture', 'torment', 'wound', 'gash', 'bruise', 'abuse', 
              'bloody', 'bloodshed', 'bloodshedder', 'bloodshedding', 'blood', 'hit', 
              'fight', 'scrap', 'struggle', 'conflict', 'melee', 'brawl', 'combat', 'wrestle', 'wrestler',
              'kill', 'death', 'slay', 'murder', 'assassinate', 'assasin', 'massacre', 'slaughter', 'butcher', 'slaughter', 'manslaughter', 
              # the comma after 'conquest' is required: without it Python joins it with 'detain' into 'conquestdetain'
              'battle', 'war', 'siege', 'attack', 'assault', 'skirmish', 'skirmisher', 'enemy', 'foe', 'hostile', 'army', 'soldier', 'warrior', 'conquer', 'conqueror',  'conquest',
              'detain', 'capture', 'captive', 'imprison', 'gaol', 'prisoner', 'slave', 'enslave',
              'shoot', 'shot',  'blast', 'burn', 'fire', 'blaze',
              'cruel', 'cruelty', 'destroy', 
              'arrow', 'crossbow', 'dart', 'javelin', 'mace', 'club', 'sword', 'lance', 'spear', 'rapier', 'pike', 'target', 'buckler', 'falchion', 'halberd', 'partisan', 
              'musket', 'gun', 'bullet', 'caliver', 'culverin', 'harquebus', 'harquebusier', 'saker', 'cannon']
''' 
die is filtered out on account of dyes
pain is sometimes related to legal pressure / coercion, but rarely to physical viol
ruin doesn't quite apply to human bodies
agony just doesn't make an appearance
bow is frequently a bowing action, but arrow and shoot should cover it
piece can be many things
'''

" \ndie is filtered out on account of dyes\npain is sometimes related to legal pressure / coercion, but rarely to physical viol\nruin doesn't quite apply to human bodies\nagony just doesn't make an appearance\nbow is frequently a bowing action, but arrow and shoot should cover it\npiece can be many things\n"

In [66]:
# show each occurrence of 'pemisapans' in the full lemmatized collection with its surrounding context
# NOTE(review): the second positional argument of concordance is presumably the display width in characters -- confirm against the NLTK docs
tot_col.concordance('pemisapans', 200)

Displaying 4 of 4 matches:
 like to be starve , but the other false . nevertheless until my return it take such effect in pemisapans breast , and in those against we , that they grow not only into contempt of we , but also ( co
e attempt to run away , i lay he in the bylboe , threaten to cut off his head , who i remit at pemisapans request : whereupon he be persuade that he be our enemy to the death , he do not only feed he 
ake much of he , he flat discover all unto i , which also afterward be reveal unto i by one of pemisapans own man , that night before he be slay . Theise mischief be all instant upon i and my company 
gligence to have be intercept by the savage , we meet he Pemisapan return out of the wood with pemisapans head in slay - he hand .. This fall out the first of June 1586 , and the eight of the same com


In [62]:
# print every vocabulary entry of the full lemmatized collection whose fuzzy ratio to 'pemisapan' is above 70
# (earlier note kept from the original cell: 86 threshold for 'people')
close_matches = [entry for entry in tot_col.vocab() if fuzz.ratio('pemisapan', entry) > 70]
for match in close_matches:
    print(match)

Pemisapan
permians
pemisapans
embiavan
mishapen
fpemisa
minsapa


In [69]:
# collapse the variant 'naturali' into 'natural' in the lemmatized and unlemmatized folders alike
# NOTE: this rewrites the corpus files in place and appends a line to replacement_record.txt
replace('naturali','natural','totnraw')

In [12]:
# the 50 most frequent alphabetic, non-stop-word tokens in the full lemmatized corpus
tot_fd.most_common(50)

[('they', 28743),
 ('we', 23018),
 ('have', 18427),
 ('he', 15611),
 ('our', 12546),
 ('i', 12152),
 ('their', 11770),
 ('it', 11196),
 ('his', 10625),
 ('great', 8019),
 ('there', 7525),
 ('come', 7476),
 ('say', 6709),
 ('man', 6633),
 ('shall', 6015),
 ('ship', 5681),
 ('day', 5645),
 ('will', 5626),
 ('make', 5580),
 ('go', 4848),
 ('may', 4604),
 ('you', 4378),
 ('take', 4370),
 ('good', 4098),
 ('place', 3772),
 ('time', 3737),
 ('call', 3659),
 ('land', 3646),
 ('king', 3419),
 ('see', 3392),
 ('can', 3337),
 ('country', 3253),
 ('find', 3114),
 ('my', 2925),
 ('sea', 2918),
 ('part', 2914),
 ('river', 2724),
 ('give', 2588),
 ('your', 2547),
 ('certain', 2518),
 ('send', 2506),
 ('league', 2497),
 ('thing', 2489),
 ('captain', 2454),
 ('year', 2414),
 ('island', 2400),
 ('town', 2360),
 ('hand', 2336),
 ('bring', 2285),
 ('water', 2282)]