First, we'll read in a piece of example text.

In [1]:
# Directory holding the plain-text corpus files. The trailing slash matters:
# the file name is appended to this string by plain concatenation below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
textpath = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/eebo-tcp/tcp-txt/'

import codecs
# Read the whole example document, decoding it as UTF-8.
with codecs.open(textpath + "N00244.txt", encoding='utf8') as f:
    text = f.read()
# Prints the complete document text to the cell output.
print(text)

editString = ""


      
         
            
            
            
               Ne Sutor Ultra Crepidam. OR BRIEF ANIMADVERSIONS upon the NEW-ENGLAND Anabaptists LATE FALLACIOUS NARRATIVE; Wherein the Notorious Mistakes and Falshoods by them Published, are Detected.
            By Samuel Willard Teacher of a Church in Boston in New-England.
            
            
               Prov. 18. 17.
               He that is first in his own cause seemeth just; but his neighbour cometh and searcheth him.
            
            
               Rom. 16. 17.
               Now I beseech you Brethren, mark them which cause divisions and offences, contrary to the Doctrine which ye have learned, and avoid them.
            
            
               18.
               For they that are such, serve not our Lord Jesus Christ, but their 
                     〈…〉
                  , and by good words, and fair speeches deceive the hearts of the simple.
            
            
               BOSTON IN N

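Next, we normalize the raw text (lowercasing, collapsing whitespace, and stripping punctuation, Roman numerals, and EEBO symbols). After that, we'll load the correction dictionaries needed to replace spelling variants, syncopated forms, and OCR errors.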

In [4]:
import re

# lower case everything
text1 = text.lower()

# remove extraneous whitespace
def removeWhitespace(w):
    """Collapse every run of whitespace in ``w`` into a single space.

    Leading and trailing whitespace runs are reduced to one space each; they
    are not stripped.

    Always returns a string. An empty input yields ``''`` (the previous
    version fell off the end and returned None in that case, which made the
    next regex call fail).
    """
    return re.sub(r'\s+', ' ', w)
    
text1 = removeWhitespace(text1)
editString += 'w'
    
# remove unnecessary punctuation
def removeUnnecessaryPunctuation(w):
    """Delete the punctuation characters listed in the pattern from ``w``.

    Periods, apostrophes and hyphens are not part of the character class, so
    they are left in place.

    Always returns a string. When every character is removed the result is
    ``''`` (the previous version returned None in that case).
    """
    return re.sub(r'[!\"#\$%&\(\)\*\+\,\/:;<=>\?@\[\\\]\^_`\{\|\}~]', '', w)
    
text1 = removeUnnecessaryPunctuation(text1)
editString += 'rUP'

print(text1)



In [5]:
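# also remove single-letter abbreviations such as " s." (one whitespace character,
# one word character, then a period); longer forms such as "mr." do not match this pattern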
def removeAbbreviatedNames(w):
    """Delete single-character abbreviations such as " s." from ``w``.

    A match is one whitespace character, exactly one word character and a
    period. The whole match, including the leading whitespace, is removed.

    Always returns a string. An empty result is returned as ``''`` (the
    previous version returned None in that case).
    """
    # Raw string: the regex escapes must not be read as string escapes.
    return re.sub(r'\s([\w\d]){1}\.', '', w)

ROMAN_NUMERALS = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/rules/romannumerals.txt'
with open(ROMAN_NUMERALS, 'r') as f:
    # One numeral per line. Blank entries (for example the one produced by a
    # trailing newline) are dropped: removeRomanNumerals inserts each entry
    # into a regex, and an empty entry would make that regex match any two
    # adjacent non-word characters.
    romanNumeralList = [entry.strip() for entry in f.read().split('\n') if entry.strip()]
    
def removeRomanNumerals(w, numerals=None):
    """Replace Roman numerals in ``w`` with a single space.

    A numeral is matched only when it has a non-word character on both sides,
    optionally with one whitespace character or period directly after the
    numeral. The whole match, delimiters included, becomes one space.

    ``numerals`` is the iterable of numeral strings to remove; when omitted,
    the module-level ``romanNumeralList`` is used. Entries are inserted into
    the pattern unescaped.

    Empty entries are skipped: an empty numeral would make the pattern match
    any two adjacent non-word characters, e.g. the ". " that ends a sentence.
    """
    if numerals is None:
        numerals = romanNumeralList
    w1 = w
    for n in numerals:
        if not n:
            continue
        # Raw string so the regex escapes are not read as string escapes.
        matcher = r'\W({})[\s\.]?\W'.format(n)
        w1 = re.sub(matcher, ' ', w1)
    return w1

text1 = removeRomanNumerals(text1)
editString += "rRN"
text1 = removeAbbreviatedNames(text1)
editString += "rAN"
text1



In [6]:
# compiling a list of EEBO symbols to remove
symbolsDict = dict([
    (u'\u3008\u25ca\u3009', ' '),  # lozenge between angle brackets (the diamond divider)
    (u'\u3008\u2026\u3009', ' '),  # ellipsis between angle brackets
    (u'\u2022', ''),               # bullet character
])
    
def removeSymbolsFromList(w, symbolDict):
    """Replace every symbol in ``symbolDict``, plus the whitespace around it.

    ``symbolDict`` maps a literal symbol string to its replacement text. Each
    occurrence of a symbol, together with any whitespace directly before and
    after it, is substituted with the mapped replacement.

    Symbols are escaped before being placed in the pattern, so keys that
    contain regex metacharacters (such as '*' or '(') are matched literally
    instead of raising an error or matching something else. The replacement
    is handed to ``re.sub`` unchanged.

    Returns the resulting string; ``w`` itself is not modified.
    """
    w1 = w
    for symbol, replacement in symbolDict.items():
        matcher = u'\\s*{}\\s*'.format(re.escape(symbol))
        w1 = re.sub(matcher, replacement, w1)
    return w1

text1 = removeSymbolsFromList(text1, symbolsDict)
editString += "rSFL"
text1



In [34]:
print(editString)

wrUPwrUPrRNrANrSFL


In [7]:
# Split the text into sentences on periods, then each sentence into tokens on
# single spaces.
sentences = text1.split('.')
tokenizedSentences = [ sentence.strip().split(' ') for sentence in sentences]
# Keep only sentences with at least 4 tokens.
longSentences = list(filter(lambda s: len(s) >= 4, tokenizedSentences))
# Snapshot of the tokens before any replacement. The replacement loops further
# down edit the lists inside longSentences in place, so the snapshot needs its
# own copies of the inner lists; filtering tokenizedSentences a second time
# would share them and the "original" would be overwritten as well.
originalSentences = [list(tokens) for tokens in longSentences]

Now we load each rule dictionary in turn (OCR corrections, syncopated forms, spelling variants) and use it to replace matching words in the tokenized sentences.

In [8]:
print(longSentences)



In [9]:
# u'\xf3' is the character 'ó' (see the printed output of this cell).
# NOTE(review): looks like a scratch check of non-ASCII output; weirdChars is
# not used by any other cell visible here - confirm before relying on it.
weirdChars = [u'\xf3']
print(weirdChars[0])

ó


In [10]:
# correction OCR mistake dictionary
CORRECTION = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/rules/CorrectionRules.txt'
with codecs.open(CORRECTION, 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    # The first whitespace-separated field of a rule line is the word to
    # replace, the second is its replacement. Each line is split once, and
    # lines with fewer than two fields (blank, whitespace-only or truncated)
    # are skipped instead of raising IndexError.
    pairs = [
        (fields[0], fields[1])
        for fields in (line.split() for line in lines)
        if len(fields) >= 2
    ]
    correctionDict = dict(pairs)
    
# correctionDict

In [11]:
# Substitutions made in this document, keyed by the word that was replaced.
docCorrections = {}
for sentence in longSentences:
    for i, word in enumerate(sentence):
        # Words without an OCR-correction rule stay as they are.
        if word in correctionDict:
            replacement = correctionDict[word]
            sentence[i] = replacement  # edits the token list in place
            docCorrections[word] = replacement

        

(u'promifeth', u'promiseth')


In [12]:
# create syncopate dictionary
SYNCOPATE = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/rules/SyncopeRules.txt'
with codecs.open(SYNCOPATE, 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    # The first whitespace-separated field of a rule line is the syncopated
    # form, the second is its replacement. Each line is split once, and lines
    # with fewer than two fields (blank, whitespace-only or truncated) are
    # skipped instead of raising IndexError.
    pairs = [
        (fields[0], fields[1])
        for fields in (line.split() for line in lines)
        if len(fields) >= 2
    ]
    syncopateDict = dict(pairs)
    

In [13]:
# syncopate replacements
for sentence in longSentences:
    for i, word in enumerate(sentence):
        # Nothing to do for words that have no syncope rule.
        if word not in syncopateDict:
            continue
        replacement = syncopateDict[word]
        sentence[i] = replacement  # edits the token list in place
        docCorrections[word] = replacement
        

(u"scorn'd", u'scorned')


In [14]:
# create variants dictionary
VARIANT = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/rules/VariantSpellings.txt'
with codecs.open(VARIANT, 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    # The first whitespace-separated field of a rule line is the variant
    # spelling, the second is its replacement. Each line is split once, and
    # lines with fewer than two fields (blank, whitespace-only or truncated)
    # are skipped instead of raising IndexError.
    pairs = [
        (fields[0], fields[1])
        for fields in (line.split() for line in lines)
        if len(fields) >= 2
    ]
    variantDict = dict(pairs)
    
# variantDict

In [15]:
# Apply the variant-spelling rules in place and record each substitution.
for sentence in longSentences:
    for i, word in enumerate(sentence):
        if word in variantDict:
            replacement = variantDict[word]
            sentence[i] = replacement
            docCorrections[word] = replacement


(u'ne', u'northeast')


In [16]:
# other variants dict
VARIANT_2 = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/rules/variants.txt'
with codecs.open(VARIANT_2, 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    # Lines are consumed two at a time: the first line of each couple is the
    # key, and the second tab-separated field of the line after it is the
    # value. A final line without a partner is ignored.
    pairs = [
        (key, detail.split('\t')[1])
        for key, detail in zip(lines[0::2], lines[1::2])
    ]
    variantDict2 = dict(pairs)
    
# variantDict2

In [17]:
# Apply the second variant dictionary in place and record each substitution.
for sentence in longSentences:
    for i, word in enumerate(sentence):
        if word not in variantDict2:
            continue
        replacement = variantDict2[word]
        sentence[i] = replacement
        docCorrections[word] = replacement


(u'sutor', u'suitor')
(u'falshoods', u'falsehoods')
(u'seemeth', u'seems')
(u'cometh', u'comes')
(u'searcheth', u'searches')
(u'shewed', u'showed')
(u'non', u'none')
(u'easie', u'easy')
(u'governour', u'governor')
(u'threatned', u'threatened')
(u'governour', u'governor')
(u'wisdome', u'wisdom')
(u'publick', u'public')
(u'governour', u'governor')
(u'meerly', u'merely')
(u'concurre', u'concur')
(u'governours', u'governors')
(u'joyn', u'join')
(u'christs', u"christ's")
(u'kingdome', u'kingdom')
(u'perswade', u'persuade')
(u'perswasion', u'persuasion')
(u'meerly', u'merely')
(u'perswasion', u'persuasion')
(u'perswasion', u'persuasion')
(u'prophaneness', u'profaneness')
(u'perswasion', u'persuasion')
(u'perswasion', u'persuasion')
(u'ballance', u'balance')
(u'perswasion', u'persuasion')
(u'confesseth', u'confesses')
(u'loth', u'loath')
(u'justifie', u'justify')
(u'easie', u'easy')
(u'intreat', u'entreat')
(u'shew', u'show')
(u'meer', u'mere')
(u'pag', u'page')
(u'jealousie', u'jealousy')
(u

Finally, we write the cleaned sentences to a .txt file named after the document's catalog year and its ID.

In [18]:
# parse catalog
import pandas as pd

catalogPath = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/eebo-tcp/tcp-texts/TCP.csv'
# Load the catalog with its 'TCP' column as the index.
df = pd.read_csv(catalogPath, index_col='TCP')

# Map every index value to the entry in its 'Date' column.
nameToDateDict = df['Date'].to_dict()

In [22]:
# get year from name of file
def getYearFromDocID(docID, dateLookup=None):
    """Return the first year recorded for ``docID``, or None if unavailable.

    The date is looked up in ``dateLookup``; when that is omitted, the
    module-level ``nameToDateDict`` is used. A date written with hyphens,
    such as '1681-1682', yields its first component ('1681').

    Returns None when ``docID`` has no entry, or when the stored date has no
    ``split`` method (i.e. it is not a string).
    """
    if dateLookup is None:
        dateLookup = nameToDateDict
    try:
        year = dateLookup[docID]
        # Return the first component; the previous version computed it but
        # then returned the unsplit date.
        return year.split('-')[0]
    except (KeyError, TypeError, AttributeError):
        # Only lookup and type problems map to None. Other errors (such as an
        # undefined nameToDateDict) are no longer silently swallowed.
        return None
    

# The function defined above is getYearFromDocID; getYearFromFile is not
# defined anywhere in the cells shown and raises NameError on a fresh kernel.
getYearFromDocID("N00244")

'1681'

In [31]:
# writing to a txt file
def writeToFile(sentenceList, docID, path):
    """Append the sentences of ``docID`` to ``<path>/<year>-<docID>.txt``.

    ``sentenceList`` is an iterable of token lists. Each one is joined with
    single spaces and written followed by '. '. The year in the file name
    comes from ``getYearFromDocID(docID)``.

    The file is opened in append mode ('a+'), so calling this twice for the
    same document writes the text twice.

    Raises ValueError when no year can be found for ``docID``.
    """
    year = getYearFromDocID(docID)
    if year is None:
        # Fail with a clear message instead of the TypeError that building
        # the file name from None would raise.
        raise ValueError('no catalog year found for document {!r}'.format(docID))

    cleanedFileName = path + '/' + year + '-' + docID + '.txt'

    with codecs.open(cleanedFileName, 'a+', encoding = 'utf8') as f:
        for sentence in sentenceList:
            # Turn the list of words back into a sentence and write it out.
            f.write(' '.join(sentence) + '. ')

In [33]:
# Output directory for the cleaned text.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
cleanedPathName = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/eebo-tcp-cleaning/cleaned_txt'
# writeToFile opens its target in append mode ('a+'), so re-running this cell
# appends another copy of the sentences to the same file.
writeToFile(longSentences, "N00244", cleanedPathName)