# WWP Sort and Write Parser

This parser uses the encoding practices of the WWP to sort texts into various categories. These different datasets help create different models for word2vec.

In [1]:
from lxml import etree
import xml.etree.ElementTree as ET
import fnmatch
import re
import glob
import csv
import sys
import os
from datetime import datetime

In [2]:
def morph2text(file_name):
'''
Input: an .xml file
Output: a string to be written to a .txt file
'''
    with open(file_name, 'rt') as read_file:
        
        parser = etree.XMLParser(ns_clean=True, recover = True)
        morphTree = ET.parse(read_file, parser=parser)
        content_list = []

        for node in morphTree.findall('.//reg'):
            lines = ''.join(ET.tostring(node,
                                        encoding='unicode',
                                        method='text')).replace('\n',' ').replace('\t',' ').strip()
            clean_lines = re.sub(' +',' ', lines)
            content_list.append(clean_lines)

        return ' '.join(str(v) for v in content_list).replace('\t','').replace('\n','').lower()

def get_pubPlace(tree):
'''
Input: the tree structure of an .xml file
Output: a string of text from the specified tag
'''
    pubPlace = tree.find('.//wwp:sourceDesc//wwp:imprint/wwp:pubPlace', namespaces=ns).text.lower()
    return (pubPlace)

In [8]:
list_of_files = glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/fullTexts-XSLT/*.xml")

seven_files = glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/17c/morphadorned/*.xml")
eight_files = glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/18c/morphadorned/*.xml")
nine_files = glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/19c/morphadorned/*.xml")

seventeenth_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph-eme_17c.txt'
eighteenth_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph-eme_18c.txt'
nineteenth_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_19c.txt'
all_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_allTexts.txt'

us_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_us.txt'
global_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_non-us.txt'

eme_p_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph-eme_gi-p.txt'
eme_lg_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph-eme_gi-lg.txt'

gi_p_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_gi-p.txt'
gi_lg_path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/wwo_reg-morph_gi-lg.txt'


ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}

In [6]:
# Write by Century

startTime = datetime.now()

csv.field_size_limit(sys.maxsize)

with open(seventeenth_path, "a") as seventhC, open(eighteenth_path, "a") as eighthC, open(nineteenth_path, "a") as ninethC, open(all_path, "a") as allTexts:
    
    for file_name in seven_files:
        seven_content = morph2text(file_name)
        seventhC.write(seven_content + '\n')
        allTexts.write(seven_content + '\n')
        
    for file_name in eight_files:
        eight_content = morph2text(file_name)
        eighthC.write(eight_content + '\n')
        allTexts.write(eight_content + '\n')
    
    for file_name in nine_files:
        nine_content = morph2text(file_name)
        ninethC.write(nine_content + '\n')
        allTexts.write(nine_content + '\n')
        
        
print (datetime.now() - startTime)

0:06:38.971105


In [7]:
usPubPlaces = ['newport, ri', 'newburyport, ma', 'philadelphia and new york', 
            'washington, dc', 'boston', 'new york', 'philadelphia', 'baltimore', 
            'salem, massachusetts', 'dedham, ma', 'cambridge, massachusetts']

In [8]:
# Write by Place
startTime = datetime.now()
                
with open(us_path, "a") as usTexts, open(global_path, "a") as globalTexts:
    
    for file_name in list_of_files:
        file = open(file_name, 'rt')
        tree = ET.parse(file)
        place = get_pubPlace(tree)
        file.close()
        file_regex = re.search(r'full-(.*).xml', str(file_name)).group(1)
    
        
        for name in glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/1?c/morphadorned/*.xml"):
            if str(file_regex) in name:
                content_text = morph2text(name)
        
                if place in usPubPlaces:
                    usTexts.write(content_text + '\n')
                else:
                    globalTexts.write(content_text + '\n')

print (datetime.now() - startTime)

0:07:00.048962


In [6]:
# Write by Element
startTime = datetime.now()
                
with open(gi_p_path, "a") as gi_p, open(gi_lg_path, "a") as gi_lg:
    for file in glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/gi-lg/reg-morph/*.xml"):
        content_text = morph2text(file)
        gi_p.write(content_text)
        
    for file in glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/gi-p/reg-morph/*.xml"):
        content_text = morph2text(file)
        gi_lg.write(content_text)
        
with open(eme_p_path, "a") as eme_p, open(eme_lg_path, "a") as eme_lg:
    for file in glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/gi-lg/morph-eme/*.xml"):
        content_text = morph2text(file)
        eme_p.write(content_text)
        
    for file in glob.glob("/Users/williamquinn/Desktop/DH/Python/WWP/Morphadorned/gi-p/morph-eme/*.xml"):
        content_text = morph2text(file)
        eme_lg.write(content_text)


print (datetime.now() - startTime)

0:08:08.621423


In [None]:
# Write by .xml (post-morphadorned)
startTime = datetime.now()
                
path = '/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/post-morphadorned/'
for file in glob.glob('/Users/williamquinn/Desktop/DH/Python/WWP/WWP Word Vectors/Output/*-morph/*.xml'):
#     file_regex = re.search(r'full-(.*).xml', str(file_name)).group(1)
    with open(path + file_regex + ".txt", "w") as file2write:
        content_text = morph2text(file2write)
        file2write.close()
    

print (datetime.now() - startTime)

In [10]:
startTime = datetime.now()

sev_count = 0
eig_count = 0
nin_count = 0
all_count = 0
us_count = 0
global_count = 0
gi_p_count = 0
gi_lg_count = 0
eme_p_count = 0
eme_lg_count = 0

with open(us_path, "r") as usTexts, open(global_path, "r") as globalTexts:
    for w in usTexts.read().split():
        us_count = us_count + 1
    for w in globalTexts.read().split():
        global_count = global_count + 1

with open(seventeenth_path, "r") as sev, open(eighteenth_path, "r") as eig, open(nineteenth_path, "r") as nin, open(all_path, "r") as allt:
    for w in sev.read().split():
        sev_count = sev_count + 1
    for w in eig.read().split():
        eig_count = eig_count + 1
    for w in nin.read().split():
        nin_count = nin_count + 1
    for w in allt.read().split():
        all_count = all_count + 1
        
with open(gi_p_path, "r") as gi_p, open(gi_lg_path, "r") as gi_lg, open(eme_p_path, "r") as eme_p, open(eme_lg_path, "r") as eme_lg:
    for w in gi_p.read().split():
        gi_p_count = gi_p_count + 1
    for w in gi_lg.read().split():
        gi_lg_count = gi_lg_count + 1
        
    for w in eme_p.read().split():
        eme_p_count = eme_p_count + 1
    for w in eme_lg.read().split():
        eme_lg_count = eme_lg_count + 1



print ("Number of Words", "\nAll texts count:", all_count) 
print ("\nUS texts count:", us_count, "\nGlobal texts count:", global_count)
print ("17C texts count:", sev_count, "\n18c texts count:", eig_count, "\n19c texts count:", nin_count)
print ("\ngi-p", gi_p_count, "\ngi-lg", gi_lg_count, "\neme-p", eme_p_count, "\neme-lg", eme_lg_count)
print ('\n', datetime.now() - startTime)

Number of Words 
All texts count: 12440955

US texts count: 2468812 
Global texts count: 10386905
17C texts count: 4496620 
18c texts count: 3963521 
19c texts count: 3980814

gi-p 5574904 
gi-lg 9476694 
eme-p 5572887 
eme-lg 9472856

 0:00:16.856126
