# XML Parser for Structured MJP Data

This parser works specifically with the Modernist Journals Project. Some of the magazines have more markup, specifically with genres in each issues encoded. This parser captures genre information, such as text that has been encoded as poetry, fiction, articles, etc.

In [1]:
import re
import json
import glob
import csv
import sys
from datetime import datetime

from xml.dom.minidom import parseString
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
catalog_path = "/Users/williamquinn/Desktop/DH/Python/MJP/Output/catalog_file_7-15-2019.txt"
text_path = "/Users/williamquinn/Desktop/DH/Python/MJP/Output/text_file_7-15-2019.txt"
mods_path = "/Users/williamquinn/Desktop/DH/Python/MJP/Output/mods_file_7-15-2019.txt"
mods_catalog = "/Users/williamquinn/Desktop/DH/Python/MJP/Output/mods_cat_file.txt"

list_of_files = glob.glob("/Users/williamquinn/Desktop/DH/R/Magazines/All_Magazines/*.xml")
list_of_mods = glob.glob("/Users/williamquinn/Desktop/DH/R/Magazines/All_MODS/*/*.xml")

In [3]:
def title(article):
    for pargroup in dom.getElementsByTagName("title"):
        return pargroup.firstChild.data
    
def what_type(article):
    try:
        return article.getAttribute("type")
    except IndexError:
        raise
        return "unknown"
    
def what_id(dom):
    for element in dom.getElementsByTagName("TEI"):
        return element.getAttribute("xml:id")
    
def full_text(article):
    paragraph_list = []
    for pargroup in article.getElementsByTagName("ab"):
        try:
            paragraph = pargroup.firstChild.data
        except AttributeError:
            return "NO <AB>"

        paragraph = paragraph.replace("\n"," ").replace("\t"," ")
        paragraph_list.append(paragraph)
        
    return " ".join(paragraph_list)

In [4]:
%%time

# write to text_file and catalog_file
n = 0

# stop_words = set(stopwords.words('english'))
with open(catalog_path, "w+") as catalog_file, open(text_path, "w+") as text_file:

    catalog_file.write("id" + "\t" + "mjp_id" + "\t" + "magazine" + "\t" + "type" + "\n")
    text_file.write("id" + '\t' + "mjp_id" + "\t" + "text" + "\n")

    for file_name in list_of_files:
        FI = open(file_name, "r")
        data = FI.read()
        FI.close()
        dom = parseString(data)
        ID = what_id(dom)
        issues = dom.getElementsByTagName("div")
        for issue in issues:
            articles = issue.getElementsByTagName("div")
            for article in articles:
                n = n + 1
                atype = what_type(article)
                text = full_text(article)
                magazine = title(article)
                catalog_file.write(str(n) + "\t" + ID + "\t" + magazine + "\t" + atype + "\n")
                text_file.write(str(n) + "\t" + ID + "\t" + text + "\n")

CPU times: user 10.8 s, sys: 462 ms, total: 11.2 s
Wall time: 12.7 s


In [None]:
%%time

catalog_df = pd.read_csv("/Users/williamquinn/Desktop/DH/Python/MJP/Output/catalog_file_7-15-2019.txt", \
                 sep="\t")

catalog_df["magazine"] = catalog_df["magazine"].str.lower() \
    .str.replace(r"(the masses).*","\g<1>", regex=True) \
    .str.replace(r"the seven arts.*","the seven arts", regex=True) \
    .str.replace(r"the freewoman.*","marsden magazines", regex=True) \
    .str.replace(r"the new freewoman.*","marsden magazines", regex=True) \
    .str.replace(r"the egoist.*","marsden magazines", regex=True) \
    .str.replace(r"others[t]","others", regex=True) \
    .str.replace(r"the liitle review","the little review", regex=True)

catalog_df["type"] = catalog_df["type"].str.lower() \
    .str.replace(r"ar[^v].*","articles", regex=True) \
    .str.replace(r"a[drv][^rit].*","advertisements", regex=True) \
    .str.replace(r"poems","poetry", regex=True) \
    .str.replace(r"fic[tion].*","fiction", regex=True) \
    .str.replace(r"im.*","images", regex=True) \
    .str.replace(r"fro.*", "front", regex=True) \
    .str.replace(r"con.*", "content", regex=True)

text_df = pd.read_csv("/Users/williamquinn/Desktop/DH/Python/MJP/Output/text_file_7-15-2019.txt", \
                 sep="\t")

text_df["text"] = text_df["text"].astype(str) \
    .str.lower() \
    .str.strip() \
    .str.replace(r'[^\w\s]','', regex=True) \
    .str.replace(r"pgbrk","", regex=True) \
    .str.replace('\.0', '', regex=True)
    

mods_df = pd.read_csv("/Users/williamquinn/Desktop/DH/Python/MJP/Output/mods_file_7-15-2019.txt", \
                     sep="\t")

catalog_df = pd.merge(catalog_df, mods_df, on="mjp_id")
mjp_df = pd.merge(catalog_df, text_df, on='id')


mjp_df = mjp_df[["id", "magazine", "type", "text", "date"]]

mjp_df = mjp_df.loc[mjp_df["type"] != "images"]

mjp_df = mjp_df.rename(columns={"id": "mjp_id"})

mjp_df.to_csv("/Users/williamquinn/Desktop/DH/Python/MJP/Output/mjp_documents.txt", 
              sep='\t', 
             index = False)

# Test Version Below.

In [11]:
# write to mods_file
startTime = datetime.now()

def get_date(tree):
    for elem in tree.findall(".//ns:dateIssued", namespaces=ns):
        return elem.text
    
with open(mods_path, "w") as mods_data:
    mods_data.write("file" + "\t" + "mjp_id" + "\t" + "date" + "\n")
    
    for file in list_of_mods:
        refile = re.search(r'.*/(.*)_mods.xml', str(file)).group(1)
        read = open(file, "rt")
        tree = ET.parse(read)
        namespace = re.search(r".*{(.*)}.*", tree.getroot().tag)
        ns = {"ns":namespace.group(1)}

        mods_id = tree.getroot().attrib.get('ID')
        date = get_date(tree)
        
        mods_data.write(refile + '\t' + mods_id + '\t' + date + '\n')
    
print (datetime.now() - startTime)

0:00:03.428166


In [136]:
# %%time
        
def get_date(tree_str):
    for elem in tree_str.findall(".//ns:dateIssued", namespaces=ns):
        dateText = elem.text
        return dateText
    
def get_title(tree_str):
    for rItem in tree_str.findall(".//ns:title", namespaces=ns):
        title_Text = rItem.text
        return title_Text
        
def get_name(tree_str):
    for rItem in tree_str.findall(".//ns:name[@type='personal']/ns:namePart", namespaces=ns):
        name_Text = rItem.text
        return name_Text


with open(mods_catalog, "w") as modCat:
    modCat.write("mjp_id" + "\t" + "date" + "\t" + "name" + "\t" + "title" + "\n")
    
    for file in list_of_mods:
        refile = re.search(r'.*/(.*)_mods.xml', str(file)).group(1)
        read = open(file, "rt")
        tree = ET.parse(read)
        
        namespace = re.search(r".*{(.*)}.*", tree.getroot().tag)
        ns = {"ns":namespace.group(1)}
        mods_id = tree.getroot().attrib.get('ID')
        date = get_date(tree)
        
        for rItem in tree.findall(".//ns:relatedItem[@type='constituent']", namespaces=ns):
            name = get_name(rItem)
            title = get_title(rItem)
            modCat.write(str(mods_id) + '\t' + date + '\t' + str(name) + '\t'+ str(title) + '\n')

In [5]:
startTime = datetime.now()

mjp_count = 0

with open(text_path, "r") as text:
    for w in text.read().split():
        mjp_count = mjp_count + 1

print ("Approximate number of words in MJP:", mjp_count-8981) # 8981 is the number of id's printed with each text block
print (datetime.now() - startTime)

Approximate number of words in MJP: 12031577
0:00:03.050859


In [118]:
%%time
# write from xml with etree if necessary


def get_magazine(tree_str):
    try:
        title = tree_str.find(".//ns:teiHeader//ns:title", namespaces=ns)
        clean_title = re.sub("[^A-z ]", "", title.text)
        return (clean_title)
    except AttributeError:
        print (file_name)
        raise
    
def get_type(tree_str):
    try:
        return tree_str.get("type")
    except IndexError:
        raise
        return "unknown"
        
def get_text(tree_str):
    group_content = []
    for group in tree_str.findall('.//ns:ab', namespaces=ns):
        content_l = group.itertext()
        content = ' '.join(str(w) for w in content_l).replace('\t','').replace('\n','').lower()
        content_clean = re.sub(" +", " ", content)
        group_content.append(content_clean)
    return (" ".join(str(w) for w in group_content))


with open(catalog_path, "w") as catalog_file, open(text_path, "w") as text_file:
    n = 0

    catalog_file.write("id" + "\t" + "mjp_id" + "\t" + "magazine" + "\t" + "type" + "\n")
    text_file.write("id" + '\t' + "mjp_id" + "\t" + "text" + "\n")

    for file_name in list_of_files:
        file = open(file_name, 'rt')
        tree = ET.parse(file)
        try:
            namespace = re.search(r".*{(.*)}.*", tree.getroot().tag)
            ns = {"ns":namespace.group(1)}
        except:
            xml_id = tree.findall('ns:TEI', namespaces=ns)
            
        file.close()
        

        magazine = get_magazine(tree)
        mods_id = tree.getroot().attrib.get('{http://www.w3.org/XML/1998/namespace}id')


        for entry in tree.findall(".//ns:div[@type='issue']//ns:div", namespaces=ns):
            etype = get_type(entry)
            text = get_text(entry)
            n = n + 1
            catalog_file.write(str(n) + "\t" + mods_id + "\t" + magazine + "\t" + etype + "\n")
            text_file.write(str(n) + "\t" + mods_id + "\t" + text + "\n")



print (datetime.now() - startTime)

0:00:15.723533
