# Parse Web-Scraped MJP Data

This XML parser works specifically with the web-scraped data from the <a href="https://repository.library.brown.edu/studio/collections/id_592/">Modernist Journals Project in the digital repository at Brown University Library</a>. The parser reads through two directories: one holding the MODS metadata files and one holding the text files. The output of this notebook is two separate tab-separated (.tsv) files. One holds the metadata and the other contains the content. They are linked with unique identifiers so they can be rejoined for text analysis.

In [1]:
import re
import glob
import csv
import xml.etree.ElementTree as ET

# Output files: one tab-separated catalog of metadata and one of full text.
catalog_path = "/Users/williamquinn/Desktop/DH/Python/MJP/WebScrapper/Parsing_Output/catalog_file_10-04-19.txt"
text_path = "/Users/williamquinn/Desktop/DH/Python/MJP/WebScrapper/Parsing_Output/text_file_10-04-19.txt"

# Input files. glob.glob returns matches in arbitrary order, so sort them to
# make the row order of the output files reproducible from run to run.
list_of_textFiles = sorted(glob.glob("/Users/williamquinn/Desktop/DH/Python/MJP/WebScrapper/WebDirectory_Output/Text/*.xml"))
list_of_modsFiles = sorted(glob.glob("/Users/williamquinn/Desktop/DH/Python/MJP/WebScrapper/WebDirectory_Output/Mods/*.xml"))

In [2]:
def get_mods_info(list_of_modsFiles, catalog_file):
    '''
    Write one tab-separated catalog row for every MODS file.

    Input:
        First Argument: A list of paths to MODS (.xml) files
        Second Argument: An open, writable text file to write information
    Output:
        Information written immediately to file, in this case "catalog_file," and are not saved in memory.
        A successful row holds: the root element's ID attribute, the title
        reduced to ASCII letters and spaces, the dateIssued text, and the
        partNumber text.
        A file that is not well-formed XML, or that lacks any of those
        fields, is printed and recorded as a row holding only its short
        file name. Nothing is returned.
    '''
    for file_name in list_of_modsFiles:
        try:
            # Handing ElementTree the path lets it open and close the file
            # itself and honour the encoding declared in the XML prolog.
            root = ET.parse(file_name).getroot()

            # A namespaced tag looks like "{uri}local". Reusing the braced
            # prefix in the search paths also works for documents without a
            # namespace, where the prefix is simply empty.
            ns_match = re.match(r"\{[^}]*\}", root.tag)
            prefix = ns_match.group(0) if ns_match else ""

            title = root.find(".//" + prefix + "titleInfo/" + prefix + "title")
            # "A-Za-z" rather than "A-z": the latter also lets through the
            # punctuation that sits between "Z" and "a" in ASCII ([ \ ] ^ _ `).
            clean_title = re.sub("[^A-Za-z ]", "", str(title.text))

            mods_id = root.attrib['ID']
            date = root.find(".//" + prefix + "dateIssued").text
            volumeIssue = root.find(".//" + prefix + "titleInfo/" + prefix + "partNumber").text

            # Build the whole row first so a missing field never leaves a
            # half-written line in the catalog.
            row = mods_id + "\t" + clean_title + "\t" + date + "\t" + volumeIssue + "\n"

        except (ET.ParseError, AttributeError, KeyError, TypeError):
            # AttributeError: element not found; KeyError: no ID attribute;
            # TypeError: element present but empty (its .text is None).
            print(file_name)
            short_match = re.search(r".*/(\d+-mods.xml)", file_name)
            # Fall back to the full name when it does not follow the
            # "<digits>-mods.xml" pattern instead of failing in the handler.
            file_short = short_match.group(1) if short_match else file_name
            catalog_file.write(file_short + '\n')

        else:
            catalog_file.write(row)
    
def get_text(list_of_textFiles, text_file):
    '''
    Write one tab-separated "id<TAB>text" row for every text file.

    Input:
        First Argument: A list of paths to text (.xml) files
        Second Argument: An open, writable text file to write information
    Output:
        Information written immediately to file, in this case "text_file," and are not saved in memory.
        The id is the value of the last attribute on the root element
        (presumably its only one, the identifier shared with the catalog --
        TODO confirm against the scraped files).
        The text is all text inside every <body> element, lower-cased, with
        each run of spaces, tabs and line breaks reduced to a single space.
        A file that is not well-formed XML, or whose root has no attribute
        to use as id, is printed and skipped. Nothing is returned.
    '''
    for file_name in list_of_textFiles:
        try:
            # Handing ElementTree the path lets it open and close the file
            # itself and honour the encoding declared in the XML prolog.
            root = ET.parse(file_name).getroot()
        except ET.ParseError:
            print(file_name)
            continue

        attribute_values = list(root.attrib.values())
        if not attribute_values:
            # Without this guard the id of the previously processed file
            # would be silently reused for this file's text.
            print(file_name)
            continue
        mods_id = attribute_values[-1]

        # A namespaced tag looks like "{uri}local". Reusing the braced prefix
        # in the search path also works for documents without a namespace,
        # where the prefix is simply empty.
        ns_match = re.match(r"\{[^}]*\}", root.tag)
        prefix = ns_match.group(0) if ns_match else ""

        group_content = []
        for group in root.findall(".//" + prefix + "body"):
            content = ' '.join(group.itertext())
            # Turn tabs and line breaks into a space instead of deleting
            # them, so words on either side of a line break stay separate.
            content_clean = re.sub(r"[ \t\r\n]+", " ", content).lower()
            group_content.append(content_clean)

        text = " ".join(group_content)
        text_file.write(mods_id + "\t" + text + "\n")

In [3]:
%%time

# Both outputs are opened write-only and explicitly as UTF-8, so the result
# does not depend on the platform's default encoding when the journals
# contain non-ASCII characters.
with open(catalog_path, "w", encoding="utf-8") as catalog_file, \
        open(text_path, "w", encoding="utf-8") as text_file:

    # Header rows; "mjp_id" is the column that links the two files.
    catalog_file.write("\t".join(["mjp_id", "magazine", "date", "volumeIssue"]) + "\n")
    text_file.write("\t".join(["mjp_id", "text"]) + "\n")

    # Each function streams its rows straight to the open file.
    get_mods_info(list_of_modsFiles, catalog_file)
    get_text(list_of_textFiles, text_file)

/Users/williamquinn/Desktop/DH/Python/MJP/WebScrapper/WebDirectory_Output/Mods/527353-mods.xml
CPU times: user 56.8 s, sys: 8.16 s, total: 1min 4s
Wall time: 1min 26s
