Import necessary libraries

In [3]:
import html
import csv 
import requests 
import xml.etree.ElementTree as ET 

Define the loadRSS() function, which performs a GET request and saves the EAD XML file locally

In [4]:
def loadRSS(url='https://archives.lib.duke.edu/catalog/africanamericansfilm/xml',
            xml_file='afamfilm.xml', timeout=30):
    """Download the EAD XML finding aid and save it to a local file.

    Parameters
    ----------
    url : str, optional
        Address of the EAD XML document. Defaults to the finding aid used
        throughout this notebook.
    xml_file : str, optional
        Path of the local file to write. Defaults to ``'afamfilm.xml'``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems and timeouts.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an error page as if it were the XML file.
    resp.raise_for_status()
    # Binary mode: write the bytes exactly as served, without re-encoding.
    with open(xml_file, 'wb') as f:
        f.write(resp.content)

Use the ElementTree XML API to parse the XML file

In [5]:
try:
    ElementTree = ET.parse('afamfilm.xml')
except FileNotFoundError:
    # On a fresh kernel the file has not been downloaded yet, because
    # loadRSS() is otherwise first called in the main script cell further
    # down. Fetch it now so the notebook runs top to bottom.
    loadRSS()
    ElementTree = ET.parse('afamfilm.xml')

Use the getroot() method to get the root element, then print it to show its tag. The namespace shown in braces in the printed tag must be copied into the parseXML() function below

In [6]:
# getroot() returns the top-level element of the parsed tree.
root = ElementTree.getroot()
# The printed tag carries the document's XML namespace in braces; parseXML()
# below has to use that same namespace in its findall() lookups.
print(root)

<Element '{urn:isbn:1-931666-22-9}ead' at 0x7fed5bf94090>


Beginning with the root element, loop down through the XML file hierarchy to reveal the EAD headings present in the document

In [None]:
def _print_levels(parent, depth):
    """Print every child of `parent`, then recurse `depth - 1` levels deeper.

    Elements are printed in document order, each one immediately followed by
    its own descendants.
    """
    if depth == 0:
        return
    for child in parent:
        print(child)
        _print_levels(child, depth - 1)


# Show the five levels of the hierarchy directly below the root element.
_print_levels(root, 5)

Having identified what values in the EAD are necessary for your entity data model, use the .findall() function and looping to create a dictionary representation of each item and add each dictionary to the list of items. Note the use of the html.unescape function here due to the use of embedded html elements in the EAD XML file within <scopecontent> fields

In [8]:
def parseXML(xmlfile):
    """Parse an EAD XML finding aid into a list of item dictionaries.

    One dictionary is produced for every ``<c01>`` component. Depending on
    which elements the component contains, the dictionary holds any of:

    * ``title``       - text of a ``<title>`` found inside ``<unittitle>``
    * ``date``        - text of ``<unitdate>``
    * ``container``   - the container's ``type`` attribute and text joined by
      a space, e.g. ``'box 1'``
    * ``description`` - text of a ``<p>`` found inside ``<scopecontent>``

    A key is left out when its element is absent. When an element occurs
    more than once within a component, the last occurrence wins.

    Parameters
    ----------
    xmlfile : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    list of dict
        One dictionary per ``<c01>`` component, in document order.
    """
    # Prefix mapping so the EAD namespace is written once, not in every path.
    ns = {'ead': 'urn:isbn:1-931666-22-9'}

    tree = ET.parse(xmlfile)
    root = tree.getroot()
    items = []
    for item in root.findall('.//ead:c01', ns):
        data = {}
        for did_headings in item.findall('.//ead:did', ns):
            for unittitle in did_headings.findall('.//ead:unittitle', ns):
                for title in unittitle.findall('.//ead:title', ns):
                    data['title'] = title.text
            for unitdate in did_headings.findall('.//ead:unitdate', ns):
                data['date'] = unitdate.text
            for container in did_headings.findall('.//ead:container', ns):
                # A missing type attribute or empty text must not abort the
                # whole parse; join only the parts that are present.
                parts = (container.get('type', ''), container.text or '')
                data['container'] = ' '.join(part for part in parts if part)
        for scopecontent in item.findall('.//ead:scopecontent', ns):
            for p in scopecontent.findall('.//ead:p', ns):
                # html.unescape() works on strings, so it must be given the
                # paragraph text, not the Element. The text is None when the
                # paragraph opens with a child element.
                text = html.unescape(p.text or '')
                # Remove the line-break-plus-indentation runs left over from
                # the layout of the XML source.
                data['description'] = text.replace('\n           ', '')
        items.append(data)
    return items

Define a savetoCSV function to write your items list to a .csv file. Replace the column names as necessary

In [9]:
def savetoCSV(items, csv_file):
    """Write item dictionaries to a CSV file with a header row.

    Parameters
    ----------
    items : iterable of dict
        Rows to write. Keys must come from ``title``, ``date``,
        ``container`` and ``description``; a missing key is written as an
        empty field.
    csv_file : str
        Path of the CSV file to create (an existing file is overwritten).

    Raises
    ------
    ValueError
        If a dictionary contains a key that is not one of the columns.
    """
    columns = ['title', 'date', 'container', 'description']
    # newline='' leaves line endings to the csv module; without it every row
    # is followed by a blank line on Windows. UTF-8 is fixed explicitly so the
    # file does not depend on the platform's default encoding.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(items)

Here is the main script. Run this code block after all previous cells to produce required archival item data as a .csv

In [10]:
# 1. Download the finding aid XML.
loadRSS()
# 2. Parse the downloaded file into a list of item dictionaries.
items = parseXML('afamfilm.xml')
# 3. Write those dictionaries out as the rows of afamfilm.csv.
savetoCSV(items, 'afamfilm.csv')

Code cells below this comment demonstrate how the pandas library can be used to read the resulting .csv into a dataframe and subsequent methods can clean and deduplicate data for other entity categories present in the archival item data

In [69]:
import pandas as pd

In [70]:
# Re-bind `items` to a DataFrame read from the CSV written by the main script
# (up to this point `items` was the list of dictionaries from parseXML()).
items = pd.read_csv('afamfilm.csv')

In [78]:
# Select the title and date columns as the starting point for a films table.
films = items[['title','date']]

In [79]:
# Preview the first rows of the selection.
films.head()

Unnamed: 0,title,date
0,48 Hours,1982
1,A.K.A. Cassius Clay,1970
2,Aaron Loves Angela,1975
3,Abby,1974
4,Across 110th Street,1972


In [80]:
# Display the frame; the rendered output abbreviates the middle rows with "...".
films

Unnamed: 0,title,date
0,48 Hours,1982
1,A.K.A. Cassius Clay,1970
2,Aaron Loves Angela,1975
3,Abby,1974
4,Across 110th Street,1972
...,...,...
466,"World's Greatest Athlete, The",1973
467,Wusa,1970
468,Youngblood,1978
469,Zebra Killer,1974


In [81]:
# Drop rows whose title/date pair repeats an earlier row. The surviving rows
# keep their original index labels, so the index may contain gaps afterwards.
films = films.drop_duplicates()
films

Unnamed: 0,title,date
0,48 Hours,1982
1,A.K.A. Cassius Clay,1970
2,Aaron Loves Angela,1975
3,Abby,1974
4,Across 110th Street,1972
...,...,...
466,"World's Greatest Athlete, The",1973
467,Wusa,1970
468,Youngblood,1978
469,Zebra Killer,1974


In [82]:
# Write the films table to films.csv. By default to_csv() also writes the
# index as an unnamed first column.
# NOTE(review): pass index=False if that column is not wanted downstream.
films.to_csv('films.csv')

In [83]:
# Take the container column on its own (a Series) to build a containers list.
containers = items['container']

In [85]:
# Keep the first occurrence of each distinct container value; index labels of
# the surviving rows are unchanged.
containers = containers.drop_duplicates()

In [86]:
# Display the distinct container values.
containers


0                   box 1
2      oversize-folder 14
3       oversize-folder 1
4                   box 5
16                 tube 1
26     oversize-folder 15
40     oversize-folder 16
48      oversize-folder 2
59     oversize-folder 17
72     oversize-folder 18
81      oversize-folder 3
84                  box 6
105    oversize-folder 19
121    oversize-folder 20
132                 box 2
133     oversize-folder 4
145    oversize-folder 21
167    oversize-folder 22
188     oversize-folder 5
199    oversize-folder 23
220     oversize-folder 6
221    oversize-folder 24
239    oversize-folder 25
241                 box 3
257    oversize-folder 26
265     oversize-folder 7
289    oversize-folder 27
293     oversize-folder 8
310    oversize-folder 28
339                 box 4
343    oversize-folder 29
347     oversize-folder 9
360    oversize-folder 30
364    oversize-folder 10
383    oversize-folder 31
403    oversize-folder 32
425    oversize-folder 11
442    oversize-folder 12
451    overs

In [89]:
# Write the container values to containers.csv. By default to_csv() also
# writes the index as an unnamed first column.
# NOTE(review): pass index=False if that column is not wanted downstream.
containers.to_csv("containers.csv")