In [2]:
import requests
workset = requests.get("https://data.htrc.illinois.edu/ef-api/worksets/66477bae2600005807132b25")

workset_content = workset.content

In [3]:
workset_json = workset.json()

In [4]:
workset_json['data']['htids']

['ien.35556029664190',
 'mdp.39015032749130',
 'mdp.39015061153733',
 'mdp.39015047473577',
 'mdp.39015055093465',
 'inu.30000094605429',
 'mdp.39015059959802',
 'mdp.39076002787351',
 'mdp.39015066087613',
 'mdp.49015003142743']

In [11]:
import xml.etree.ElementTree as ET
import pandas as pd

In [7]:
def extract_date(field):
    tree = ET.ElementTree(ET.fromstring(field))
    find = False
    date_list = []
    for x in tree.iter():
        if x.tag in ["{http://www.loc.gov/MARC21/slim}datafield"]:
            if x.attrib["tag"] in ["DAT"]:            
                #print(x.tag)
                temp_tag = x
                find = True  
                continue
        if find:
            if x.tag in ["{http://www.loc.gov/MARC21/slim}subfield"]:          
                #print(x.text)
                date_list.append((temp_tag.tag,temp_tag.attrib,x.attrib,x.text))
            else:
                find = False
                #for y in tree.iter():
                #    #print(y)
                #    #print(x)
    return date_list

In [14]:
import datetime

def format_time(tt):    
    try:
        aha = datetime.datetime.strptime(tt,"%Y%m%d%H%M%S.0")
    except:
        try:
            aha = datetime.datetime.strptime(tt,"%Y-%m-%dT%H:%M:%SZ")
        except:
            aha = datetime.datetime.strptime(tt.split()[0],"%Y%m%d")
    return aha

In [15]:
import json
htid_list = ['ien.35556029664190',
 'mdp.39015032749130',
 'mdp.39015061153733',
 'mdp.39015047473577',
 'mdp.39015055093465',
 'inu.30000094605429',
 'mdp.39015059959802',
 'mdp.39076002787351',
 'mdp.39015066087613',
 'mdp.49015003142743']
output_list = []
for htid in htid_list:
    marc_url = f"https://catalog.hathitrust.org/api/volumes/full/htid/{htid}.json"
    marc = requests.get(marc_url)
    data = json.loads(marc.content)
    field = list(data['records'].values())[0]['marc-xml']
    date_extract = extract_date(field)
    date_extract = [[htid] + list(x) for x in date_extract]
    output_list = output_list + date_extract

field_frame = pd.DataFrame(output_list)
field_frame["dtime"] = field_frame[4].apply(lambda x: format_time(x))
field_frame

Unnamed: 0,0,1,2,3,4,dtime
0,ien.35556029664190,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '0', 'ind2': ' '}",{'code': 'a'},20010706151342.0,2001-07-06 15:13:42
1,ien.35556029664190,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '0', 'ind2': ' '}",{'code': 'b'},20110118000000.0,2011-01-18 00:00:00
2,ien.35556029664190,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '1', 'ind2': ' '}",{'code': 'a'},20120815042005.0,2012-08-15 04:20:05
3,ien.35556029664190,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '1', 'ind2': ' '}",{'code': 'b'},2023-09-13T17:55:06Z,2023-09-13 17:55:06
4,ien.35556029664190,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '2', 'ind2': ' '}",{'code': 'a'},2023-09-13T17:30:02Z,2023-09-13 17:30:02
5,mdp.39015032749130,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '0', 'ind2': ' '}",{'code': 'a'},20010212000000.0,2001-02-12 00:00:00
6,mdp.39015032749130,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '0', 'ind2': ' '}",{'code': 'b'},20220804000000.0,2022-08-04 00:00:00
7,mdp.39015032749130,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '1', 'ind2': ' '}",{'code': 'a'},20220804060506.0,2022-08-04 06:05:06
8,mdp.39015032749130,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '1', 'ind2': ' '}",{'code': 'b'},2023-11-06T18:41:46Z,2023-11-06 18:41:46
9,mdp.39015061153733,{http://www.loc.gov/MARC21/slim}datafield,"{'tag': 'DAT', 'ind1': '0', 'ind2': ' '}",{'code': 'a'},20041013144251.0,2004-10-13 14:42:51


In [16]:
# Alternate version of extract_date using "974" tag in MARC Records and grouping by volumeID. 

def extract_date_v3(htid):
    marc_url = f"https://catalog.hathitrust.org/api/volumes/full/htid/{htid}.json"
    marc = requests.get(marc_url)
    data = json.loads(marc.content)
    field = list(data['records'].values())[0]['marc-xml']
    
    tree = ET.ElementTree(ET.fromstring(field))
    
    output = []
    
    i = 1
    u_group = {}
    for x in tree.findall('//{http://www.loc.gov/MARC21/slim}datafield[@tag="974"]/'):
        output.append((htid,x.tag,i,x.attrib,x.text))
        if x.attrib=={'code': 'u'}:
            u_group[i] = x.text
        if x.attrib=={'code': 't'}:
            i+=1
    combined_df = pd.DataFrame(output,columns=["id","tag","gr","attr","val"]).merge(pd.DataFrame([u_group]).transpose(),left_on="gr",right_index=True)
    return combined_df

In [17]:
extract_date_v3("uva.x001016358")

  for x in tree.findall('//{http://www.loc.gov/MARC21/slim}datafield[@tag="974"]/'):


Unnamed: 0,id,tag,gr,attr,val,0
0,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'b'},MIU,mdp.39015000542749
1,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'c'},MIU,mdp.39015000542749
2,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'd'},20221121,mdp.39015000542749
3,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 's'},google,mdp.39015000542749
4,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'u'},mdp.39015000542749,mdp.39015000542749
5,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'y'},1964,mdp.39015000542749
6,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'r'},ic,mdp.39015000542749
7,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 'q'},bib,mdp.39015000542749
8,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,1,{'code': 't'},US bib date1 >= 1929,mdp.39015000542749
9,uva.x001016358,{http://www.loc.gov/MARC21/slim}subfield,2,{'code': 'b'},TXU,txu.059173025247198
