#Harvest and modify bib records for digitized newspapers freom Library of Congress
## Load the required libraries

In [1]:
import os
import urllib
import pymarc
import marcx
import numpy as np
import pandas as pd
from xml.etree.ElementTree import fromstring, ElementTree
import time
import codecs
# open a file to write the output to
out = codecs.open('newspaper_recs1.xml','w','utf-8')
recs_str = '<?xml version="1.0" encoding="UTF-8" standalone="no"?><collection>'
footer = '</collection>'


# Library of Congress has a list of digitized newspapers on the website in the following format:
Persistent Link | State | Title | LCCN | OCLC | ISSN | No. of Issues | First Issue Date | Last Issue Date | More Info

http://chroniclingamerica.loc.gov/lccn/sn85025905/issues/ | Alabama | Chattanooga daily rebel. (Selma, Ala.) 1865-1865 | sn85025905 | 2638820 | 2328-5869 | 3 | April 19, 1865 | April 27, 1865 | http://chroniclingamerica.loc.gov/lccn/sn85025905/essays/
http://chroniclingamerica.loc.gov/lccn/sn82015209/issues/ | Alabama | The Chattanooga Daily Rebel. (Chattanooga, Tenn.) 1862-1864 | sn82015209 | 8807897 | 2328-5826 | 228 | Aug. 9, 1862 | April 22, 1864 | http://chroniclingamerica.loc.gov/lccn/sn82015209/essays/
http://chroniclingamerica.loc.gov/lccn/sn82014371/issues/ | Alabama | The daily Chattanooga rebel. (Griffin, Ga.) 1864-1865 | sn82014371 | 8793427 | 2328-5761 | 41 | June 9, 1864 | April 15, 1865 | http://chroniclingamerica.loc.gov/lccn/sn82014371/essays/
http://chroniclingamerica.loc.gov/lccn/sn83045160/issues/ | Alabama | Memphis daily appeal. (Memphis, Tenn.) 1847-1886 | sn83045160 | 9355541 | 2166-1898 | 8088 | Jan. 1, 1857 | Jan. 27, 1886 | http://chroniclingamerica.loc.gov/lccn/sn83045160/essays/

# We read the library of congress list of digitized newspapers as a list of lines

In [2]:

url = 'http://chroniclingamerica.loc.gov/newspapers.txt'
f = urllib.request.urlopen(url)
lines = f.readlines()

# we do a bit of cleanup. 
The first line is a header line with columns separate by the pipe symbol '|'
we decode the line to convert the type from bytes to a string, remove the end of line character, and split the line on the pipe character to create a list of column names. We use this to create our dataframe

In [3]:

for line in lines[0:1]:
    c = line.decode().replace('\n','').split('|')
    i = 0
    while i < 10:
        c[i] = c[i].lstrip().rstrip()
        i+=1
    print("headers: ",c)
df = pd.DataFrame(columns=c)
# for each line in the list of lines, we create a list following a similar process to above. For each line
# we create a dict that is appended to the dataframe
for line in lines[1:]:
    x = line.decode().replace('\n','').split('|')
    d = {}
    z = 0
    if len(x) == 10:
        while z < 10:
            d[c[z]] = x[z].lstrip().rstrip()
            z += 1
        df = df.append(d,ignore_index=True)
df.head()

headers:  ['Persistent Link', 'State', 'Title', 'LCCN', 'OCLC', 'ISSN', 'No. of Issues', 'First Issue Date', 'Last Issue Date', 'More Info']


Unnamed: 0,Persistent Link,State,Title,LCCN,OCLC,ISSN,No. of Issues,First Issue Date,Last Issue Date,More Info
0,http://chroniclingamerica.loc.gov/lccn/sn85025...,Alabama,"Chattanooga daily rebel. (Selma, Ala.) 1865-1865",sn85025905,2638820,2328-5869,3,"April 19, 1865","April 27, 1865",http://chroniclingamerica.loc.gov/lccn/sn85025...
1,http://chroniclingamerica.loc.gov/lccn/sn82015...,Alabama,"The Chattanooga Daily Rebel. (Chattanooga, Ten...",sn82015209,8807897,2328-5826,228,"Aug. 9, 1862","April 22, 1864",http://chroniclingamerica.loc.gov/lccn/sn82015...
2,http://chroniclingamerica.loc.gov/lccn/sn82014...,Alabama,"The daily Chattanooga rebel. (Griffin, Ga.) 18...",sn82014371,8793427,2328-5761,41,"June 9, 1864","April 15, 1865",http://chroniclingamerica.loc.gov/lccn/sn82014...
3,http://chroniclingamerica.loc.gov/lccn/sn83045...,Alabama,"Memphis daily appeal. (Memphis, Tenn.) 1847-1886",sn83045160,9355541,2166-1898,8088,"Jan. 1, 1857","Jan. 27, 1886",http://chroniclingamerica.loc.gov/lccn/sn83045...
4,http://chroniclingamerica.loc.gov/lccn/sn94051...,Arizona,"The argus. (Holbrook, Ariz.) 1895-1900",sn94051341,25084608,2375-169X,220,"Dec. 12, 1895","May 5, 1900",http://chroniclingamerica.loc.gov/lccn/sn94051...


# we grab the oclc numbers as a Pandas Series from the dataframe and use the oclc number to query OCLC WorldCat to grab the marcxml record. Each record is appended to a string and written to the output file.

In [None]:
## OCLC requires an api key to use the WorldCat Search API. 
## You can request a key at this address:  http://www.oclc.org/developer/develop/web-services/worldcat-search-api.en.html
WSKEY = <YourWorldCatSearchAPIKey>

oclc_list = df['OCLC']
wc_base = 'http://www.worldcat.org/webservices/catalog/content/'
wc_end = '?wskey=' + WSKEY

recs_str = '<?xml version="1.0" encoding="UTF-8" standalone="no"?><collection>'
footer = '</collection>'
out.write(recs_str)
for oclc in oclc_list:
    time.sleep(1)
    wc_url = (wc_base + oclc + wc_end).replace(' ','')
    f = urllib.request.urlopen(wc_url).read().decode().replace('\n','')
    recs_str += f[54:]
    out.write(f[54:])
recs_str += footer
out.write(footer)
out.close()


# We now have the harvested marcxml records in the file: newspaper_recs.xml 
We will parse those into marc records using the PyMarc library. marc_records holds a list of marc records.


In [4]:
marc_records = pymarc.parse_xml_to_array(codecs.open('newspaper_recs.xml','rb','utf-8'))

In [5]:
for rec in marc_records:
    print(rec)
    print()

=LDR  00000cas a2200000 a 4500
=001  2638820
=008  761221d18651865alu\x\ne\\\\\\0\\\a0eng\\
=010  \\$asn 85025905 $z   76640531 
=022  0\$a2328-5869$21
=130  0\$aChattanooga daily rebel (Selma, Ala. : 1865)
=222  \0$aChattanooga daily rebel (Selma, Ala. 1865. Print)
=245  10$aChattanooga daily rebel.
=246  30$aDaily rebel
=260  \\$aSelma, Ala. :$bF.M. Paul
=300  \\$av.
=651  \0$aSelma (Ala.)$vNewspapers.
=651  \0$aDallas County (Ala.)$vNewspapers.
=655  \7$aNewspapers.$2fast$0(OCoLC)fst01423814
=651  \7$aAlabama$zDallas County.$2fast$0(OCoLC)fst01207055
=651  \7$aAlabama$zSelma.$2fast$0(OCoLC)fst01216305
=856  41$uhttp://www.loc.gov/chroniclingamerica/lccn/sn85025905/issues


=LDR  00000cas a2200000 a 4500
=001  8807897
=008  820925d18621864gaudn\ne\\\\\\0\\\a0eng\\
=010  \\$asn 82015209 
=022  0\$a2328-5826$21
=222  \4$aThe Chattanooga daily rebel$b(Print)
=245  04$aThe Chattanooga Daily Rebel.
=246  1\$aDaily Rebel
=246  1\$iIssue for <Aug. 9, 1862> also called:$aRebel
=260  \\$aChat

#We notice that several records (almost 300) have either incorrect URLs or no URL.

In [6]:
for rec in marc_records:
    rec = marcx.FatRecord.from_record(rec)
    print(rec.title())
    print(rec.publisher(), ' / ', rec['260']['a'])
    try:
        print(rec['856']['u'])
    except Exception as e:
        pass
    print()

Chattanooga daily rebel.
F.M. Paul  /  Selma, Ala. :
http://www.loc.gov/chroniclingamerica/lccn/sn85025905/issues

The Chattanooga Daily Rebel.
F.M. Paul  /  Chattanooga, Tenn. :
http://www.loc.gov/chroniclingamerica/lccn/sn82015209/issues

The daily Chattanooga rebel.
Franc. M. Paul  /  Griffin, Ga. :
http://www.loc.gov/chroniclingamerica/lccn/sn82014371/issues

Memphis daily appeal.
S.T. Seawell & W.N. Stanton,  /  Memphis, Tenn. :
http://www.loc.gov/chroniclingamerica/lccn/sn83045160/issues

The argus.
A.F. Banta,  /  Holbrook, Ariz. :
http://www.loc.gov/chroniclingamerica/lccn/sn94051341/issues

The Arizona champion.
A.E. Fay,  /  Peach Springs, Mohave County, A.T. [Ariz.] :
http://www.loc.gov/chroniclingamerica/lccn/sn82016246/issues

Arizona citizen.
J. Wasson,  /  Tucson, Pima County, A.T. [i.e. Ariz.] :
http://www.loc.gov/chroniclingamerica/lccn/sn82014896/issues

The Arizona daily orb.
A.W. Howe,  /  Bisbee, Ariz. :
http://www.loc.gov/chroniclingamerica/lccn/sn94050505/issues


TypeError: 'NoneType' object is not subscriptable

In [7]:
counter = 0
for rec in marc_records:
    try:

        if not 'chroniclingamerica' in rec['856']['u']:
            counter += 1
            print(counter)
            print(rec)
    except Exception as e:
        counter +=1
        print('***********')
        print('No 856')
        print(counter)
        print(rec)
        print('***********')

***********
No 856
1
=LDR  00000cas a2200000Ma 4500
=001  213806157
=008  840204d1898197uazuwr\nea\\\\\0\\\\0eng\\
=010  \\$asn 87062055 
=130  0\$aCoconino sun (Flagstaff, Ariz. : 1898)
=245  14$aThe Coconino sun$h[microform].
=260  \\$aFlagstaff, Ariz. :$bC.M. Funston
=300  \\$av.
=500  \\$aCeased between 1978 and 1979?
=500  \\$a"Independent." Cf. Ayer, 1979.

***********
2
=LDR  00000cas a2200000 a 4500
=001  36207053
=008  970114d18871890azudr\ne\\\\\\0\\\a0eng\\
=010  \\$asn 96060681 
=022  1\$a2157-801X$21
=130  0\$aTombstone daily epitaph (Tombstone, Ariz. : 1887)
=222  \0$aTombstone daily epitaph$b(Tombstone, Ariz. 1887)
=245  10$aTombstone daily epitaph.
=246  13$aDaily Tombstone epitaph
=246  17$aDaily epitaph
=246  17$aTombstone epitaph
=260  \\$aTombstone, Ariz. :$bJ.O. Dunbar,$c1887-1890.
=300  \\$av.
=651  \0$aTombstone (Ariz.)$vNewspapers.
=651  \0$aCochise County (Ariz.)$vNewspapers.
=651  \7$aArizona$zCochise County.$2fast$0(OCoLC)fst01207405
=651  \7$aArizona$zTombst

# We can will remove the existing URL and replace it with the URL from the Library of Congress list

##First we will define a function to return the desired 856 field

In [8]:
def get_856(rec):
    #print(rec) #testing
    oclc = rec['001'].data
    #print(rec['001']) #testing
    #print(oclc) #testing
    mask = df['OCLC'] == oclc
    d = df[mask]
    url = d['Persistent Link']
    try:
        url = url.values[0]
    except Exception as e:
        url = 'not found'
        if rec.has('010.a'):
            lccn = rec['010']['a']
            url = 'http://chroniclingamerica.loc.gov/lccn/' + lccn.replace(' ','') +'/issues/'
    return(url)

##Here we define a function to add and remove the desired fields

In [9]:
def add_remove_fields(rec):
    oclc = rec['001'].data
    url = get_856(rec)
    note = generate_note(rec)
    if rec.has('001'):
        rec.remove('001')
        _035 = pymarc.Field(tag = '035',indicators = ['0',' '],subfields = ['a', '(OCoLC)' + oclc])
        rec.add_ordered_field(_035)
    if rec.has('856'):
        rec.remove('856')
    rec.add('856', u = url, z = 'Chronicling America', indicators=['4','0'])
    if len(note) > 0:
        _500 = pymarc.Field(tag = '500',indicators = [' ',' '],subfields = ['a', note])
        rec.add_ordered_field(_500)
    return(rec)

###Let's also generate a note field that contains information about the issuues digitized

In [10]:
def generate_note(rec):
    note = ''
    oclc = rec['001'].data 
    mask = df['OCLC'] == oclc
    d = df[mask]
    issues = d['No. of Issues']
    sdate = d['First Issue Date']
    edate = d['Last Issue Date']
    issues = issues.values[0]
    sdate = sdate.values[0]
    edate = edate.values[0]
    note = 'Includes ' + issues + ' issues published between ' + sdate + ' and ' + edate
    return(note)

In [11]:
##test the function
rec = marcx.FatRecord.from_record(marc_records[10])
#print(add_remove_fields(rec))


## Run the main routine to update the records

In [12]:
for rec in marc_records:
    rec = marcx.FatRecord.from_record(rec)
    rec = add_remove_fields(rec)
    print(rec)
    print()


=LDR  00000cas a2200000 a 4500
=008  761221d18651865alu\x\ne\\\\\\0\\\a0eng\\
=010  \\$asn 85025905 $z   76640531 
=022  0\$a2328-5869$21
=035  0\$a(OCoLC)2638820
=130  0\$aChattanooga daily rebel (Selma, Ala. : 1865)
=222  \0$aChattanooga daily rebel (Selma, Ala. 1865. Print)
=245  10$aChattanooga daily rebel.
=246  30$aDaily rebel
=260  \\$aSelma, Ala. :$bF.M. Paul
=300  \\$av.
=500  \\$aIncludes 3 issues published between April 19, 1865 and April 27, 1865
=651  \0$aSelma (Ala.)$vNewspapers.
=651  \0$aDallas County (Ala.)$vNewspapers.
=655  \7$aNewspapers.$2fast$0(OCoLC)fst01423814
=651  \7$aAlabama$zDallas County.$2fast$0(OCoLC)fst01207055
=651  \7$aAlabama$zSelma.$2fast$0(OCoLC)fst01216305
=856  40$uhttp://chroniclingamerica.loc.gov/lccn/sn85025905/issues/$zChronicling America


=LDR  00000cas a2200000 a 4500
=008  820925d18621864gaudn\ne\\\\\\0\\\a0eng\\
=010  \\$asn 82015209 
=022  0\$a2328-5826$21
=035  0\$a(OCoLC)8807897
=222  \4$aThe Chattanooga daily rebel$b(Print)
=245  04$a

IndexError: index 0 is out of bounds for axis 0 with size 0

### Check to see if any records lack an 856 field

In [13]:
counter = 0
for rec in marc_records:
    try:

        if not 'chroniclingamerica' in rec['856']['u']:
            counter += 1
            print(counter)
            print(rec)
    except Exception as e:
        counter +=1
        print('***********')
        print('No 856')
        print(counter)
        print(rec)
        print('***********')

***********
No 856
1
=LDR  00000cas a2200000Ma 4500
=001  213806157
=008  840204d1898197uazuwr\nea\\\\\0\\\\0eng\\
=010  \\$asn 87062055 
=130  0\$aCoconino sun (Flagstaff, Ariz. : 1898)
=245  14$aThe Coconino sun$h[microform].
=260  \\$aFlagstaff, Ariz. :$bC.M. Funston
=300  \\$av.
=500  \\$aCeased between 1978 and 1979?
=500  \\$a"Independent." Cf. Ayer, 1979.

***********
2
=LDR  00000cas a2200000 a 4500
=001  36207053
=008  970114d18871890azudr\ne\\\\\\0\\\a0eng\\
=010  \\$asn 96060681 
=022  1\$a2157-801X$21
=130  0\$aTombstone daily epitaph (Tombstone, Ariz. : 1887)
=222  \0$aTombstone daily epitaph$b(Tombstone, Ariz. 1887)
=245  10$aTombstone daily epitaph.
=246  13$aDaily Tombstone epitaph
=246  17$aDaily epitaph
=246  17$aTombstone epitaph
=260  \\$aTombstone, Ariz. :$bJ.O. Dunbar,$c1887-1890.
=300  \\$av.
=651  \0$aTombstone (Ariz.)$vNewspapers.
=651  \0$aCochise County (Ariz.)$vNewspapers.
=651  \7$aArizona$zCochise County.$2fast$0(OCoLC)fst01207405
=651  \7$aArizona$zTombst

## write the records out to a new file

In [15]:
out_processed = codecs.open('newspaper_recs_processed.xml','w','utf-8')
out_processed.write(recs_str)
for rec in marc_records:
    out_processed.write(pymarc.record_to_xml(rec.decode('utf-8') + "\n")
out_processed.write(footer)
out_processed.close()

SyntaxError: invalid syntax (<ipython-input-15-70d0d85930fc>, line 5)

In [16]:
out_processed.write(footer)

In [17]:
out_processed.close()