# National Sea Grant Library (NGSL) catalog metadata scrape

The intent here is to get a feel for how many of the NSGL catalog records for California have an associated PDF. The NGSL is closing it's physical space at the University of Rhode Island. I am trying to determine how much of the physical collection still needs to be digitized, and if I should accession the physical collection for safekeeping.

----

In [None]:
# import libraries we need 

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib.parse
import time
import datetime

# If the Chrome browser is already on your system, you should just need to do
# pip instal selenium
# pip install webdriver-manager
# to install the necessary packages.

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from pandas.io.html import read_html
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlparse
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException


### This section will save an HTML file for each page of 100 catalog search results. The search was:

Word(s): [“california” and “sea” and “grant”]

which resulted in 6,190 results at 100 results per page.

In [None]:
# This should fire up the headless Chrome browser in another window
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(2) # Crude way to ensure the page has (mostly) loaded before doing anything else.

In [None]:
total_pages = 62
current_page = 1

url = 'https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Search/AdvancedSearch.aspx?TaskCode=737107&TitleListPageSize=100&CatLevel0Value=&CatLevel1Value=&CatLevel2Value=&CatLevel3Value=&CatLevel4Value='
#grab = requests.get(url)
#soup = BeautifulSoup(grab.text, 'html.parser')
driver.get(url)

for current_page in range(1,total_pages+1):

    grab = driver.page_source

    # Zero-pad the search page number (2 digits minimum) when writing the page source
    with open(f"search_results_page_{current_page:02}.html", "w") as outfile:
        outfile.write(grab)

    next_button = driver.find_element_by_name("ctl00$webopacContentHolder$SearchTitleListControl$titleListNav1$arrowRight")
    if next_button is not None:
        time.sleep(3) # to be polite
        next_button.click()

In [None]:
# Gracefully shut down the headless browser
driver.close()

---
## Now sort through the scraped results pages to obtain an identifier for each item. With that in hand, scrape additional metadata and any associated files for each record. 

---

In [None]:
#io is used for opening and writing files
import io

#glob is used to find all the pathnames matching a specified pattern (here, all text files)
import glob

#os is used to navigate your folder directories (e.g. change folders to where you files are stored)
import os

In [None]:
#Define the file directory here
filedirectory = '/Users/thalassa/github/nsgl/pages-searchResults'

#Change the working directory to the one you just defined
os.chdir(filedirectory)

In [None]:
print(os.listdir(filedirectory))

In [None]:
#Sort all the files in the directory you specified above, alphabetically.

start = datetime.datetime.utcnow()

#For each of those files...
for filename in sorted(os.listdir(filedirectory)):
    #If the filename ends with .html (i.e. if it's actually a text files)
    if filename.endswith('.html'):
        #The file name of the output file adds _data to the end of the file name of the input file
        outfilename = filename.replace('.html', '_data.txt')
        #Open the infput filename
        with open(filename, 'r') as f:
            #Create and open the output filename
            with open(outfilename, 'w') as out:
                soup = BeautifulSoup(f, "html.parser")
                records = str(soup.find_all(href=re.compile("javascript:ViewNewCompleteDisplayRecord")))
                ids = re.findall(r'[A-Z]*\|[0-9]*\|[0-9]*\|[0-9]*',records)
                for element in ids:
                    out.write(element + "\n")
                out.close()
                
end = datetime.datetime.utcnow()
print(f"Finished at {end}, total time {(end-start).seconds / 60.} minutes.")


In [None]:
# merge all of the recrods ids into one file
read_files = glob.glob("*.txt")

with open("all_ids.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())

## Now we want to parse the IDs to get the information we need to build the URLs to fetch the data

E.g., EOSMAIN|5393927|10|2696314

Is a link like this: https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Details/Record.aspx?BibCode=EOSMAIN%7C5393927%7C10%7C2696314

In [None]:
#Define the file directory here
filedirectory = '/Users/thalassa/github/nsgl'

#Change the working directory to the one you just defined
os.chdir(filedirectory)

# Open the text file with the item IDs in it, create a list
f = open("seagrant-ids.txt",'r')
ids = f.read().split("\n")
print(ids[0:10])

In [None]:
# Open text file and create a dataframe
data = pd.read_csv("seagrant-ids.txt",sep="|",names=["eosmain","id1","id2","id3"])
data.head

In [None]:
# create a list of URLs with the ID info as another column in the 'data' dataframe
data['url'] = 'https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Details/Record.aspx?BibCode=EOSMAIN%7C' + data['id1'].astype(str) + '%7C' + data['id2'].astype(str) + '%7C' + data['id3'].astype(str)

In [None]:
# Show one URL 
data['url'][0]

## Now we can launch the ChromeDriver, use the links we generated in the previous steps to visit each catalog record, scrape the catalog metadata and PDF files from the records.

The steps are:
- load catalog record from list of URLs
- find metadata table, read it, save to CSV file
- see if PDF links exist
- grab pdf links, iterate through and download each one

In [None]:
# This should fire up the headless Chrome browser in another window
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(2) # Crude way to ensure the page has (mostly) loaded before doing anything else.

In [None]:
url = data['url'][2206]
driver.get(url)
canclick = driver.find_element_by_xpath('//*[@id="MediaListRepeater_ctl00_MediaHyperLink"]').get_attribute('onclick')
print(canclick)

In [None]:
# Loop through number of URLs in data DataFrame
total_items = len(data)
current_item = 3564

#set up filenames for CSV files
outfilename = data['id1'].astype(str) + "_" + data['id2'].astype(str) + "_" + data['id3'].astype(str) + ".csv"

for current_item in range(current_item,total_items+1):
    item_url = data['url'][current_item]

    driver.get(item_url)
    
    try: 
        table = driver.find_element_by_xpath('//*[@id="wrapper"]/div[2]/div/div/table/tbody/tr/td[2]')
    except NoSuchElementException as exception:
        # pass
        # print("Element Exception Skipped")
        continue 
        
    table_html = table.get_attribute('innerHTML')
    df = read_html(table_html)[0]
        
    # Save metadata to a CSV file, only the relevant columns
    df.to_csv(outfilename[current_item],',',columns=[1,2],header=["MetadataField","Metadata"])


In [27]:
print(current_item)
#print(data[277])
print(data['url'][current_item])


6200


KeyError: 6200

In [None]:
(move up into previous code chunk if you want to download the PDFs again
# Download PDF(s) if available

    canclick = driver.find_element_by_xpath('//*[@id="MediaListRepeater_ctl00_MediaHyperLink"]').get_attribute('onclick')
    if (canclick=="TrackMediaLinkUsage"):



<a href="/seagrant_Linked_Documents/oresu/WR-15-001%20Edwards%20(Cone)%20R-S-18-PD%20(poster)%20.pdf" id="MediaListRepeater_ctl00_MediaHyperLink" class="trigger" title="" target="_blank" onclick="TrackMediaLinkUsage('2','view PDF','0','10593614','10593699','1' );" mediacode="10593699"><img src="/EOSWebOPAC/Images/mediatype90X90-2.png" id="MediaListRepeater_ctl00_ImageHolder" onmouseout="ViewDetailMouseOut(this)" width="90" data-original="" class="loading90 thumbNailImages lazy" height="90" onmouseover="ViewDetail('/EOSWebOPAC/Images/mediatype350X350-2.png','view PDF','',' ','0',$(this),10593699)" style="z-index:1;" alt="view PDF"></a>



    # pdf_relative_link is
    # '/SEAGRANT_Linked_Documents/scu/USC%20Sea%20Grant%20Newsletter_%c2%a0April%202020.pdf'
    grab = driver.page_source
    pdf_relative_link = re.findall("seagrant_Linked_Documents\S*.pdf",grab)
    pdflink = 'https://eos.ucs.uri.edu/' + pdf_relative_link
    pdf_filename = pdf_relative_link.split('/')[-1]
    with open(pdf_filename, 'wb') as f:
        f.write(response.content)
        f.close()


In [None]:
current_item+1

In [None]:
# pdf_relative_link is
# '/SEAGRANT_Linked_Documents/scu/USC%20Sea%20Grant%20Newsletter_%c2%a0April%202020.pdf'
grab = driver.page_source
pdf_relative_link = re.findall("seagrant_Linked_Documents\S*.pdf",grab)
pdflink = 'https://eos.ucs.uri.edu/' + pdf_relative_link
pdf_filename = pdf_relative_link.split('/')[-1]
with open(pdf_filename, 'wb') as f:
    f.write(response.content)
    f.close()

print("File ", i, " downloaded")

In [None]:
# Get URL for PDF from Media box on page (if exists)
pdf_url = driver.find_element_by_xpath('//*[@id="MediaListRepeater_ctl00_MediaHyperLink"]').get_attribute('href')
response = requests.get(pdf_url)
i = 1
j = 0

# Write content in pdf file
pdf = open(df.loc[j,2] + "_" + str(i) + ".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")

In [None]:
url = 'https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Details/Record.aspx?BibCode=EOSMAIN%7C17364449%7C24990%7C2689066'
driver.get(url)


In [None]:
# Now we can loop through all the URLs and scrape the catalog data.
# Create object page 
r = requests.get(url.format())

#page = requests.get(data['url'][0])

# Obtain page's information
soup = BeautifulSoup(r.content, "html.parser")
for link in soup.select("#MediaLinksSection"):
    r = requests.get(link.get("href"), stream=True)
    r.raw.decode_content = True
    with open(link.text+'.pdf', 'wb') as f:
        shutil.copyfileobj(r.raw, f)
#print(soup.prettify())

----

# The following section does not work b/c the NSGL catalog has fatal design errors.
## Some catalog records have page load errors, and I have abandoned trying to scrape catalog records using this method. I am leaving this script for reference. 
-------
### This section will save an HTML file for each catalog item page that is a result of a Search. The search was:

Word(s): [“california” and “sea” and “grant”]

which resulted in 6,190 results at 100 results per page.

In [None]:
# This should fire up the headless Chrome browser in another window
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(2) # Crude way to ensure the page has (mostly) loaded before doing anything else.

Now we need to click through all the items for the first 100 search results, then click on the "next page" arrow, then resume clicking through items until we hit item 200, then click "next page", and so on until we've gone through 62 pages of results (100 items per page). 

In [None]:
# I pulled this chunk of text out to another cell so that I can start on whatever "current_item" I need to (due to errors in the catalog, etc.) 
total_items = 6109
current_item = 1

# Search results URL vvv
# NOTE this URL may not work. In that case, go to "Search the Catalog" and do a Word(s) search for “california and sea and grant”. Then you can run this cell and it will work.
url = 'https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Search/AdvancedSearch.aspx?TaskCode=739469&TitleListPageSize=100&CatLevel0Value=&CatLevel1Value=&CatLevel2Value=&CatLevel3Value=&CatLevel4Value='
driver.get(url)

# click on first record in the list of 100 results on first page. After this you will be clicking through item pages. 
# Comment out this line if you are restarting loop in the middle somewhere. You must be on the record URL you want to start with. 
first_record = driver.find_element_by_xpath('//*[@id="ctl00_webopacContentHolder_SearchTitleListControl_MainRepeater_ctl01_DetailRepeater_ctl01_DetailRow"]/td/a').click()


In [None]:
# This loops through 6109 results, one item at a time. There is logic for what to do when you hit the bottom of a results page.
for current_item in range(current_item,total_items+1):
    
    grab = driver.page_source

    # Zero-pad the search page number (4 digits minimum) when writing the page source
    with open(f"search_results_item_{current_item:04}.html", "w") as outfile:
        outfile.write(grab)
        
    # click the "down arrow" to go to the next record in the list of 100 results on this page.
    # We need to check to see if the down arrow will function - it won't on the last record.
    # Outputs of this attribute check will return "GetNextTitle();" when there are more results to show, 
    # or "return false;" when the button does not work (at record 100 on the page)
        canclick = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_webopacContentHolder_RecordPager_imgGetNext"]').get_attribute('onclick')
    if (canclick=="GetNextTitle();"):
        
        # click the "down arrow" to go to the next record in the list of 100 results on this page.
        next_record = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_webopacContentHolder_RecordPager_imgGetNext"]')
        time.sleep(3) # to be polite
        next_record.click()   
    
    # After the scraper gets to record 100, we need to turn the page by doing a few clicks.
    else:
        # click the "Records" drop-down:
        driver.find_element_by_xpath('//*[@id="ctl00_ctl00_webopacContentHolder_RecordPager_imgExpandNav"]').click()

        # and then click the "next" button:
        driver.find_element_by_xpath('//*[@id="ctl00_ctl00_webopacContentHolder_RecordPager_NextPage"]').click()

        # and then click the first item in the drop-down list, which I think always has the same ID:
        driver.find_element_by_xpath('//*[@id="ctl00_ctl00_webopacContentHolder_RecordPager_RecordRepeater_ctl01_ItemAnchor"]').click()

        continue # to the top of the loop for the 100 records on the page


## Literally everything below here is me randomly trying sh*t to see what sticks.

In [None]:
url = 'https://eos.ucs.uri.edu/EOSWebOPAC/OPAC/Search/AdvancedSearch.aspx?TaskCode=737107&TitleListPageSize=100&CatLevel0Value=&CatLevel1Value=&CatLevel2Value=&CatLevel3Value=&CatLevel4Value='
grab = requests.get(url)
soup = BeautifulSoup(grab.text, 'html.parser') # parse text from the html

In [None]:
TABLE_SELECTOR = "table.TableCellPadding2Px:nth-child(3) > tbody:nth-child(1)"

#ctl00_webopacContentHolder_SearchTitleListControl_titleListNav1_arrowRight
document.querySelector("#ctl00_webopacContentHolder_SearchTitleListControl_titleListNav1_arrowRight") 

#ctl00_webopacContentHolder_SearchTitleListControl_MainRepeater_ctl01_DetailColumn > table:nth-child(1)

#ctl00_webopacContentHolder_SearchTitleListControl_MainRepeater_ctl01_DetailColumn > table:nth-child(1) > tbody:nth-child(1)

#ctl00_webopacContentHolder_SearchTitleListControl_MainRepeater_ctl01_DetailRepeater_ctl01_DetailRow 

In [None]:
table_test = soup.find_all('table', class_='DefaultTable')

In [None]:
print(table_test)

In [None]:
table_test = soup.find_all('tr', class_='HorDisplayAltRow')

In [None]:
test = soup.find_all('tbody')

In [None]:
print(test)

In [None]:
print(soup.re_compile("seagrant_Linked_Documents"))

In [None]:
urls = []
for link in soup.find_all('a'):
    print(link.get('href'))

In [None]:
# opening a file in write mode
f = open("test1.txt", "w")
# traverse paragraphs from soup
for link in soup.find_all("a"):
   data = link.get('href')
   f.write(data)
   f.write("\n")
 
f.close()

In [None]:
# traverse paragraphs from soup
for link in soup.find_all("a"):
   data = link.get('href')
   print(data)
   