# Appendix III: Webscraping the Library of Congress Catalog

This code reads the emailed catalog records, saved as text files, and extracts the permalink of each Library of Congress catalog entry. It then loops over every permalink and scrapes the ISBN and the genre/form descriptions from the corresponding Library of Congress catalog page.

In [1]:
import re
import bs4 #this may be appendices only
import requests #this may be appendices only
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
import datetime
import seaborn as sns
from   sklearn.linear_model import LinearRegression, LogisticRegression
from   sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

In [2]:
#Get LOC Permalinks from email text
#ISBNs were entered into the advanced search in the LC Catalog in groups of 85 - 170. Results can be emailed/exported in batches of 100;
#most searches resulted in fewer than 150 records.
#The text-formatted results from each export were copied into a single text document per round of searching, and that text is processed here to retrieve the permalinks to the LC records.
#Multiple rounds of searches were run. The first round resulted in approximately 450 non-successes, the second in slightly fewer than 100 non-successes, and the third resulted in only 80 unmatched records.

#Number of leading characters dropped from every permalink row (the "Permalink" label plus its separator).
#NOTE(review): assumes the label and separator always occupy exactly 12 characters in the exported text - confirm against the files.
_PERMALINK_LABEL_WIDTH = 12


def _extract_permalinks(lines):
    """Return the permalink found on every 'Permalink' row of *lines*.

    lines: any iterable of text lines (e.g. an open text file).
    Returns a list of strings, in the order the rows were encountered.
    Leading whitespace and a trailing newline are removed from each row
    before the label is sliced off.
    """
    links = []
    for ln in lines:
        stripped = ln.lstrip()
        if stripped.startswith('Permalink'): #find permalink rows of text
            links.append(stripped.rstrip('\n')[_PERMALINK_LABEL_WIDTH:]) #retain only the actual link
    return links


#One export file per search round. The `with` blocks guarantee each file is closed even if reading fails.
with open("LOC Email Data.txt",encoding='utf8') as loc_email:
    print(loc_email)
    link_lst1 = _extract_permalinks(loc_email)

with open("LOC Email Data2.txt",encoding='utf8') as loc_email:
    print(loc_email)
    link_lst2 = _extract_permalinks(loc_email)

with open("LOC Email Data3.txt",encoding='utf8') as loc_email:
    print(loc_email)
    link_lst3 = _extract_permalinks(loc_email)

link_lst=list(set(link_lst1 + link_lst2 + link_lst3))#concatenate all lists of links into a single de-duplicated list to loop over for webscraping.
len(link_lst)

<_io.TextIOWrapper name='LOC Email Data.txt' mode='r' encoding='utf8'>
<_io.TextIOWrapper name='LOC Email Data2.txt' mode='r' encoding='utf8'>
<_io.TextIOWrapper name='LOC Email Data3.txt' mode='r' encoding='utf8'>


1003

In [None]:
def _scrape_record(permalink, timeout=30):
    """Fetch one LC catalog page and return ``(isbn, genres)`` for it.

    permalink: URL of the catalog record.
    timeout:   seconds to wait for the server before giving up.

    Returns a tuple ``(isbn, genres)``:
      * isbn   - first 13 characters of the first ISBN listed on the page
                 (assumed to be an ISBN 13), or None when the page has no
                 'ISBN' heading.
      * genres - list with the text of every <span> found under a
                 'Form/Genre' heading (empty when there is none).

    Raises requests.RequestException on network errors, timeouts and
    HTTP error statuses.
    """
    isbn = None      #reset for every page so a record without an ISBN never inherits the previous one
    genres = []      #the form/genres associated with an LC record can hold any number of entries

    page = requests.get(permalink, timeout=timeout)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text,'html.parser')

    #class names are very limited on the LC catalog pages, so every field heading shares the
    #'item-title' class and has to be told apart by its text.
    for heading in soup.find_all('h3',attrs={'class':'item-title'}):
        if heading.string == 'ISBN':
            #due to the webpage structure the ISBN value isn't stored with the ISBN heading:
            #two siblings below the heading is the list whose first <span> holds the ISBN.
            #This keeps the first ISBN listed and assumes it is an ISBN 13.
            isbn = heading.parent.h3.next_sibling.next_sibling.li.span.string[0:13]
        if heading.string == 'Form/Genre':
            for genre_span in heading.parent.find_all('span'):
                genres.append(genre_span.string)
    return isbn, genres


outer_loop_results = []
link_lst= list(set(link_lst)) #remove duplicate permalinks, as each search round may have resulted in overlaps
for permalink in link_lst:
    try:
        isbn_field, title_genre_list = _scrape_record(permalink)
    except requests.RequestException as err:
        #keep the run alive: record the link with empty fields so it can be retried later
        print('Request failed for', permalink, '-', err)
        isbn_field, title_genre_list = None, []
    outer_loop_results.append((permalink,isbn_field,title_genre_list))
print('Done')
loc_results = pd.DataFrame(outer_loop_results, columns = ['permalink','isbn','genres'])
loc_results.head()

#Find cases where the ISBN 13 assumption is invalid by identifying cases where a non-numeric character is in the isbn field

loc_results = pd.DataFrame(outer_loop_results, columns = ['permalink','isbn','genres'])

#Compare on a string copy so the check works whatever type the column holds;
#loc_results itself is left untouched, so the exported CSV is unchanged.
isbn_text = loc_results.isbn.astype(str)
non_isbn13 = loc_results[~isbn_text.str.isdigit()]
print(non_isbn13) #only the last expression of a cell is displayed, so show the flagged rows explicitly

#NOTE(review): absolute, machine-specific output path - kept as-is because other project files may read from it.
loc_results.to_csv('C:/Users/Aryn/Documents/GitHub/INFO2950/Phase V/loc_results_v2.csv')