# Program to analyze all documents downloaded to TDar in 2019

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import csv

### Prepare functions

In [34]:
def make_soup(url):
    '''function to get page content and transform it into bs object'''
    r = requests.get(url)
    soup = bs(r.text, "lxml")
    return soup

In [36]:
# Make first page soup
soup = make_soup("https://core.tdar.org/scholar/scholar?year=2019")

In [5]:
# One page test
box = soup.find("a", class_="resourceLink")
link = "https://core.tdar.org/"+box["href"]
project_html = requests.get(link)
project_soup = bs(project_html.text, "lxml")

In [32]:
def get_keywords(keyword_type):
    '''function to get keywords from the page
    input: keyword type
    output: contents of the list of names'''
    keywords = []
    #find correct paragraph
    for paragraph in project_soup.find_all("p", class_="break-word"):
        if paragraph.find("strong").text == keyword_type:
            box = paragraph
    #display categories text
            for item in box.find_all("a", href=True):
                keywords.append(item.text.strip())
            break
    return ", ".join(keywords)

In [10]:
def get_sidebarInfo(tag):
    '''function to get information from the sidebar
    input: sidebar tag name
    output: tag content'''
    sidebar = project_soup.find("div", {"id": "sidebar-right"})
    br_tag = sidebar.find("strong", text=tag).nextSibling
    sidebarInfo = br_tag.nextSibling
    return sidebarInfo

### Start parsing

In [61]:
# Open the csv document and name columns
tdar_data = open("TdarData.csv", "w")
csv_writer = csv.writer(tdar_data)
csv_writer.writerow(["Title", "URL", "Year", "Investigation Type", "Geographic Keywords", "Culture", "Language"])
# Extract page links
for title in soup.find_all("a", class_="resourceLink"):
    title_links = "https://core.tdar.org/"+title["href"]
    # Get project titles
    title_text = title.text
    print(title_text)
    print(title_links)
    # Get year of the project from title
    year_pattern = re.compile("(\d{4}\)$)")
    try:
        year_find = re.search(year_pattern, title_text)
        year_result = year_find.group()[:-1]
        print(year_result)
    except AttributeError:
        pass
    project_soup = make_soup(title_links)
    try:
        inv_type = get_keywords("Investigation Types  ")
    except AttributeError:
        inv_type = None
    print(inv_type)
    try:
        geography = get_keywords("Geographic Keywords  ")
        print(geography)
    except AttributeError:
        geography = None
    try:
        culture = get_keywords("Culture  ")
        print(culture)
    except AttributeError:
        culture = None
    try:
        language = get_sidebarInfo("Language")
        print(language)
    except AttributeError:
        language = None
    # Write data into the document
    csv_writer.writerow([title_text, title_links, year_result, inv_type, geography, culture, language])
#close document
tdar_data.close()

            Archaeological Survey and Test Excavations in the Burnsville Reservoir 1971-1975             

https://core.tdar.org//project/447952/archaeological-survey-and-test-excavations-in-the-burnsville-reservoir-1971-1975
['Heritage Management', 'Reconnaissance / Survey']
['Braxton (County)', 'Burnsville Reservoir', 'West Virginia (State / Territory)']
[]
            Archaeological Survey of Beech Fork Lake 1973-1975             

https://core.tdar.org//project/447998/archaeological-survey-of-beech-fork-lake-1973-1975
['Archaeological Overview', 'Data Recovery / Excavation', 'Heritage Management', 'Reconnaissance / Survey']
['Beech Fork Lake', 'Cabell (County)', 'Lavalette', 'Wayne (County)']
[]
            Archaeological Survey of East Lynn Reservoir 1964-1965             

https://core.tdar.org//project/448003/archaeological-survey-of-east-lynn-reservoir-1964-1965
['Heritage Management', 'Reconnaissance / Survey']
['East Lynn Reservoir', 'Wayne (County)', 'West Virginia (State / 

['Archaeological Overview', 'Consultation', 'Environment Research', 'Historic Background Research', 'Methodology, Theory, or Synthesis', 'Reconnaissance / Survey', 'Records Search / Inventory Checking', 'Site Evaluation / Testing']
['Southwest Florida Water Management District, Lower Hillsborough River Flood Detention Area, Tampa Bay, Hillsbrough Coutny']
['Archaic', 'Late Archaic', 'Middle Archaic', 'Mississippian', 'Sites cover about 7,000 or so of prehistory in the Tampa Bay region', 'Woodland']

            English
            
            An experimental comparison of the east Asian, Hellenistic and Indian (Gandharan) stills in relation to the distillation of ethanol and acetic acid             (1980)

https://core.tdar.org//document/452605/an-experimental-comparison-of-the-east-asian-hellenistic-and-indian-gandharan-stills-in-relation-to-the-distillation-of-ethanol-and-acetic-acid
1980
[]
[]
[]

            English
            
            Test Excavations at 8Hi450D: An Inland A

KeyboardInterrupt: 

In [None]:
+title
+year of event
+investigation type 
+geography
language
+culture