# Program to analyze all documents downloaded to tDAR in 2019

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import csv

### Prepare functions

In [2]:
def make_soup(url, timeout=30):
    '''Fetch a page and parse it into a BeautifulSoup object.

    input: url - address of the page to download
           timeout - seconds to wait for the server before requests gives up;
                     without it a stalled connection would block forever
    output: BeautifulSoup object built from the response text with the
            "lxml" parser
    raises: whatever requests.get raises (including a timeout error once
            the timeout elapses)
    '''
    r = requests.get(url, timeout=timeout)
    soup = bs(r.text, "lxml")
    return soup

In [3]:
# Download and parse the 2019 listing page; later cells read the result from `soup`
listing_url = "https://core.tdar.org/scholar/scholar?year=2019"
soup = make_soup(listing_url)

In [5]:
# Single-page trial run: follow the first resource link on the listing page
# and parse the page it points to into the module-level `project_soup`
box = soup.find("a", attrs={"class": "resourceLink"})
link = "".join(("https://core.tdar.org/", box["href"]))
project_html = requests.get(link)
project_soup = bs(project_html.text, "lxml")

In [4]:
def get_keywords(keyword_type, soup=None):
    '''Collect the link texts listed under one keyword heading of a project page.

    input: keyword_type - heading text to look for; it is compared for exact
                          equality with the text of the paragraph's <strong>
                          element, so any trailing spaces must be included
           soup - parsed page to search; when omitted, the module-level
                  `project_soup` is used
    output: the stripped texts of all <a href=...> elements in the first
            matching <p class="break-word"> paragraph, joined with ", ";
            an empty string when no paragraph matches
    '''
    if soup is None:
        # Default to the page most recently stored in the global by the caller
        soup = project_soup
    for paragraph in soup.find_all("p", class_="break-word"):
        heading = paragraph.find("strong")
        # A paragraph without a <strong> heading cannot be the one we want;
        # skip it instead of raising AttributeError and losing the whole lookup
        if heading is None or heading.text != keyword_type:
            continue
        # Only the first matching paragraph is used
        return ", ".join(
            item.text.strip() for item in paragraph.find_all("a", href=True)
        )
    return ""

In [5]:
def get_sidebarInfo(tag, soup=None):
    '''Read the value that follows a labelled entry in the right-hand sidebar.

    input: tag - exact string content of the <strong> label to look for
           soup - parsed page to search; when omitted, the module-level
                  `project_soup` is used
    output: the node two siblings after the matching <strong> element
            (the label's next sibling is skipped, the one after it is returned)
    raises: AttributeError when the sidebar or the label is not found,
            because the lookup then continues on None
    '''
    if soup is None:
        # Default to the page most recently stored in the global by the caller
        soup = project_soup
    sidebar = soup.find("div", {"id": "sidebar-right"})
    # `string=` and `.next_sibling` are the current spellings of the
    # deprecated `text=` argument and `.nextSibling` attribute
    br_tag = sidebar.find("strong", string=tag).next_sibling
    sidebarInfo = br_tag.next_sibling
    return sidebarInfo

### Start parsing

In [None]:
# Scrape every project linked from the listing page into TdarData.csv
def _extract_or_none(extractor, label):
    '''Call extractor(label); return None when it raises AttributeError.'''
    try:
        return extractor(label)
    except AttributeError:
        return None


# Four digits immediately before a closing parenthesis at the very end of the
# title; compiled once, and written as a raw string so "\d" is not treated as
# a string escape
year_pattern = re.compile(r"(\d{4})\)$")

# The with-block closes the file even if a request fails mid-loop.
# newline="" is what the csv module asks for (otherwise extra blank rows can
# appear on Windows); utf-8 keeps non-ASCII titles from failing on platforms
# whose default encoding cannot represent them.
with open("TdarData.csv", "w", newline="", encoding="utf-8") as tdar_data:
    csv_writer = csv.writer(tdar_data)
    # Header row
    csv_writer.writerow(["Title", "URL", "Year", "Investigation Type", "Geographic Keywords", "Culture", "Language"])
    # One CSV row per resource link found on the listing page
    for title in soup.find_all("a", class_="resourceLink"):
        title_links = "https://core.tdar.org/" + title["href"]
        title_text = title.text
        # Year of the project, taken from the title when it ends in "NNNN)"
        year_find = year_pattern.search(title_text)
        year_result = year_find.group(1) if year_find is not None else None
        # get_keywords / get_sidebarInfo read this module-level name
        project_soup = make_soup(title_links)
        inv_type = _extract_or_none(get_keywords, "Investigation Types  ")
        geography = _extract_or_none(get_keywords, "Geographic Keywords  ")
        culture = _extract_or_none(get_keywords, "Culture  ")
        language = _extract_or_none(get_sidebarInfo, "Language")
        # Write data into the document
        csv_writer.writerow([title_text, title_links, year_result, inv_type, geography, culture, language])