In [None]:
from bln import Client
from pyquery import PyQuery as pq
import requests
from tqdm import tqdm

import csv
from glob import glob
import datetime
import os
import socket
import sys

In [None]:
# UTC time of this run, to the minute (e.g. 2024-01-31T0930); used to name the HTML snapshot.
timestamp = f"{datetime.datetime.now(datetime.timezone.utc):%Y-%m-%dT%H%M}"

In [None]:
# Local working folders: raw HTML snapshots and the CSVs derived from them.
# The trailing slash matters because paths are built by plain string concatenation.
htmldir, csvdir = "html/", "csv/"

# Create both folders up front; already-existing folders are left alone.
for localdir in (htmldir, csvdir):
    os.makedirs(localdir, exist_ok=True)

In [None]:
# Find out whether we should be syncing to Big Local News server
def in_production():
    """Tell whether this run counts as a production run.

    Returns:
        bool: True when the ``GITHUB_RUN_ID`` environment variable is set, or
        when the machine's host name is one of the known production hosts;
        False otherwise.
    """
    production_hosts = ["mikelight", "racknerd-26f61a"]
    if 'GITHUB_RUN_ID' in os.environ:
        return True
    return socket.gethostname() in production_hosts

In [None]:
# URL of the gsa.gov non-core property list page that this notebook downloads.
hosturl = "https://www.gsa.gov/real-estate/real-estate-services/real-property-disposition/noncore-property-list"

In [None]:
# Download the current page and keep the raw bytes as a timestamped HTML snapshot.
# The timeout (seconds) keeps a stalled connection from hanging the run forever.
r = requests.get(hosturl, timeout=60)
if not r.ok:
    print("Failure to download file! What did you do?")
    # Exit with a non-zero status so a failed download is not reported as a
    # successful run (a bare sys.exit() would exit with status 0).
    sys.exit(1)
else:
    # Written in binary mode so the page is stored exactly as it was received.
    with open(f"{htmldir}noncore_{timestamp}.html", "wb") as outfile:
        outfile.write(r.content)

In [None]:
# Upload any local HTML snapshots that the Big Local News project does not hold yet.
# in_production must be *called*: the bare function object is always truthy, which
# would run this branch (and fail on the missing token) outside production too.
if in_production():
    bln_api = os.environ["BLN_API_TOKEN"]
    bln = Client(bln_api)
    project = bln.get_project_by_name("GSA non-core properties")
    project_id = project['id']

    # Get all the files in the project, keyed by file name.
    archived_files = {}
    for f in project['files']:
        archived_files[f['name']] = f['updatedAt']
    print(f"{len(archived_files):,} archived files found.")

    rawlocalhtmls = glob(htmldir + "*.html")
    additions = []
    localhtmls = []
    for rawlocalhtml in rawlocalhtmls:
        # Normalise Windows separators, then strip the folder prefix to get the bare file name.
        basefilename = rawlocalhtml.replace("\\", "/").replace(htmldir, "")
        localhtmls.append(basefilename)
        if basefilename not in archived_files:
            additions.append(basefilename)
    print(f"{len(additions):,} files need to be archived.")
    for addition in tqdm(additions):
        sourcefilename = htmldir + addition
        bln.upload_file(project_id, sourcefilename)

In [None]:
# Work out which HTML snapshots still need a CSV, and which CSVs still need uploading.

# Bare file names of the CSVs that already exist locally.
rawlocalcsvs = glob(csvdir + "*.csv")
localcsvs = []
for rawlocalcsv in rawlocalcsvs:
    localcsvs.append(rawlocalcsv.replace("\\", "/").replace(csvdir, ""))

rawlocalhtmls = glob(htmldir + "*.html")

# Always defined (and simply left empty outside production) so that code
# reading it later never hits a NameError.
to_upload = []

# Maps each HTML file name to the CSV file name it should be converted into.
to_convert = {}
for rawlocalhtml in rawlocalhtmls:
    basehtml = rawlocalhtml.replace("\\", "/").replace(htmldir, "")
    basecsv = basehtml.replace(".html", ".csv")
    if basecsv not in localcsvs:
        to_convert[basehtml] = basecsv
    if in_production():
        # archived_files is only consulted in production runs.
        if basecsv not in archived_files:
            to_upload.append(basecsv)
print(f"{len(to_convert):,} files need to be converted from HTML to CSV.")
# Call the function: the bare name in_production is a function object and always truthy.
if in_production():
    print(f"{len(to_upload)} of those CSVs should be uploaded later.")


In [None]:
# Convert each pending HTML snapshot into a CSV holding the rows of its table.
for htmlfile in to_convert:
    csvfile = to_convert[htmlfile]
    with open(htmldir + htmlfile, "r", encoding="utf-8") as infile:
        html = infile.read()
    try:
        # Two table class names are recognised; take the first matching table.
        if "scrollTable_search" in html:
            mastertable = pq(html)("table.scrollTable_search")[0]
        elif "usa-table" in html:
            mastertable = pq(html)("table.usa-table")[0]
        else:
            # Neither marker is present. Report and skip this file instead of
            # falling through with an unbound table or one left over from the
            # previous file, which would write the wrong data into this CSV.
            print(f"Error on {htmlfile}: They broke the table format, or removed it altogether.")
            continue

        # Column names come from the <th> cells of the first row.
        headerrow = pq(mastertable)("tr")[0]
        headers = []
        for item in pq(headerrow)("th"):
            headers.append(pq(item).text().strip())

        masterlist = []
        for row in pq(mastertable)("tr")[1:]:    # Skip header row
            cells = pq(row)("td")    # Look the cells up once per row, not once per column
            line = {}
            # A row with fewer cells than headers raises IndexError, handled below.
            # NOTE: rows are keyed by header text, so repeated header names would collapse.
            for i in range(0, len(headers)):
                line[headers[i]] = pq(cells[i]).text().strip()
            masterlist.append(line)

        with open(csvdir + csvfile, "w", encoding="utf-8", newline="") as outfile:
            print(f"Trying to do {csvfile}")
            writer = csv.writer(outfile)
            writer.writerow(headers)
            for row in masterlist:
                writer.writerow(list(row.values()))
    except IndexError:
        print(f"Error on {htmlfile}: They broke the table format, or removed it altogether.")

In [None]:
# Upload the CSVs queued for archiving; only done on production runs.
# in_production must be called: the bare function object is always truthy.
if in_production():
    for csvfile in to_upload:
        csvpath = csvdir + csvfile
        # A CSV can be queued without existing locally, because conversion
        # skips files whose table could not be parsed.
        if not os.path.isfile(csvpath):
            print(f"Skipping upload of {csvfile}: no local CSV was produced.")
            continue
        bln.upload_file(project_id, csvpath)