In [55]:
import json
import os
import shutil
from os import path
from urllib.error import URLError
from urllib.request import urlretrieve

from lxml import html

# Base directory that every file path in this notebook is joined onto (the saved
# HTML query, the downloaded data files and the JSON index). It is a relative
# path, so it resolves against the working directory the notebook runs in.
DATA_DIR = "../../Data"

In [56]:
def remove_prefix(text, prefix):
    '''Return ``text`` with a leading ``prefix`` stripped.

    When ``text`` does not begin with ``prefix`` it is returned unchanged.
    '''
    if not text.startswith(prefix):
        return text
    prefix_length = len(prefix)
    return text[prefix_length:]

def remove_suffix(text, suffix):
    '''Remove a suffix from a string, returning a new string.

    ``text`` is returned unchanged when it does not end with ``suffix`` or
    when ``suffix`` is empty.
    '''
    # An empty suffix must be special-cased: every string "ends with" it, and
    # ``text[:-0]`` is ``text[:0]``, which would wrongly yield an empty string.
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text

In [57]:
# Read the saved search-results page into memory; the file is decoded as windows-1252.
query_html_path = path.join(DATA_DIR, 'Initial_CFG_HTML_Query_20180426.html')
with open(query_html_path, encoding='windows-1252') as html_file:
    html_data = html_file.read()

In [58]:
# Parse the saved page text with lxml so the result rows can be located by CSS class.
tree = html.fromstring(html_data)

In [59]:
# The result rows are the elements of class "data" inside the first element of class "tips".
tips_container = tree.find_class("tips")[0]
results_table = tips_container.find_class("data")

In [53]:
# Perform some HTML scraping to pull out key parameters and urls from saved HTML search.
OPEN_WINDOW_PREFIX = "javascript:openWindow('"
OPEN_WINDOW_SUFFIX = "')"


def _unwrap_open_window(href):
    '''Strip the ``javascript:openWindow('...')`` wrapper from an href, leaving its argument.'''
    return remove_suffix(remove_prefix(href, OPEN_WINDOW_PREFIX), OPEN_WINDOW_SUFFIX)


data_list = []
for element in results_table:
    # Look the 'webSiteBody' cells up once per row rather than once per field.
    body_cells = element.find_class('webSiteBody')
    sample_cell = body_cells[0]

    sample = sample_cell[0].text

    # Take the sample link from the first child of the sample cell; when that
    # child has no href, fall back to the last child. None if neither has one.
    sample_html = None
    for child_index in (0, -1):
        try:
            sample_html = _unwrap_open_window(sample_cell[child_index].attrib['href'])
            break
        except (KeyError, IndexError):
            continue

    species = body_cells[1].text
    protein_family = body_cells[2].text
    investigator = body_cells[3].text_content()
    # NOTE(review): 'websiteBodyLight' is cased differently from 'webSiteBody';
    # kept exactly as in the original lookup - confirm against the saved page.
    experiment = element.find_class('websiteBodyLight')[0].text

    # The data link sits at a fixed position nested inside the sixth cell.
    # A row missing that structure (or whose element there has no href) simply
    # has no data file.
    try:
        data_url = _unwrap_open_window(body_cells[5][0][0][0][1][0].attrib['href'])
    except (IndexError, KeyError):
        data_url = None

    # Reset per row so a row without a primscreen link gets None instead of
    # raising NameError (first row) or inheriting the previous row's id.
    primscreen_id = None
    for anchor in element.iter("a"):
        # Anchors without an href are skipped rather than raising KeyError.
        href = anchor.attrib.get('href', '')
        if 'primscreen_' in href:
            # Keep what follows 'primscreen_', minus the trailing two characters
            # (the closing "')" of the openWindow wrapper).
            primscreen_id = href.split('primscreen_')[1][0:-2]

    data_list.append({'sample': sample, 'sample_html': sample_html, 'species': species, 'protein_family': protein_family,
                      'experiment': experiment, 'data_url': data_url, 'data_file': None, 'investigator': investigator, 'primscreen_id': primscreen_id})

In [8]:
# Retrieve all data files and save them locally
data_files_dir = path.join(DATA_DIR, 'CFG_Data_Files')
# Create the target folder up front so the first move cannot fail on a missing directory.
os.makedirs(data_files_dir, exist_ok=True)

# Indices into data_list of the entries whose download could not be completed.
errors = []
for i, datum in enumerate(data_list):
    if not datum['data_url']:
        continue
    try:
        temp_path, headers = urlretrieve(datum['data_url'])
    except (URLError, ValueError):
        # URLError covers network/HTTP failures; ValueError is raised for a
        # malformed URL (e.g. one without a scheme).
        # Can come back and fix any errors if we need to.
        errors.append(i)
        continue

    # The file name comes from the response headers and may be absent.
    # It is server-supplied, so keep only its final path component to make sure
    # the file lands inside data_files_dir.
    filename = headers.get_filename()
    if filename:
        filename = path.basename(filename.replace('\\', '/'))
    if not filename:
        # No usable name: discard the temporary download and record the entry
        # so it can be retried, instead of crashing on path.join(..., None).
        os.remove(temp_path)
        errors.append(i)
        continue

    # shutil.move also works when the temporary directory and DATA_DIR are on
    # different filesystems, where os.rename raises OSError.
    shutil.move(temp_path, path.join(data_files_dir, filename))
    datum['data_file'] = filename

# There shouldn't be any errors here, but perhaps if there are connection issues there might be some.
# Can always rerun these if we need to.
print(errors)

In [71]:
# Write every scraped entry (including its 'data_file' name) to a JSON index file.
index_path = path.join(DATA_DIR, 'Data_Index.json')
with open(index_path, 'w') as index_file:
    json.dump(data_list, index_file, indent=4, sort_keys=True)